diff -Nru linux-2.4.20-pre10-mjc1/Documentation/Changes linux-2.4.20-pre10-mjc2/Documentation/Changes
--- linux-2.4.20-pre10-mjc1/Documentation/Changes	2002-10-23 22:55:26.000000000 -0400
+++ linux-2.4.20-pre10-mjc2/Documentation/Changes	2002-10-25 16:51:13.000000000 -0400
@@ -56,6 +56,7 @@
 o  e2fsprogs              1.25                    # tune2fs
 o  jfsutils               1.0.12                  # fsck.jfs -V
 o  reiserfsprogs          3.6.3                   # reiserfsck -V 2>&1|grep reiserfsprogs
+o  xfsprogs               2.1.0                   # xfs_db -V
 o  pcmcia-cs              3.1.21                  # cardmgr -V
 o  PPP                    2.4.0                   # pppd --version
 o  isdn4k-utils           3.1pre1                 # isdnctrl 2>&1|grep version
@@ -190,6 +191,17 @@
 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
 reiserfsck. These utils work on both i386 and alpha platforms.
 
+Xfsprogs
+--------
+
+The latest version of xfsprogs contains the mkfs.xfs, xfs_db, and
+xfs_repair utilities, among others, for the XFS filesystem.  It is
+architecture independent, and any version from 2.0.0 onward should
+work correctly with this version of the XFS kernel code.  For the new
+(v2) log format, which has better support for stripe-size alignment
+on LVM and MD devices, at least xfsprogs 2.1.0 is needed.
+
+
 Pcmcia-cs
 ---------
 
@@ -327,6 +339,10 @@
 -------------
 o  <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
 
+Xfsprogs
+--------
+o  <ftp://oss.sgi.com/projects/xfs/download/cmd_tars/xfsprogs-2.1.0.src.tar.gz>
+
 LVM toolset
 -----------
 o  <http://www.sistina.com/lvm/>
diff -Nru linux-2.4.20-pre10-mjc1/Documentation/Configure.help linux-2.4.20-pre10-mjc2/Documentation/Configure.help
--- linux-2.4.20-pre10-mjc1/Documentation/Configure.help	2002-10-23 22:55:50.000000000 -0400
+++ linux-2.4.20-pre10-mjc2/Documentation/Configure.help	2002-10-25 16:51:14.000000000 -0400
@@ -16237,6 +16237,92 @@
   Say Y here if you want to try writing to UFS partitions. This is
   experimental, so you should back up your UFS partitions beforehand.
 
+XFS filesystem support
+CONFIG_XFS_FS
+  XFS is a high performance journaling filesystem which originated
+  on the SGI IRIX platform.  It is completely multi-threaded; it
+  supports large files and large filesystems, extended attributes,
+  and variable block sizes; it is extent based; and it makes
+  extensive use of Btrees (directories, extents, free space) to aid
+  both performance and scalability.
+
+  Refer to the documentation at <http://oss.sgi.com/projects/xfs/>
+  for complete details.  This implementation is on-disk compatible
+  with the IRIX version of XFS.
+
+  If you want to compile this file system as a module ( = code which
+  can be inserted in and removed from the running kernel whenever you
+  want), say M here and read <file:Documentation/modules.txt>.  The
+  module will be called xfs.o.  Be aware, however, that if the file
+  system of your root partition is compiled as a module, you'll need
+  to use an initial ramdisk (initrd) to boot.
+
+DMAPI support
+CONFIG_XFS_DMAPI
+  The Data Management API (DMAPI) is a system interface implementing
+  the specification in the X/Open document:
+    "Systems Management: Data Storage Management (XDSM) API",
+  dated February 1997.  This interface is used by hierarchical
+  storage management systems.
+
+  If XFS is built as a module (= code which can be inserted in and
+  removed from the running kernel whenever you want), this code will
+  also be built as a module, called xfs_dmapi.o.
+
+  If unsure, say N.
+
+Quota support
+CONFIG_XFS_QUOTA
+  If you say Y here, you will be able to set limits for disk usage on
+  a per user and/or per group basis under XFS.  XFS considers quota
+  information as filesystem metadata and uses journaling to provide a
+  higher level guarantee of consistency.  The on-disk data format for
+  quota is also compatible with the IRIX version of XFS, allowing a
+  filesystem to be migrated between Linux and IRIX without any need
+  for conversion.
+
+  If unsure, say N.  More comprehensive documentation can be found in
+  README.quota in the xfsprogs package.  XFS quota can be used either
+  with or without the generic quota support enabled (CONFIG_QUOTA) -
+  they are completely independent subsystems.
+
+Realtime support (EXPERIMENTAL)
+CONFIG_XFS_RT
+  If you say Y here, you will be able to mount and use XFS filesystems
+  which contain a realtime subvolume.  The realtime subvolume is a
+  separate area of disk space where only file data is stored.  It is
+  designed to provide very deterministic data rates suitable for
+  media streaming applications.
+
+  See the xfs man page in section 5 for a bit more information.
+
+  This feature is unsupported at this time, is not yet fully
+  functional, and may cause serious problems.
+
+  If unsure, say N.
+
+Debugging support (EXPERIMENTAL)
+CONFIG_XFS_DEBUG
+  Say Y here to get an XFS build with many debugging features,
+  including ASSERT checks, function wrappers around macros,
+  and extra sanity-checking functions in various code paths.
+
+  Note that the resulting code will be HUGE and SLOW, and probably
+  not useful unless you are debugging a particular problem.
+
+  Say N unless you are an XFS developer, or play one on TV.
+
+Pagebuf debugging support (EXPERIMENTAL)
+CONFIG_PAGEBUF_DEBUG
+  Say Y here to get an XFS build which may help you debug pagebuf
+  problems.  Enabling this option will attach tracing information
+  to pagebufs, which can be read with the kdb kernel debugger.
+
+  Note that you will also have to enable the sysctl in
+  /proc/sys/vm/pagebuf/debug for this to work.
+
+  Say N unless you're interested in debugging pagebuf.
+
 Advanced partition selection
 CONFIG_PARTITION_ADVANCED
   Say Y here if you would like to use hard disks under Linux which
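
The CONFIG_PAGEBUF_DEBUG text above mentions the /proc/sys/vm/pagebuf/debug
sysctl.  As a minimal sketch of flipping that switch from C - assuming the
sysctl takes the usual "0"/"1" values, which this patch does not spell out:

	#include <fcntl.h>
	#include <unistd.h>

	/* Enable pagebuf tracing via the sysctl named in the help text.
	 * Writing "1" is an assumption; "echo 1 > /proc/sys/vm/pagebuf/debug"
	 * from a shell would be equivalent.
	 */
	int enable_pagebuf_debug(void)
	{
		int fd = open("/proc/sys/vm/pagebuf/debug", O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, "1\n", 2) != 2) {
			close(fd);
			return -1;
		}
		return close(fd);
	}
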
diff -Nru linux-2.4.20-pre10-mjc1/Documentation/filesystems/00-INDEX linux-2.4.20-pre10-mjc2/Documentation/filesystems/00-INDEX
--- linux-2.4.20-pre10-mjc1/Documentation/filesystems/00-INDEX	2002-10-23 22:55:26.000000000 -0400
+++ linux-2.4.20-pre10-mjc2/Documentation/filesystems/00-INDEX	2002-10-25 16:51:14.000000000 -0400
@@ -48,3 +48,5 @@
 	- info on using the VFAT filesystem used in Windows NT and Windows 95
 vfs.txt
 	- Overview of the Virtual File System
+xfs.txt
+	- info and mount options for the XFS filesystem
diff -Nru linux-2.4.20-pre10-mjc1/Documentation/filesystems/xfs.txt linux-2.4.20-pre10-mjc2/Documentation/filesystems/xfs.txt
--- linux-2.4.20-pre10-mjc1/Documentation/filesystems/xfs.txt	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/Documentation/filesystems/xfs.txt	2002-10-25 16:51:14.000000000 -0400
@@ -0,0 +1,124 @@
+
+The SGI XFS Filesystem
+======================
+
+XFS is a high performance journaling filesystem which originated
+on the SGI IRIX platform.  It is completely multi-threaded; it
+supports large files and large filesystems, extended attributes,
+and variable block sizes; it is extent based; and it makes
+extensive use of Btrees (directories, extents, free space) to aid
+both performance and scalability.
+
+Refer to the documentation at http://oss.sgi.com/projects/xfs/
+for further details.  This implementation is on-disk compatible
+with the IRIX version of XFS.
+
+
+Options
+=======
+
+When mounting an XFS filesystem, the following options are accepted.
+
+  biosize=size
+	Sets the preferred buffered I/O size (default size is 64K).
+	"size" must be expressed as the logarithm (base 2) of the
+	desired I/O size.
+	Valid values for this option are 14 through 16, inclusive
+	(i.e. 16K, 32K, and 64K bytes).  On machines with a 4K
+	pagesize, 13 (8K bytes) is also a valid size.
+	The preferred buffered I/O size can also be altered on an
+	individual file basis using the ioctl(2) system call.
+
+  dmapi
+	Enable the DMAPI (Data Management API) event callouts.
+	Use with the "mtpt" option.
+
+  irixsgid
+	Do not inherit the ISGID bit on subdirectories of ISGID
+	directories, if the process creating the subdirectory
+	is not a member of the parent directory group ID.
+	This matches IRIX behavior.
+
+  logbufs=value
+	Set the number of in-memory log buffers.  Valid numbers range
+	from 2 to 8 inclusive.
+	The default value is 8 buffers for filesystems with a
+	blocksize of 64K, 4 buffers for filesystems with a blocksize
+	of 32K, 3 buffers for filesystems with a blocksize of 16K
+	and 2 buffers for all other configurations.  Increasing the
+	number of buffers may increase performance on some workloads
+	at the cost of the memory used for the additional log buffers
+	and their associated control structures.
+
+  logbsize=value
+	Set the size of each in-memory log buffer.
+	Size may be specified in bytes, or in kilobytes with a "k" suffix.
+	Valid sizes for version 1 and version 2 logs are 16384 (16k) and 
+	32768 (32k).  Valid sizes for version 2 logs also include 
+	65536 (64k), 131072 (128k) and 262144 (256k).
+	The default value for machines with more than 32MB of memory
+	is 32768, machines with less memory use 16384 by default.
+
+  logdev=device and rtdev=device
+	Use an external log (metadata journal) and/or real-time device.
+	An XFS filesystem has up to three parts: a data section, a log
+	section, and a real-time section.  The real-time section is
+	optional, and the log section can be separate from the data
+	section or contained within it.
+
+  mtpt=mountpoint
+	Use with the "dmapi" option.  The value specified here will be
+	included in the DMAPI mount event, and should be the path of
+	the actual mountpoint that is used.
+
+  noalign
+	Data allocations will not be aligned at stripe unit boundaries.
+
+  noatime
+	Access timestamps are not updated when a file is read.
+
+  norecovery
+	The filesystem will be mounted without running log recovery.
+	If the filesystem was not cleanly unmounted, it is likely to
+	be inconsistent when mounted in "norecovery" mode.
+	Some files or directories may not be accessible because of this.
+	Filesystems mounted "norecovery" must be mounted read-only or
+	the mount will fail.
+
+  osyncisosync
+	Make O_SYNC writes implement true O_SYNC.  WITHOUT this option,
+	Linux XFS behaves as if an "osyncisdsync" option is used,
+	which will make writes to files opened with the O_SYNC flag set
+	behave as if the O_DSYNC flag had been used instead.
+	This can result in better performance without compromising
+	data safety.
+	However, if this option is not in effect, timestamp updates from
+	O_SYNC writes can be lost if the system crashes.
+	If timestamp updates are critical, use the osyncisosync option.
+
+  quota/usrquota/uqnoenforce
+	User disk quota accounting enabled, and limits (optionally)
+	enforced.
+
+  grpquota/gqnoenforce
+	Group disk quota accounting enabled and limits (optionally)
+	enforced.
+
+  sunit=value and swidth=value
+	Used to specify the stripe unit and width for a RAID device or
+	a stripe volume.  "value" must be specified in 512-byte block
+	units.
+	If this option is not specified and the filesystem was made on
+	a stripe volume or the stripe width or unit were specified for
+	the RAID device at mkfs time, then the mount system call will
+	restore the value from the superblock.  For filesystems that
+	are made directly on RAID devices, these options can be used
+	to override the information in the superblock if the underlying
+	disk layout changes after the filesystem has been created.
+	The "swidth" option is required if the "sunit" option has been
+	specified, and must be a multiple of the "sunit" value.
+
+  nouuid
+	Don't check for doubly mounted filesystems using the filesystem
+	uuid.  This is useful when mounting LVM snapshot volumes.
+
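
As an illustration of how the options above are passed in practice, here is a
minimal C sketch using the mount(2) system call.  /dev/sdb1, /dev/sdc1 and
/mnt/xfs are placeholder paths, and the option string simply combines entries
from the list above:

	#include <sys/mount.h>

	/* Mount an XFS filesystem with an external log device and 32k
	 * in-memory log buffers; the last argument is the comma-separated
	 * option string documented above.
	 */
	int mount_xfs_example(void)
	{
		return mount("/dev/sdb1", "/mnt/xfs", "xfs", 0,
			     "logdev=/dev/sdc1,logbufs=8,logbsize=32768");
	}
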
diff -Nru linux-2.4.20-pre10-mjc1/MAINTAINERS linux-2.4.20-pre10-mjc2/MAINTAINERS
--- linux-2.4.20-pre10-mjc1/MAINTAINERS	2002-10-23 22:55:45.000000000 -0400
+++ linux-2.4.20-pre10-mjc2/MAINTAINERS	2002-10-25 16:51:14.000000000 -0400
@@ -1939,6 +1939,14 @@
 L:	linux-x25@vger.kernel.org
 S:	Maintained
 
+XFS FILESYSTEM
+P:	Silicon Graphics Inc
+M:	owner-xfs@oss.sgi.com
+M:	lord@sgi.com
+L:	linux-xfs@oss.sgi.com
+W:	http://oss.sgi.com/projects/xfs
+S:	Supported
+
 X86 3-LEVEL PAGING (PAE) SUPPORT
 P:	Ingo Molnar
 M:	mingo@redhat.com
diff -Nru linux-2.4.20-pre10-mjc1/fs/Config.in linux-2.4.20-pre10-mjc2/fs/Config.in
--- linux-2.4.20-pre10-mjc1/fs/Config.in	2002-10-23 22:55:50.000000000 -0400
+++ linux-2.4.20-pre10-mjc2/fs/Config.in	2002-10-25 16:51:14.000000000 -0400
@@ -103,6 +103,13 @@
 tristate 'UFS file system support (read only)' CONFIG_UFS_FS
 dep_mbool '  UFS file system write support (DANGEROUS)' CONFIG_UFS_FS_WRITE $CONFIG_UFS_FS $CONFIG_EXPERIMENTAL
 
+tristate 'XFS filesystem support' CONFIG_XFS_FS
+dep_mbool    '  Realtime support (EXPERIMENTAL)' CONFIG_XFS_RT $CONFIG_XFS_FS $CONFIG_EXPERIMENTAL
+dep_mbool    '  Quota support' CONFIG_XFS_QUOTA $CONFIG_XFS_FS
+dep_mbool    '  DMAPI support' CONFIG_XFS_DMAPI $CONFIG_XFS_FS
+dep_mbool    '  Debugging support (EXPERIMENTAL)' CONFIG_XFS_DEBUG $CONFIG_XFS_FS $CONFIG_EXPERIMENTAL
+dep_mbool    '  Pagebuf debugging support (EXPERIMENTAL)' CONFIG_PAGEBUF_DEBUG $CONFIG_XFS_FS $CONFIG_EXPERIMENTAL
+
 if [ "$CONFIG_NET" = "y" ]; then
 
    mainmenu_option next_comment
diff -Nru linux-2.4.20-pre10-mjc1/fs/Makefile linux-2.4.20-pre10-mjc2/fs/Makefile
--- linux-2.4.20-pre10-mjc1/fs/Makefile	2002-10-23 22:55:50.000000000 -0400
+++ linux-2.4.20-pre10-mjc2/fs/Makefile	2002-10-25 16:51:14.000000000 -0400
@@ -8,7 +8,7 @@
 O_TARGET := fs.o
 
 export-objs :=	filesystems.o open.o dcache.o buffer.o dquot.o
-mod-subdirs :=	nls
+mod-subdirs :=	nls xfs
 
 obj-y :=	open.o read_write.o devices.o file_table.o buffer.o \
 		super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \
@@ -69,6 +69,7 @@
 subdir-$(CONFIG_BEFS_FS)	+= befs
 subdir.$(CONFIG_IMON)		+= imon
 subdir-$(CONFIG_JFS_FS)		+= jfs
+subdir-$(CONFIG_XFS_FS)		+= xfs
 
 
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/Makefile linux-2.4.20-pre10-mjc2/fs/xfs/Makefile
--- linux-2.4.20-pre10-mjc1/fs/xfs/Makefile	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/Makefile	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,155 @@
+#
+# Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+# Makefile for XFS on Linux.
+#
+
+# This needs -I. because everything does #include <xfs.h> instead of "xfs.h".
+# The code is wrong; local files should be included using "xfs.h", not <xfs.h>,
+# but I am not going to change every file at the moment.
+EXTRA_CFLAGS +=	 -I. -funsigned-char
+
+ifeq ($(CONFIG_XFS_DEBUG),y)
+	EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG -DXFSDEBUG
+endif
+ifeq ($(CONFIG_PAGEBUF_DEBUG),y)
+	EXTRA_CFLAGS += -DPAGEBUF_TRACE
+endif
+
+subdir-$(CONFIG_XFS_FS)		+= pagebuf linux support
+
+ifeq ($(CONFIG_XFS_DMAPI),y)
+	subdir-$(CONFIG_XFS_FS)	+= dmapi
+endif
+
+# fs/Makefile enters fs/xfs twice if CONFIG_XFS_FS is y, once for kernel and
+# once for modules.  This is necessary because xfsidbg can be built as a module
+# even if xfs is in kernel.  Alas the shorthand form
+#   O_TARGET := xfs.o
+#   obj-m := $(O_TARGET)
+# fails when the makefile is run more than once, code gets compiled as both
+# kernel and as module, which one gets linked depends on the phase of the moon.
+# I just love these layer violations where a makefile behaves differently
+# depending on changes to its parent.  Work around by only setting obj-m when
+# xfs is selected as a module.	Keith Owens.
+
+O_TARGET			:= xfs.o
+ifeq ($(CONFIG_XFS_FS),m)
+  obj-m				:= $(O_TARGET)
+endif
+
+obj-$(CONFIG_XFS_DMAPI)		+= xfs_dmapi.o dmapi/dmapi_core.o
+
+obj-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
+
+obj-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
+				   xfs_dquot_item.o \
+				   xfs_trans_dquot.o \
+				   xfs_qm_syscalls.o \
+				   xfs_qm.o
+
+obj-$(CONFIG_FS_POSIX_ACL)	+= xfs_acl.o
+obj-$(CONFIG_FS_POSIX_CAP)	+= xfs_cap.o
+obj-$(CONFIG_FS_POSIX_MAC)	+= xfs_mac.o
+
+obj-y				+= xfs_alloc.o \
+				   xfs_alloc_btree.o \
+				   xfs_attr.o \
+				   xfs_attr_fetch.o \
+				   xfs_attr_leaf.o \
+				   xfs_bit.o \
+				   xfs_bmap.o \
+				   xfs_bmap_btree.o \
+				   xfs_btree.o \
+				   xfs_buf_item.o \
+				   xfs_da_btree.o \
+				   xfs_dir.o \
+				   xfs_dir2.o \
+				   xfs_dir2_block.o \
+				   xfs_dir2_data.o \
+				   xfs_dir2_leaf.o \
+				   xfs_dir2_node.o \
+				   xfs_dir2_sf.o \
+				   xfs_dir2_trace.o \
+				   xfs_dir_leaf.o \
+				   xfs_error.o \
+				   xfs_extfree_item.o \
+				   xfs_fsops.o \
+				   xfs_ialloc.o \
+				   xfs_ialloc_btree.o \
+				   xfs_iget.o \
+				   xfs_inode.o \
+				   xfs_inode_item.o \
+				   xfs_iocore.o \
+				   xfs_itable.o \
+				   xfs_dfrag.o \
+				   xfs_log.o \
+				   xfs_log_recover.o \
+				   xfs_macros.o \
+				   xfs_mount.o \
+				   xfs_rename.o \
+				   xfs_trans.o \
+				   xfs_trans_ail.o \
+				   xfs_trans_buf.o \
+				   xfs_trans_extfree.o \
+				   xfs_trans_inode.o \
+				   xfs_trans_item.o \
+				   xfs_utils.o \
+				   xfs_vfsops.o \
+				   xfs_vnodeops.o \
+				   xfs_rw.o
+
+# Objects not built in this directory
+obj-y				+= pagebuf/pagebuf.o \
+				   linux/linux_xfs.o \
+				   support/support_xfs.o
+
+# If both xfs and kdb modules are built in then xfsidbg is built in.  If xfs is
+# a module and kdb modules are being compiled then xfsidbg must be a module, to
+# follow xfs.  If xfs is built in then xfsidbg tracks the kdb module state.
+# This must come after the main xfs code so xfs initialises before xfsidbg.
+# KAO
+ifneq ($(CONFIG_KDB_MODULES),)
+  ifeq ($(CONFIG_XFS_FS),y)
+    obj-$(CONFIG_KDB_MODULES)	+= xfsidbg.o
+  else
+    obj-$(CONFIG_XFS_FS)	+= xfsidbg.o
+  endif
+endif
+
+CFLAGS_xfsidbg.o += -I $(TOPDIR)/arch/$(ARCH)/kdb
+
+include $(TOPDIR)/Rules.make
+
+# This is really nasty, but Rules.make was never designed for multi directory
+# modules.  Keith Owens.
+
+xfs.o: $(patsubst %,_modsubdir_%,$(subdir-m))
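
On the include-style comment at the top of this Makefile: for GCC the
difference between the two forms is, roughly, the search order, which is why
the -I. flag is needed as long as the code uses the angle-bracket form:

	/* searched only via the -I path list (and system directories) */
	#include <xfs.h>

	/* searched relative to the including file first, then via -I */
	#include "xfs.h"
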
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/Makefile.in linux-2.4.20-pre10-mjc2/fs/xfs/Makefile.in
--- linux-2.4.20-pre10-mjc1/fs/xfs/Makefile.in	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/Makefile.in	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,87 @@
+#
+# Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+# Makefile for XFS on Linux.
+#
+
+objlink(CONFIG_XFS_DMAPI	xfs.o xfs_dmapi.o dmapi/dmapi_core.o)
+
+objlink(CONFIG_XFS_RT		xfs.o xfs_rtalloc.o)
+
+objlink(CONFIG_XFS_QUOTA	xfs.o xfs_dquot.o xfs_dquot_item.o xfs_trans_dquot.o xfs_qm_syscalls.o xfs_qm.o)
+
+objlink(CONFIG_FS_POSIX_ACL	xfs.o xfs_acl.o)
+objlink(CONFIG_FS_POSIX_CAP	xfs.o xfs_cap.o)
+objlink(CONFIG_FS_POSIX_MAC	xfs.o xfs_mac.o)
+
+objlink(xfs.o xfs_alloc.o xfs_alloc_btree.o xfs_attr.o xfs_attr_fetch.o
+	xfs_attr_leaf.o xfs_bit.o xfs_bmap.o xfs_bmap_btree.o xfs_btree.o
+	xfs_buf_item.o xfs_da_btree.o xfs_dir.o xfs_dir2.o xfs_dir2_block.o
+	xfs_dir2_data.o xfs_dir2_leaf.o xfs_dir2_node.o xfs_dir2_sf.o
+	xfs_dir2_trace.o xfs_dir_leaf.o xfs_error.o xfs_extfree_item.o
+	xfs_fsops.o xfs_ialloc.o xfs_ialloc_btree.o xfs_iget.o xfs_inode.o
+	xfs_inode_item.o xfs_iocore.o xfs_itable.o xfs_dfrag.o xfs_log.o
+	xfs_log_recover.o xfs_macros.o xfs_mount.o xfs_rename.o xfs_trans.o
+	xfs_trans_ail.o xfs_trans_buf.o xfs_trans_extfree.o xfs_trans_inode.o
+	xfs_trans_item.o xfs_utils.o xfs_vfsops.o xfs_vnodeops.o xfs_rw.o)
+
+# Objects not built in this directory
+objlink(xfs.o pagebuf/pagebuf.o linux/linux_xfs.o support/support_xfs.o)
+
+select(CONFIG_XFS_FS xfs.o)
+
+# If both xfs and kdb modules are built in then xfsidbg is built in.  If xfs is
+# a module and kdb modules are being compiled then xfsidbg must be a module, to
+# follow xfs.  If xfs is built in then xfsidbg tracks the kdb module state.
+# This must come after the main xfs code so xfs initialises before xfsidbg.
+# KAO
+ifsel(CONFIG_KDB_MODULES)
+  ifselnmod(CONFIG_XFS_FS)
+    select(CONFIG_KDB_MODULES xfsidbg.o)
+  else
+    select(CONFIG_XFS_FS xfsidbg.o)
+  endif
+endif
+
+XFS_EXTRA_CFLAGS	:= $(src_includelist /fs/xfs) -funsigned-char
+ifsel(CONFIG_XFS_DEBUG)
+  XFS_EXTRA_CFLAGS	+= -g -DSTATIC="" -DDEBUG -DXFSDEBUG
+endif
+ifsel(CONFIG_PAGEBUF_DEBUG)
+  XFS_EXTRA_CFLAGS	+= -DPAGEBUF_TRACE
+endif
+extra_cflags_all($(XFS_EXTRA_CFLAGS))
+
+extra_cflags(xfsidbg.o $(src_includelist /arch/$(ARCH)/kdb))
+
+# FIXME: xfsidbg includes "pagebuf/page_buf_internal.h" which includes
+# "page_buf.h" and "page_buf_trace.h"
+extra_cflags(xfsidbg.o $(src_includelist pagebuf))
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/Makefile linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/Makefile
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/Makefile	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/Makefile	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,59 @@
+#
+# Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+
+EXTRA_CFLAGS += -I $(TOPDIR)/fs/xfs
+
+ifeq ($(CONFIG_XFS_DEBUG),y)
+	EXTRA_CFLAGS += -g -DDEBUG -DXFSDEBUG
+endif
+
+O_TARGET			:= dmapi_core.o
+ifneq ($(MAKECMDGOALS),modules_install)
+  obj-m				:= $(O_TARGET)
+endif
+
+obj-y				+= dmapi_sysent.o \
+				   dmapi_attr.o \
+				   dmapi_config.o \
+				   dmapi_bulkattr.o \
+				   dmapi_dmattr.o \
+				   dmapi_event.o \
+				   dmapi_handle.o \
+				   dmapi_hole.o \
+				   dmapi_io.o \
+				   dmapi_mountinfo.o \
+				   dmapi_region.o \
+				   dmapi_register.o \
+				   dmapi_right.o \
+				   dmapi_session.o
+
+include $(TOPDIR)/Rules.make
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/Makefile.in linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/Makefile.in
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/Makefile.in	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/Makefile.in	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,41 @@
+#
+#
+# Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+
+objlink(dmapi_core.o dmapi_sysent.o dmapi_attr.o dmapi_config.o dmapi_bulkattr.o
+	dmapi_dmattr.o dmapi_event.o dmapi_handle.o dmapi_hole.o dmapi_io.o
+	dmapi_mountinfo.o dmapi_region.o dmapi_register.o dmapi_right.o
+	dmapi_session.o)
+
+# No select() for dmapi in this directory.  It is a sub-component of XFS,
+# see fs/xfs/Makefile.in for the objlink.
+
+extra_cflags_all($(src_includelist /fs/xfs))
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/Status linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/Status
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/Status	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/Status	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,126 @@
+for Linux:
+
+
+68 external interfaces in libdm
+
+   56 of those interfaces go through to dmi(), the kernel side of DMAPI
+
+
+
+Functions known to work
+----------------------------------------------
+
+dm_create_session
+dm_create_userevent
+dm_destroy_session
+dm_getall_sessions
+dm_getall_tokens
+dm_get_allocinfo
+dm_get_bulkattr
+dm_get_config_events
+dm_get_dmattr
+dm_get_eventlist
+dm_get_events
+dm_get_fileattr
+dm_get_region
+dm_handle_free
+dm_init_attrloc
+dm_init_service
+dm_obj_ref_hold
+dm_obj_ref_query
+dm_obj_ref_rele
+dm_path_to_fshandle
+dm_path_to_handle
+dm_punch_hole
+dm_query_session
+dm_read_invis
+dm_remove_dmattr
+dm_respond_event
+dm_send_msg
+dm_set_disp
+dm_set_dmattr
+dm_set_eventlist
+dm_set_fileattr
+dm_set_region
+dm_sync_by_handle
+dm_write_invis
+34
+
+Functions that seem to work (would like more rigorous test case)
+------------------------------------------
+
+dm_pending
+dm_probe_hole		- one test case of test_hole.c fails
+dm_request_right
+3
+
+Functions untested but probably work
+----------------------------------------------
+
+dm_find_eventmsg
+dm_handle_cmp
+dm_handle_to_fshandle
+dm_handle_to_ino
+dm_release_right
+5
+
+Functions that do not work
+-----------------------------------------
+
+dm_get_dioinfo		- directio not implemented
+1
+
+Functions not supported in SGI DMAPI
+-------------------------------------------------------------
+
+dm_clear_inherit
+dm_create_by_handle
+dm_getall_inherit
+dm_get_bulkall
+dm_mkdir_by_handle
+dm_set_inherit
+dm_symlink_by_handle
+
+
+
+
+Functions that seem to work (would like more rigorous test case)
+----------------------------------------------------------------
+
+dm_get_config
+dm_downgrade_right
+dm_get_mountinfo
+dm_set_return_on_destroy
+dm_upgrade_right
+
+
+
+Functions that do not work
+-----------------------------------------------------------------
+
+dm_fd_to_handle		- Irix getf not implemented on linux
+dm_get_dirattrs		- null pointer reference
+dm_handle_to_path
+dm_getall_dmattr	- needs a copy_from_user in place of useracc
+
+
+Functions that are untested, but probably work
+-----------------------------------------------------------------
+
+dm_getall_disp
+dm_handle_hash
+dm_handle_is_valid
+dm_handle_to_fsid
+dm_handle_to_igen
+dm_make_fshandle
+dm_make_handle
+dm_move_event
+dm_query_right
+
+
+
+Other things not working
+----------------------------------
+
+- read/write events for memory-mapped I/O?
+
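
On the dm_getall_dmattr note above: useracc() is an IRIX primitive, and the
usual Linux replacement is copy_from_user(), which returns the number of bytes
it could not copy.  A sketch of the pattern (kbuf, ubufp and len are
hypothetical names; this code's positive-errno convention may flip the sign):

	if (copy_from_user(kbuf, ubufp, len))
		return -EFAULT;	/* bad user pointer */
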
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi.h linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,1043 @@
+/*
+ * Copyright (c) 1995-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307,
+ * USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef _SYS_DMAPI_H
+#define _SYS_DMAPI_H
+
+#ifdef	__cplusplus
+extern	"C" {
+#endif
+
+#ifndef __KERNEL__
+#include <sys/types.h>
+#endif
+#include <linux/types.h>
+
+/**************************************************************************
+ *									  *
+ * The SGI implementation of DMAPI is based upon the X/Open document	  *
+ *	Systems Management: Data Storage Management (XDSM) API		  *
+ * dated February 1997.	 Not all DMAPI functions and structure fields	  *
+ * have been implemented.  Most importantly, the DMAPI functions	  *
+ * dm_request_right, dm_release_right, dm_query_right, dm_upgrade_right	  *
+ * and dm_downgrade_right do not work as described in the specification.  *
+ *									  *
+ * The XFS filesystem currently does not allow its locking mechanisms to  *
+ * be externally accessed from user space.  While the above-mentioned	  *
+ * dm_xxx_right functions exist and can be called by applications, they	  *
+ * always return successfully without actually obtaining any locks	  *
+ * within the filesystem.						  *
+ *									  *
+ * Applications which do not need full rights support and which only	  *
+ * make dm_xxx_right calls in order to satisfy the input requirements of  *
+ * other DMAPI calls should be able to use these routines to avoid	  *
+ * having to implement special-case code for SGI platforms.  Applications *
+ * which truly need the capabilities of a full implementation of rights   *
+ * will unfortunately have to come up with alternate software solutions	  *
+ * until such time as rights can be completely implemented.		  *
+ *									  *
+ * Functions and structure fields defined within this file which are not  *
+ * supported in the SGI implementation of DMAPI are indicated by comments *
+ * following their definitions such as "not supported", or "not		  *
+ * completely supported".  Any function or field not so marked may be	  *
+ * assumed to work exactly according to the spec.			  *
+ *									  *
+ **************************************************************************/
+
+
+
+/* The first portion of this file contains defines and typedefs that are
+   DMAPI implementation-dependent, and could be different on other platforms.
+*/
+
+typedef __s64		dm_attrloc_t;
+typedef unsigned int	dm_boolean_t;
+typedef __u64		dm_eventset_t;
+typedef __u64		dm_fsid_t;
+typedef __u64		dm_ino_t;
+typedef __u32		dm_igen_t;
+typedef __s64		dm_off_t;
+typedef unsigned int	dm_sequence_t;
+typedef int		dm_sessid_t;
+typedef __u64		dm_size_t;
+typedef __s64		dm_ssize_t;
+typedef int		dm_token_t;
+
+/* XXX dev_t, mode_t, and nlink_t are not the same size in kernel space
+   and user space.  This affects the field offsets for dm_stat_t.
+   The following solution is temporary.
+
+   user space sizes:   dev_t=8  mode_t=4  nlink_t=4
+   kernel space sizes: dev_t=2  mode_t=2  nlink_t=2
+
+*/
+typedef __s64		dm_dev_t;
+typedef int		dm_mode_t;
+typedef int		dm_nlink_t;
+
+
+#define DM_REGION_NOEVENT	0x0
+#define DM_REGION_READ		0x1
+#define DM_REGION_WRITE		0x2
+#define DM_REGION_TRUNCATE	0x4
+
+/* Values for the mask argument used with dm_get_fileattr, dm_get_bulkattr,
+   dm_get_dirattrs, and dm_set_fileattr.
+*/
+
+#define DM_AT_MODE	0x0001
+#define DM_AT_UID	0x0002
+#define DM_AT_GID	0x0004
+#define DM_AT_ATIME	0x0008
+#define DM_AT_MTIME	0x0010
+#define DM_AT_CTIME	0x0020
+#define DM_AT_SIZE	0x0040
+#define DM_AT_DTIME	0x0080
+#define DM_AT_HANDLE	0x0100
+#define DM_AT_EMASK	0x0200
+#define DM_AT_PMANR	0x0400
+#define DM_AT_PATTR	0x0800
+#define DM_AT_STAT	0x1000
+#define DM_AT_CFLAG	0x2000
+
+#define DM_EV_WAIT	0x1		/* used in dm_get_events() */
+
+#define DM_MOUNT_RDONLY 0x1		/* me_mode field in dm_mount_event_t */
+
+#define DM_RR_WAIT	0x1
+
+#define DM_UNMOUNT_FORCE 0x1		/* ne_mode field in dm_namesp_event_t */
+
+#define DM_WRITE_SYNC	0x1		/* used in dm_write_invis() */
+
+#define DM_SESSION_INFO_LEN	256
+#define DM_NO_SESSION		0
+#define DM_TRUE			1
+#define DM_FALSE		0
+#define DM_INVALID_TOKEN	0
+#define DM_NO_TOKEN		(-1)
+#define DM_INVALID_HANP		NULL
+#define DM_INVALID_HLEN		0
+#define DM_GLOBAL_HANP		((void *)(1LL))
+#define DM_GLOBAL_HLEN		((size_t)(1))
+#define DM_VER_STR_CONTENTS	"SGI DMAPI (XDSM) API, Release 1.0."
+
+
+#define DMEV_SET(event_type, event_list) \
+	((event_list) |= (1 << (event_type)))
+#define DMEV_CLR(event_type, event_list) \
+	((event_list) &= ~(1 << (event_type)))
+#define DMEV_ISSET(event_type, event_list) \
+	(int)(((event_list) & (1 << (event_type))) != 0)
+#define DMEV_ZERO(event_list) \
+	(event_list) = 0
+
+
+typedef struct {
+	int	vd_offset;	/* offset from start of containing struct */
+	unsigned int	vd_length;	/* length of data starting at vd_offset */
+} dm_vardata_t;
+
+#define DM_GET_VALUE(p, field, type) \
+	((type) ((char *)(p) + (p)->field.vd_offset))
+
+#define DM_GET_LEN(p, field) \
+	((p)->field.vd_length)
+
+#define DM_STEP_TO_NEXT(p, type) \
+	((type) ((p)->_link ? (char *)(p) + (p)->_link : NULL))
+
+
+
+
+/* The remainder of this include file contains defines, typedefs, and
+   structures which are strictly defined by the DMAPI 2.3 specification.
+
+   (The _link field which appears in several structures is an
+   implementation-specific way to implement DM_STEP_TO_NEXT, and
+   should not be referenced directly by application code.)
+*/
+
+
+#define DM_ATTR_NAME_SIZE	8
+
+
+struct dm_attrname {
+	unsigned char	an_chars[DM_ATTR_NAME_SIZE];
+};
+typedef struct dm_attrname	dm_attrname_t;
+
+
+struct dm_attrlist {
+	int		_link;
+	dm_attrname_t	al_name;
+	dm_vardata_t	al_data;
+};
+typedef struct dm_attrlist	dm_attrlist_t;
+
+
+typedef enum {
+	DM_CONFIG_INVALID,
+	DM_CONFIG_BULKALL,
+	DM_CONFIG_CREATE_BY_HANDLE,
+	DM_CONFIG_DTIME_OVERLOAD,
+	DM_CONFIG_LEGACY,
+	DM_CONFIG_LOCK_UPGRADE,
+	DM_CONFIG_MAX_ATTR_ON_DESTROY,
+	DM_CONFIG_MAX_ATTRIBUTE_SIZE,
+	DM_CONFIG_MAX_HANDLE_SIZE,
+	DM_CONFIG_MAX_MANAGED_REGIONS,
+	DM_CONFIG_MAX_MESSAGE_DATA,
+	DM_CONFIG_OBJ_REF,
+	DM_CONFIG_PENDING,
+	DM_CONFIG_PERS_ATTRIBUTES,
+	DM_CONFIG_PERS_EVENTS,
+	DM_CONFIG_PERS_INHERIT_ATTRIBS,
+	DM_CONFIG_PERS_MANAGED_REGIONS,
+	DM_CONFIG_PUNCH_HOLE,
+	DM_CONFIG_TOTAL_ATTRIBUTE_SPACE,
+	DM_CONFIG_WILL_RETRY
+} dm_config_t;
+
+
+struct	dm_dioinfo {			/* non-standard SGI addition */
+	unsigned int	d_mem;
+	unsigned int	d_miniosz;
+	unsigned int	d_maxiosz;
+	dm_boolean_t	d_dio_only;
+};
+typedef struct dm_dioinfo	dm_dioinfo_t;
+
+
+struct dm_dispinfo {
+	int		_link;
+	unsigned int	di_pad1;		/* reserved; do not reference */
+	dm_vardata_t	di_fshandle;
+	dm_eventset_t	di_eventset;
+};
+typedef struct dm_dispinfo	dm_dispinfo_t;
+
+
+typedef enum {
+	DM_EVENT_INVALID	= -1,
+	DM_EVENT_CANCEL		= 0,		/* not supported */
+	DM_EVENT_MOUNT		= 1,
+	DM_EVENT_PREUNMOUNT	= 2,
+	DM_EVENT_UNMOUNT	= 3,
+	DM_EVENT_DEBUT		= 4,		/* not supported */
+	DM_EVENT_CREATE		= 5,
+	DM_EVENT_CLOSE		= 6,		/* not supported */
+	DM_EVENT_POSTCREATE	= 7,
+	DM_EVENT_REMOVE		= 8,
+	DM_EVENT_POSTREMOVE	= 9,
+	DM_EVENT_RENAME		= 10,
+	DM_EVENT_POSTRENAME	= 11,
+	DM_EVENT_LINK		= 12,
+	DM_EVENT_POSTLINK	= 13,
+	DM_EVENT_SYMLINK	= 14,
+	DM_EVENT_POSTSYMLINK	= 15,
+	DM_EVENT_READ		= 16,
+	DM_EVENT_WRITE		= 17,
+	DM_EVENT_TRUNCATE	= 18,
+	DM_EVENT_ATTRIBUTE	= 19,
+	DM_EVENT_DESTROY	= 20,
+	DM_EVENT_NOSPACE	= 21,
+	DM_EVENT_USER		= 22,
+	DM_EVENT_MAX		= 23
+} dm_eventtype_t;
+
+
+struct dm_eventmsg {
+	int		_link;
+	dm_eventtype_t	ev_type;
+	dm_token_t	ev_token;
+	dm_sequence_t	ev_sequence;
+	dm_vardata_t	ev_data;
+};
+typedef struct dm_eventmsg	dm_eventmsg_t;
+
+
+struct dm_cancel_event {			/* not supported */
+	dm_sequence_t	ce_sequence;
+	dm_token_t	ce_token;
+};
+typedef struct dm_cancel_event	dm_cancel_event_t;
+
+
+struct dm_data_event {
+	dm_vardata_t	de_handle;
+	dm_off_t	de_offset;
+	dm_size_t	de_length;
+};
+typedef struct dm_data_event dm_data_event_t;
+
+struct dm_destroy_event {
+	dm_vardata_t		ds_handle;
+	dm_attrname_t		ds_attrname;
+	dm_vardata_t		ds_attrcopy;
+};
+typedef struct dm_destroy_event dm_destroy_event_t;
+
+struct dm_mount_event {
+	dm_mode_t	me_mode;
+	dm_vardata_t	me_handle1;
+	dm_vardata_t	me_handle2;
+	dm_vardata_t	me_name1;
+	dm_vardata_t	me_name2;
+	dm_vardata_t	me_roothandle;
+};
+typedef struct dm_mount_event dm_mount_event_t;
+
+struct dm_namesp_event {
+	dm_mode_t	ne_mode;
+	dm_vardata_t	ne_handle1;
+	dm_vardata_t	ne_handle2;
+	dm_vardata_t	ne_name1;
+	dm_vardata_t	ne_name2;
+	int		ne_retcode;
+};
+typedef struct dm_namesp_event dm_namesp_event_t;
+
+
+typedef enum {
+	DM_EXTENT_INVALID,
+	DM_EXTENT_RES,
+	DM_EXTENT_HOLE
+} dm_extenttype_t;
+
+
+struct dm_extent {
+	dm_extenttype_t ex_type;
+	unsigned int	ex_pad1;		/* reserved; do not reference */
+	dm_off_t	ex_offset;
+	dm_size_t	ex_length;
+};
+typedef struct dm_extent dm_extent_t;
+
+struct dm_fileattr {
+	dm_mode_t	fa_mode;
+	uid_t		fa_uid;
+	gid_t		fa_gid;
+	time_t		fa_atime;
+	time_t		fa_mtime;
+	time_t		fa_ctime;
+	time_t		fa_dtime;
+	unsigned int	fa_pad1;		/* reserved; do not reference */
+	dm_off_t	fa_size;
+};
+typedef struct dm_fileattr dm_fileattr_t;
+
+
+struct dm_inherit {				/* not supported */
+	dm_attrname_t	ih_name;
+	dm_mode_t	ih_filetype;
+};
+typedef struct dm_inherit dm_inherit_t;
+
+
+typedef enum {
+	DM_MSGTYPE_INVALID,
+	DM_MSGTYPE_SYNC,
+	DM_MSGTYPE_ASYNC
+} dm_msgtype_t;
+
+
+struct dm_region {
+	dm_off_t	rg_offset;
+	dm_size_t	rg_size;
+	unsigned int	rg_flags;
+	unsigned int	rg_pad1;		/* reserved; do not reference */
+};
+typedef struct dm_region dm_region_t;
+
+
+typedef enum {
+	DM_RESP_INVALID,
+	DM_RESP_CONTINUE,
+	DM_RESP_ABORT,
+	DM_RESP_DONTCARE
+} dm_response_t;
+
+
+typedef enum {
+	DM_RIGHT_NULL,
+	DM_RIGHT_SHARED,
+	DM_RIGHT_EXCL
+} dm_right_t;
+
+
+struct dm_stat {
+	int		_link;
+	dm_vardata_t	dt_handle;
+	dm_vardata_t	dt_compname;
+	int		dt_nevents;
+	dm_eventset_t	dt_emask;
+	int		dt_pers;		/* field not supported */
+	int		dt_pmanreg;
+	time_t		dt_dtime;
+	unsigned int	dt_change;		/* field not supported */
+	unsigned int	dt_pad1;		/* reserved; do not reference */
+	dm_dev_t	dt_dev;
+	dm_ino_t	dt_ino;
+	dm_mode_t	dt_mode;
+	dm_nlink_t	dt_nlink;
+	uid_t		dt_uid;
+	gid_t		dt_gid;
+	dm_dev_t	dt_rdev;
+	unsigned int	dt_pad2;		/* reserved; do not reference */
+	dm_off_t	dt_size;
+	time_t		dt_atime;
+	time_t		dt_mtime;
+	time_t		dt_ctime;
+	unsigned int	dt_blksize;
+	dm_size_t	dt_blocks;
+
+	/* Non-standard filesystem-specific fields.  Currently XFS is the only
+	   supported filesystem type.
+	*/
+
+	__u64	dt_pad3;	/* reserved; do not reference */
+	int		dt_fstype;	/* filesystem index; see sysfs(2) */
+	union	{
+		struct	{
+			dm_igen_t	igen;
+			unsigned int	xflags;
+			unsigned int	extsize;
+			unsigned int	extents;
+			unsigned short	aextents;
+			unsigned short	dmstate;
+		} sgi_xfs;
+	} fsys_dep;
+};
+typedef struct dm_stat	dm_stat_t;
+
+#define dt_xfs_igen	fsys_dep.sgi_xfs.igen
+#define dt_xfs_xflags	fsys_dep.sgi_xfs.xflags
+#define dt_xfs_extsize	fsys_dep.sgi_xfs.extsize
+#define dt_xfs_extents	fsys_dep.sgi_xfs.extents
+#define dt_xfs_aextents fsys_dep.sgi_xfs.aextents
+#define dt_xfs_dmstate	fsys_dep.sgi_xfs.dmstate
+
+/* Flags for the non-standard dt_xfs_xflags field. */
+
+#define DM_XFLAG_REALTIME	0x1
+#define DM_XFLAG_PREALLOC	0x2
+#define DM_XFLAG_HASATTR	0x80000000
+
+
+struct	dm_timestruct {
+	time_t		dm_tv_sec;
+	int		dm_tv_nsec;
+};
+typedef struct dm_timestruct dm_timestruct_t;
+
+
+struct	dm_xstat {				/* not supported */
+	dm_stat_t	dx_statinfo;
+	dm_vardata_t	dx_attrdata;
+};
+typedef struct dm_xstat dm_xstat_t;
+
+
+
+/* The following list provides the prototypes for all functions defined in
+   the DMAPI interface.
+*/
+
+extern int
+dm_clear_inherit(				/* not supported */
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep);
+
+extern int
+dm_create_by_handle(				/* not supported */
+	dm_sessid_t	sid,
+	void		*dirhanp,
+	size_t		dirhlen,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname);
+
+extern int
+dm_create_session(
+	dm_sessid_t	oldsid,
+	char		*sessinfop,
+	dm_sessid_t	*newsidp);
+
+extern int
+dm_create_userevent(
+	dm_sessid_t	sid,
+	size_t		msglen,
+	void		*msgdatap,
+	dm_token_t	*tokenp);
+
+extern int
+dm_destroy_session(
+	dm_sessid_t	sid);
+
+extern int
+dm_downgrade_right(		/* not completely supported; see caveat above */
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token);
+
+extern int
+dm_fd_to_handle(
+	int		fd,
+	void		**hanpp,
+	size_t		*hlenp);
+
+extern int
+dm_find_eventmsg(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp);
+
+extern int
+dm_get_allocinfo(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	*offp,
+	unsigned int	nelem,
+	dm_extent_t	*extentp,
+	unsigned int	*nelemp);
+
+extern int
+dm_get_bulkall(					/* not supported */
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	unsigned int	mask,
+	dm_attrname_t	*attrnamep,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp);
+
+extern int
+dm_get_bulkattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	unsigned int	mask,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp);
+
+extern int
+dm_get_config(
+	void		*hanp,
+	size_t		hlen,
+	dm_config_t	flagname,
+	dm_size_t	*retvalp);
+
+extern int
+dm_get_config_events(
+	void		*hanp,
+	size_t		hlen,
+	unsigned int	nelem,
+	dm_eventset_t	*eventsetp,
+	unsigned int	*nelemp);
+
+extern int
+dm_get_dirattrs(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	unsigned int	mask,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp);
+
+extern int
+dm_get_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp);
+
+extern int
+dm_get_eventlist(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	unsigned int	nelem,
+	dm_eventset_t	*eventsetp,
+	unsigned int	*nelemp);
+
+extern int
+dm_get_events(
+	dm_sessid_t	sid,
+	unsigned int	maxmsgs,
+	unsigned int	flags,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp);
+
+extern int
+dm_get_fileattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	unsigned int	mask,
+	dm_stat_t	*statp);
+
+extern int
+dm_get_mountinfo(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp);
+
+extern int
+dm_get_region(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	unsigned int	nelem,
+	dm_region_t	*regbufp,
+	unsigned int	*nelemp);
+
+extern int
+dm_getall_disp(
+	dm_sessid_t	sid,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp);
+
+extern int
+dm_getall_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp);
+
+extern int
+dm_getall_inherit(				/* not supported */
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	unsigned int	nelem,
+	dm_inherit_t	*inheritbufp,
+	unsigned int	*nelemp);
+
+extern int
+dm_getall_sessions(
+	unsigned int	nelem,
+	dm_sessid_t	*sidbufp,
+	unsigned int	*nelemp);
+
+extern int
+dm_getall_tokens(
+	dm_sessid_t	sid,
+	unsigned int	nelem,
+	dm_token_t	*tokenbufp,
+	unsigned int	*nelemp);
+
+extern int
+dm_handle_cmp(
+	void		*hanp1,
+	size_t		hlen1,
+	void		*hanp2,
+	size_t		hlen2);
+
+extern void
+dm_handle_free(
+	void		*hanp,
+	size_t		hlen);
+
+extern u_int
+dm_handle_hash(
+	void		*hanp,
+	size_t		hlen);
+
+extern dm_boolean_t
+dm_handle_is_valid(
+	void		*hanp,
+	size_t		hlen);
+
+extern int
+dm_handle_to_fshandle(
+	void		*hanp,
+	size_t		hlen,
+	void		**fshanpp,
+	size_t		*fshlenp);
+
+extern int
+dm_handle_to_fsid(
+	void		*hanp,
+	size_t		hlen,
+	dm_fsid_t	*fsidp);
+
+extern int
+dm_handle_to_igen(
+	void		*hanp,
+	size_t		hlen,
+	dm_igen_t	*igenp);
+
+extern int
+dm_handle_to_ino(
+	void		*hanp,
+	size_t		hlen,
+	dm_ino_t	*inop);
+
+extern int
+dm_handle_to_path(
+	void		*dirhanp,
+	size_t		dirhlen,
+	void		*targhanp,
+	size_t		targhlen,
+	size_t		buflen,
+	char		*pathbufp,
+	size_t		*rlenp);
+
+extern int
+dm_init_attrloc(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrloc_t	*locp);
+
+extern int
+dm_init_service(
+	char		**versionstrpp);
+
+extern int
+dm_make_handle(
+	dm_fsid_t	*fsidp,
+	dm_ino_t	*inop,
+	dm_igen_t	*igenp,
+	void		**hanpp,
+	size_t		*hlenp);
+
+extern int
+dm_make_fshandle(
+	dm_fsid_t	*fsidp,
+	void		**hanpp,
+	size_t		*hlenp);
+
+extern int
+dm_mkdir_by_handle(				/* not supported */
+	dm_sessid_t	sid,
+	void		*dirhanp,
+	size_t		dirhlen,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname);
+
+extern int
+dm_move_event(
+	dm_sessid_t	srcsid,
+	dm_token_t	token,
+	dm_sessid_t	targetsid,
+	dm_token_t	*rtokenp);
+
+extern int
+dm_obj_ref_hold(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen);
+
+extern int
+dm_obj_ref_query(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen);
+
+extern int
+dm_obj_ref_rele(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen);
+
+extern int
+dm_path_to_fshandle(
+	char		*path,
+	void		**hanpp,
+	size_t		*hlenp);
+
+extern int
+dm_path_to_handle(
+	char		*path,
+	void		**hanpp,
+	size_t		*hlenp);
+
+extern int
+dm_pending(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	dm_timestruct_t *delay);
+
+extern int
+dm_probe_hole(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	off,
+	dm_size_t	len,
+	dm_off_t	*roffp,
+	dm_size_t	*rlenp);
+
+extern int
+dm_punch_hole(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	off,
+	dm_size_t	len);
+
+extern int
+dm_query_right(			/* not completely supported; see caveat above */
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_right_t	*rightp);
+
+extern int
+dm_query_session(
+	dm_sessid_t	sid,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp);
+
+extern dm_ssize_t
+dm_read_invis(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	off,
+	dm_size_t	len,
+	void		*bufp);
+
+extern int
+dm_release_right(		/* not completely supported; see caveat above */
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token);
+
+extern int
+dm_remove_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	int		setdtime,
+	dm_attrname_t	*attrnamep);
+
+extern int
+dm_request_right(		/* not completely supported; see caveat above */
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	unsigned int	flags,
+	dm_right_t	right);
+
+extern int
+dm_respond_event(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	dm_response_t	response,
+	int		reterror,
+	size_t		buflen,
+	void		*respbufp);
+
+extern int
+dm_send_msg(
+	dm_sessid_t	targetsid,
+	dm_msgtype_t	msgtype,
+	size_t		buflen,
+	void		*bufp);
+
+extern int
+dm_set_disp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_eventset_t	*eventsetp,
+	unsigned int	maxevent);
+
+extern int
+dm_set_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep,
+	int		setdtime,
+	size_t		buflen,
+	void		*bufp);
+
+extern int
+dm_set_eventlist(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_eventset_t	*eventsetp,
+	unsigned int	maxevent);
+
+extern int
+dm_set_fileattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	unsigned int	mask,
+	dm_fileattr_t	*attrp);
+
+extern int
+dm_set_inherit(					/* not supported */
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep,
+	mode_t		mode);
+
+extern int
+dm_set_region(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	unsigned int	nelem,
+	dm_region_t	*regbufp,
+	dm_boolean_t	*exactflagp);
+
+extern int
+dm_set_return_on_destroy(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep,
+	dm_boolean_t	enable);
+
+extern int
+dm_symlink_by_handle(				/* not supported */
+	dm_sessid_t	sid,
+	void		*dirhanp,
+	size_t		dirhlen,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname,
+	char		*path);
+
+extern int
+dm_sync_by_handle(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token);
+
+extern int
+dm_upgrade_right(		/* not completely supported; see caveat above */
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token);
+
+extern dm_ssize_t
+dm_write_invis(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	int		flags,
+	dm_off_t	off,
+	dm_size_t	len,
+	void		*bufp);
+
+/* Non-standard SGI additions to the DMAPI interface. */
+
+extern int
+dm_open_by_handle(
+	void		*hanp,
+	size_t		hlen,
+	int		mode);
+
+extern int
+dm_get_dioinfo(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_dioinfo_t	*diop);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DMAPI_H */
+
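
As a user-space illustration of the interfaces declared above, here is a
minimal sketch that creates a session, registers for namespace events on one
filesystem, and reads a single batch of messages.  Only names from this header
are used; /mnt/xfs is a placeholder mount point, the buffer size is arbitrary,
error handling is abbreviated, and a real DM application would also answer
each synchronous event with dm_respond_event():

	#include <stdio.h>
	#include "dmapi.h"

	int watch_fs_events(void)
	{
		char		*version;
		dm_sessid_t	sid;
		void		*fshanp;
		size_t		fshlen, rlen;
		dm_eventset_t	events;
		char		buf[4096];
		dm_eventmsg_t	*msg;

		/* One-time library setup, then create a session. */
		if (dm_init_service(&version) != 0)
			return -1;
		if (dm_create_session(DM_NO_SESSION, "sketch", &sid) != 0)
			return -1;

		/* Get a filesystem handle for the placeholder mount point. */
		if (dm_path_to_fshandle("/mnt/xfs", &fshanp, &fshlen) != 0)
			return -1;

		/* Ask for create/remove events, using the eventset macros
		 * defined above; the disposition directs them to us. */
		DMEV_ZERO(events);
		DMEV_SET(DM_EVENT_CREATE, events);
		DMEV_SET(DM_EVENT_REMOVE, events);
		if (dm_set_disp(sid, fshanp, fshlen, DM_NO_TOKEN,
				&events, DM_EVENT_MAX) != 0)
			return -1;
		if (dm_set_eventlist(sid, fshanp, fshlen, DM_NO_TOKEN,
				     &events, DM_EVENT_MAX) != 0)
			return -1;

		/* Wait for one batch of messages, then walk it with the
		 * DM_STEP_TO_NEXT macro. */
		if (dm_get_events(sid, 0, DM_EV_WAIT,
				  sizeof(buf), buf, &rlen) != 0)
			return -1;
		for (msg = (dm_eventmsg_t *)buf; msg != NULL;
		     msg = DM_STEP_TO_NEXT(msg, dm_eventmsg_t *))
			printf("event %d, token %d\n",
			       (int)msg->ev_type, (int)msg->ev_token);

		dm_handle_free(fshanp, fshlen);
		return 0;
	}
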
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_attr.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_attr.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_attr.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_attr.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+/* Retrieve attributes for a single file, directory or symlink. */
+
+int
+dm_get_fileattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		mask,
+	dm_stat_t	*statp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->get_fileattr(tdp->td_vp, tdp->td_right,
+		mask, statp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+/* Set one or more file attributes of a file, directory, or symlink. */
+
+int
+dm_set_fileattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		mask,
+	dm_fileattr_t	*attrp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->set_fileattr(tdp->td_vp, tdp->td_right,
+		mask, attrp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
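
For reference, a sketch of how an application might drive the first entry
point above from user space.  It assumes a session and file handle were
already obtained (e.g. via dm_create_session() and dm_path_to_handle()) and
uses only names from dmapi.h:

	#include <stdio.h>
	#include "dmapi.h"

	/* Fetch the size and owner of a file through DMAPI rather than
	 * stat(2), selecting fields with the DM_AT_* mask bits.
	 */
	int print_file_info(dm_sessid_t sid, void *hanp, size_t hlen)
	{
		dm_stat_t	st;
		int		error;

		error = dm_get_fileattr(sid, hanp, hlen, DM_NO_TOKEN,
					DM_AT_SIZE | DM_AT_UID, &st);
		if (error != 0)
			return error;
		printf("size=%lld uid=%d\n",
		       (long long)st.dt_size, (int)st.dt_uid);
		return 0;
	}
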
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_bulkattr.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_bulkattr.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_bulkattr.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_bulkattr.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+int
+dm_init_attrloc(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrloc_t	*locp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS|DM_TDT_DIR,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->init_attrloc(tdp->td_vp, tdp->td_right, locp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+/*
+ * Retrieves both standard and DM-specific file attributes for the file
+ * system indicated by the handle.  (The FS has to be mounted.)
+ * The syscall returns 1 to indicate success with more information still
+ * available, 0 upon successful completion (no more data), and -1 on
+ * error with errno set appropriately.
+ */
+
+int
+dm_get_bulkattr_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		mask,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->get_bulkattr_rvp(tdp->td_vp, tdp->td_right,
+			mask, locp, buflen, bufp, rlenp, rvp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
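+
+/* Illustrative application-side sketch (not part of this patch) of how
+   the 1/0/-1 return convention above is typically consumed, assuming the
+   standard libdm entry points dm_init_attrloc() and dm_get_bulkattr():
+
+	dm_attrloc_t	loc;
+	int		rv;
+
+	if (dm_init_attrloc(sid, fshanp, fshlen, DM_NO_TOKEN, &loc))
+		return;
+	while ((rv = dm_get_bulkattr(sid, fshanp, fshlen, DM_NO_TOKEN,
+			DM_AT_STAT, &loc, buflen, bufp, &rlen)) == 1) {
+		... walk the dm_stat_t records in bufp ...
+	}
+
+   Here rv == 0 means the last batch was returned, and rv == -1 means an
+   error occurred with errno set.
+*/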
+
+
+/*
+ * Retrieves attributes of directory entries given a handle to that
+ * directory.  Iterative.
+ * The syscall returns 1 to indicate success with more information still
+ * available, 0 upon successful completion (no more data), and -1 on
+ * error with errno set appropriately.
+ */
+
+int
+dm_get_dirattrs_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		mask,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_DIR,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->get_dirattrs_rvp(tdp->td_vp, tdp->td_right,
+		mask, locp, buflen, bufp, rlenp, rvp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_get_bulkall_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		mask,
+	dm_attrname_t	*attrnamep,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->get_bulkall_rvp(tdp->td_vp, tdp->td_right,
+		mask, attrnamep, locp, buflen, bufp, rlenp, rvp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_config.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_config.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_config.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_config.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+int
+dm_get_config(
+	void		*hanp,
+	size_t		hlen,
+	dm_config_t	flagname,
+	dm_size_t	*retvalp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	dm_size_t	retval;
+	int		system = 1;
+	int		error;
+
+	/* Trap and process configuration parameters which are system-wide. */
+
+	switch (flagname) {
+	case DM_CONFIG_LEGACY:
+	case DM_CONFIG_PENDING:
+	case DM_CONFIG_OBJ_REF:
+		retval = DM_TRUE;
+		break;
+	case DM_CONFIG_MAX_MESSAGE_DATA:
+		retval = DM_MAX_MSG_DATA;
+		break;
+	default:
+		system = 0;
+		break;
+	}
+	if (system) {
+		if (copy_to_user(retvalp, &retval, sizeof(retval)))
+			return(EFAULT);
+		return(0);
+	}
+
+	/* Must be filesystem-specific.	 Convert the handle into a vnode. */
+
+	if ((error = dm_get_config_tdp(hanp, hlen, &tdp)) != 0)
+		return(error);
+
+	/* Now call the filesystem-specific routine to determine the
+	   value of the configuration option for that filesystem.
+	*/
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->get_config(tdp->td_vp, tdp->td_right,
+		flagname, retvalp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
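+
+/* Illustrative sketch (not part of this patch): an application would
+   typically query a configuration value like this, assuming the libdm
+   wrapper dm_get_config() with the same argument list as above:
+
+	dm_size_t	maxmsg;
+
+	if (dm_get_config(hanp, hlen, DM_CONFIG_MAX_MESSAGE_DATA, &maxmsg))
+		... handle the error ...
+
+   System-wide options such as DM_CONFIG_MAX_MESSAGE_DATA are answered
+   directly from the switch above; anything else is routed through the
+   filesystem's get_config vector entry.
+*/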
+
+
+int
+dm_get_config_events(
+	void		*hanp,
+	size_t		hlen,
+	u_int		nelem,
+	dm_eventset_t	*eventsetp,
+	u_int		*nelemp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	/* Convert the handle into a vnode. */
+
+	if ((error = dm_get_config_tdp(hanp, hlen, &tdp)) != 0)
+		return(error);
+
+	/* Now call the filesystem-specific routine to determine the
+	   events supported by that filesystem.
+	*/
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->get_config_events(tdp->td_vp, tdp->td_right,
+		nelem, eventsetp, nelemp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_dmattr.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_dmattr.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_dmattr.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_dmattr.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+int
+dm_clear_inherit(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->clear_inherit(tdp->td_vp, tdp->td_right,
+		attrnamep);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_get_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->get_dmattr(tdp->td_vp, tdp->td_right,
+		attrnamep, buflen, bufp, rlenp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_getall_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->getall_dmattr(tdp->td_vp, tdp->td_right,
+		buflen, bufp, rlenp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_getall_inherit(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		nelem,
+	dm_inherit_t	*inheritbufp,
+	u_int		*nelemp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->getall_inherit(tdp->td_vp, tdp->td_right,
+		nelem, inheritbufp, nelemp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_remove_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	int		setdtime,
+	dm_attrname_t	*attrnamep)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->remove_dmattr(tdp->td_vp, tdp->td_right,
+		setdtime, attrnamep);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_set_dmattr(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep,
+	int		setdtime,
+	size_t		buflen,
+	void		*bufp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->set_dmattr(tdp->td_vp, tdp->td_right,
+		attrnamep, setdtime, buflen, bufp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
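+
+/* Illustrative sketch (not part of this patch): a typical application
+   round-trip through the two calls above, assuming the libdm wrappers
+   dm_set_dmattr() and dm_get_dmattr() with matching argument lists:
+
+	dm_attrname_t	attrname;
+	size_t		rlen;
+
+	memset(&attrname, 0, sizeof(attrname));
+	strcpy((char *)attrname.an_chars, "mydata");
+	if (dm_set_dmattr(sid, hanp, hlen, token, &attrname,
+			0, buflen, bufp))	... setdtime of zero ...
+		... handle the error ...
+	if (dm_get_dmattr(sid, hanp, hlen, token, &attrname,
+			buflen, bufp, &rlen))
+		... handle the error ...
+*/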
+
+
+int
+dm_set_inherit(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep,
+	mode_t		mode)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->set_inherit(tdp->td_vp, tdp->td_right,
+		attrnamep, mode);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_event.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_event.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_event.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_event.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,859 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+/* The "rights" portion of the DMAPI spec is not currently implemented.	 A
+   framework for rights is provided in the code, but turns out to be a noop
+   in practice.	 The following comments are a brain dump to serve as input to
+   the poor soul that eventually has to get DMAPI rights working in IRIX.
+
+   A DMAPI right is similar but not identical to the mrlock_t mechanism
+   already used within the kernel.  The similarities are that it is a
+   sleeping lock, and that a multiple-reader, single-writer protocol is used.
+   How locks are obtained and dropped are different however.  With a mrlock_t,
+   a thread grabs the lock, does some stuff, then drops the lock, and all other
+   threads block in the meantime (assuming a write lock).  There is a one-to-
+   one relationship between the lock and the thread which obtained the lock.
+   Not so with DMAPI right locks.  A DMAPI lock is associated with a particular
+   session/token/hanp/hlen quad; since there is a dm_tokdata_t structure for
+   each such quad, you can think of it as a one-to-one relationship between the
+   lock and a dm_tokdata_t.  Any application thread which presents the correct
+   quad is entitled to grab or release the lock, or to use the rights
+   associated with that lock.  The thread that grabs the lock does not have to
+   be the one to use the lock, nor does it have to be the thread which drops
+   the lock.  The lock can be held for very long periods of time, even across
+   multiple system calls by multiple application threads.  The idea is that a
+   coordinated group of DMAPI application threads can grab the lock, issue a
+   series of inode accesses and/or updates, then drop the lock, and be assured
+   that no other thread in the system could be modifying the inode at the same
+   time.  The kernel is expected to blindly trust that the application will
+   not forget to unlock inodes it has locked, and will not deadlock itself
+   against the kernel.
+
+   There are two types of DMAPI rights, file object (inode) and filesystem
+   object (superblock?).  An inode right is the equivalent of the combination
+   of both the XFS ilock and iolock; if held exclusively, no data or metadata
+   within the file can be changed by non-lock-holding threads.	The filesystem
+   object lock is a little fuzzier; I think that if it is held, things like
+   unmounts can be blocked, plus there is an event mask associated with the
+   filesystem which can't be updated without the lock.	(By the way, that
+   event mask is supposed to be persistent in the superblock; add that to
+   your worklist :-)
+
+   All events generated by XFS currently arrive with no rights, i.e.
+   DM_RIGHT_NULL, and return to the filesystem with no rights.	It would be
+   smart to leave it this way if possible, because it otherwise becomes more
+   likely that an application thread will deadlock against the kernel if the
+   one responsible for calling dm_get_events() happens to touch a file which
+   was locked at the time the event was queued.	 Since the thread is blocked,
+   it can't read the event in order to find and drop the lock.	Catch-22.  If
+   you do have events that arrive with non-null rights, then dm_enqueue() needs
+   to have code added for synchronous events which atomically switches the
+   right from being a thread-based right to a dm_tokdata_t-based right without
+   allowing the lock to drop in between.  You will probably have to add a new
+   dm_fsys_vector entry point to do this.  The lock can't be lost during the
+   switch, or other threads might change the inode or superblock in between.
+   Likewise, if you need to return to the filesystem holding a right, then
+   you need a DMAPI-to-thread atomic switch to occur, most likely in
+   dm_change_right().  Again, the lock must not be lost during the switch; the
+   DMAPI spec spends a couple of pages stressing this.	Another dm_fsys_vector
+   entry point is probably the answer.
+
+   There are several assumptions implied in the current layout of the code.
+   First of all, if an event returns to the filesystem with a return value of
+   zero, then the filesystem can assume that any locks (rights) held at the
+   start of the event are still in effect at the end of the event.  (Note that
+   the application could have temporarily dropped and reacquired the right
+   while the event was outstanding, however).  If the event returns to the
+   filesystem with an errno, then the filesystem must assume that it has lost
+   any and all rights associated with any of the objects in the event.	This
+   was done for a couple of reasons.  First of all, since an errno is being
+   returned, most likely the filesystem is going to immediately drop all the
+   locks anyway.  If the DMAPI code was required to unconditionally reobtain
+   all locks before returning to the filesystem, then dm_pending() wouldn't
+   work for NFS server threads because the process would block indefinitely
+   trying to get its thread-based rights back, because the DMAPI rights
+   associated with the dm_tokdata_t in the outstanding event would prevent
+   the rights from being obtained.  That would be a bad thing.	We wouldn't
+   be able to let users Ctrl-C out of read/write/truncate events either.
+
+   If a case should ever surface where the thread has lost its rights even
+   though it has a zero return status, or where the thread has rights even
+   though it is returning with an errno, then this logic will have to be
+   reworked.  This could be done by changing the 'right' parameters on all
+   the event calls to (dm_right_t *), so that they could serve both as IN
+   and OUT parameters.
+
+   Some events such as DM_EVENT_DESTROY arrive without holding a vnode
+   reference; if you don't have a vnode reference, you can't have a right
+   on the file.
+
+   One more quirk.  The DM_EVENT_UNMOUNT event is defined to be synchronous
+   even though its behavior is asynchronous.	If an unmount event arrives with
+   rights, the event should return with the same rights and should NOT leave
+   any rights in the dm_tokdata_t where the application could use them.
+*/
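+
+/* Illustrative sketch (not part of this patch) of the quad-based model
+   described above, assuming the standard libdm right calls: any thread
+   of the application that presents the same sid/token/hanp/hlen quad
+   may manipulate the right, e.g.
+
+	dm_request_right(sid, hanp, hlen, token, 0, DM_RIGHT_EXCL);
+	... any cooperating thread may now access the object, and a
+	    different thread may eventually drop the right with ...
+	dm_release_right(sid, hanp, hlen, token);
+*/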
+
+
+#define GETNEXTOFF(vdat)	((vdat).vd_offset + (vdat).vd_length)
+#define HANDLE_SIZE(tdp)	\
+	((tdp)->td_type & DM_TDT_VFS ? FSHSIZE : XFS_HSIZE((tdp)->td_handle))
+
+
+/* Given a vnode pointer in a filesystem known to support DMAPI,
+   build a tdp structure for the corresponding vnode.
+*/
+
+static dm_tokdata_t *
+dm_vp_data(
+	vnode_t		*vp,
+	dm_right_t	right,
+	int		referenced)	/* != 0, caller holds vnode reference */
+{
+	int		error;
+	dm_tokdata_t	*tdp;
+
+	tdp = kmem_cache_alloc(dm_tokdata_cachep, SLAB_KERNEL);
+	if (tdp == NULL) {
+		printk("%s/%d: kmem_cache_alloc(dm_tokdata_cachep) returned NULL\n", __FUNCTION__, __LINE__);
+		return NULL;
+	}
+
+	tdp->td_next = NULL;
+	tdp->td_tevp = NULL;
+	tdp->td_app_ref = 0;
+	tdp->td_orig_right = right;
+	tdp->td_right = right;
+	tdp->td_flags = DM_TDF_ORIG;
+	if (referenced) {
+		tdp->td_flags |= DM_TDF_EVTREF;
+	}
+
+	if (vp->v_type == VREG) {
+		tdp->td_type = DM_TDT_REG;
+	} else if (vp->v_type == VDIR) {
+		tdp->td_type = DM_TDT_DIR;
+	} else if (vp->v_type == VLNK) {
+		tdp->td_type = DM_TDT_LNK;
+	} else {
+		tdp->td_type = DM_TDT_OTH;
+	}
+
+	if (referenced) {
+		tdp->td_vp = vp;
+	} else {
+		tdp->td_vp = NULL;
+	}
+	tdp->td_vcount = 0;
+
+	if ((error = dm_vp_to_handle(vp, &tdp->td_handle)) != 0) {
+		panic("dm_vp_data: dm_vp_to_handle failed for vp %p in "
+			"a DMAPI filesystem, errno %d\n", vp, error);
+	}
+
+	return(tdp);
+}
+
+
+/* Given a vfs pointer to a filesystem known to support DMAPI, build a tdp
+   structure for that vfsp.
+*/
+static dm_tokdata_t *
+dm_vfs_data(
+	vfs_t		*vfsp,
+	vnode_t		*vp,		/* will be NULL for DM_EVENT_UNMOUNT */
+	dm_right_t	right)
+{
+	dm_tokdata_t	*tdp;
+
+	tdp = kmem_cache_alloc(dm_tokdata_cachep, SLAB_KERNEL);
+	if (tdp == NULL) {
+		printk("%s/%d: kmem_cache_alloc(dm_tokdata_cachep) returned NULL\n", __FUNCTION__, __LINE__);
+		return NULL;
+	}
+
+	tdp->td_next = NULL;
+	tdp->td_tevp = NULL;
+	tdp->td_app_ref = 0;
+	tdp->td_orig_right = right;
+	tdp->td_right = right;
+	tdp->td_flags = DM_TDF_ORIG;
+	if (vp) {
+		tdp->td_flags |= DM_TDF_EVTREF;
+	}
+	tdp->td_type = DM_TDT_VFS;
+	if (vp) {
+		tdp->td_vp = vp;
+	} else {
+		tdp->td_vp = NULL;
+	}
+	tdp->td_vcount = 0;
+
+	bcopy(vfsp->vfs_altfsid, &tdp->td_handle.ha_fsid, sizeof(fsid_t));
+	bzero((char *)&tdp->td_handle.ha_fsid + sizeof(fsid_t),
+		sizeof(tdp->td_handle) - sizeof(fsid_t));
+
+	return(tdp);
+}
+
+
+/* Link a tdp structure into the tevp. */
+
+static void
+dm_add_handle_to_event(
+	dm_tokevent_t	*tevp,
+	dm_tokdata_t	*tdp)
+{
+	tdp->td_next = tevp->te_tdp;
+	tevp->te_tdp = tdp;
+	tdp->td_tevp = tevp;
+}
+
+
+/* Generate the given data event for the vnode, and wait for a reply.  The
+   caller must guarantee that the vnode's reference count is greater than zero
+   so that the filesystem can't disappear while the request is outstanding.
+*/
+
+int
+dm_send_data_event(
+	dm_eventtype_t	event,
+	bhv_desc_t	*bdp,
+	dm_right_t	vp_right,	/* current right for vp */
+	off_t		offset,
+	size_t		length,
+	int		flags)		/* 0 or DM_FLAGS_NDELAY */
+{
+	dm_data_event_t *datap;
+	dm_tokevent_t	*tevp;
+	dm_tokdata_t	*tdp;
+	vnode_t		*vp;
+	int		error;
+
+	vp = BHV_TO_VNODE(bdp);
+	tdp = dm_vp_data(vp, vp_right, /* reference held */ 1);
+	if (tdp == NULL)
+		return ENOMEM;
+
+	/* Calculate the size of the event in bytes, create an event structure
+	   for it, and insert the file's handle into the event.
+	*/
+
+	tevp = dm_evt_create_tevp(event, HANDLE_SIZE(tdp), (void **)&datap);
+	if (tevp == NULL) {
+		kmem_cache_free(dm_tokdata_cachep, tdp);
+		return(ENOMEM);
+	}
+	dm_add_handle_to_event(tevp, tdp);
+
+	/* Now fill in all the dm_data_event_t fields. */
+
+	datap->de_handle.vd_offset = sizeof(*datap);
+	datap->de_handle.vd_length = HANDLE_SIZE(tdp);
+	bcopy(&tdp->td_handle, (char *)datap + datap->de_handle.vd_offset,
+		datap->de_handle.vd_length);
+	datap->de_offset = offset;
+	datap->de_length = length;
+
+	/* Queue the message and wait for the reply. */
+
+	error = dm_enqueue_normal_event(vp->v_vfsp, tevp, flags);
+
+	/* If no errors occurred, we must leave with the same rights we had
+	   upon entry.	If errors occurred, we must leave with no rights.
+	*/
+
+	dm_evt_rele_tevp(tevp, error);
+
+	return(error);
+}
+
+
+/* Generate the destroy event for the vnode and wait until the request has been
+   queued.  The caller does not hold a vnode reference or a right on the vnode,
+   but it must otherwise lock down the vnode such that the filesystem can't
+   disappear while the request is waiting to be queued.	 While waiting to be
+   queued, the vnode must not be referenceable either by path or by a call
+   to dm_handle_to_vp().
+*/
+
+int
+dm_send_destroy_event(
+	bhv_desc_t	*bdp,
+	dm_right_t	vp_right)	/* always DM_RIGHT_NULL */
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokevent_t	*tevp;
+	dm_tokdata_t	*tdp;
+	dm_destroy_event_t *destp;
+	dm_attrname_t	attrname;
+	vnode_t		*vp;
+	char		*value;
+	int		value_len;
+	int		error;
+
+	vp = BHV_TO_VNODE(bdp);
+	tdp = dm_vp_data(vp, vp_right, /* no reference held */ 0);
+	if (tdp == NULL)
+		return ENOMEM;
+
+	if ((error = dm_waitfor_destroy_attrname(vp->v_vfsp, &attrname)) != 0) {
+		kmem_cache_free(dm_tokdata_cachep, tdp);	/* don't leak the tdp */
+		return(error);
+	}
+
+	/* If a return-on-destroy attribute name exists for this filesystem,
+	   see if the object being deleted has this attribute.	If the object
+	   doesn't have the attribute or if we encounter an error, then send
+	   the event without the attribute.
+	 */
+
+	value_len = -1;		/* because zero is a valid attribute length */
+	if (attrname.an_chars[0] != '\0') {
+		fsys_vector = dm_fsys_vector(vp);
+		error = fsys_vector->get_destroy_dmattr(vp, vp_right, &attrname,
+			&value, &value_len);
+		if (error) {
+			kmem_cache_free(dm_tokdata_cachep, tdp); /* don't leak the tdp */
+			return(error);
+		}
+	}
+
+	/* Now that we know the size of the attribute value, if any, calculate
+	   the size of the event in bytes, create an event structure for it,
+	   and insert the handle into the event.
+	*/
+
+	tevp = dm_evt_create_tevp(DM_EVENT_DESTROY,
+		HANDLE_SIZE(tdp) + (value_len >= 0 ? value_len : 0),
+		(void **)&destp);
+	if (tevp == NULL) {
+		kmem_cache_free(dm_tokdata_cachep, tdp);
+		if (value_len > 0)
+			kfree(value);
+		return(ENOMEM);
+	}
+	dm_add_handle_to_event(tevp, tdp);
+
+	/* Now fill in all the dm_destroy_event_t fields. */
+
+	destp->ds_handle.vd_offset = sizeof(*destp);
+	destp->ds_handle.vd_length = HANDLE_SIZE(tdp);
+	bcopy(&tdp->td_handle, (char *)destp + destp->ds_handle.vd_offset,
+		destp->ds_handle.vd_length);
+	if (value_len >= 0) {
+		destp->ds_attrname = attrname;
+		destp->ds_attrcopy.vd_length = value_len;
+		if (value_len == 0) {
+			destp->ds_attrcopy.vd_offset = 0;
+		} else {
+			destp->ds_attrcopy.vd_offset = GETNEXTOFF(destp->ds_handle);
+			bcopy(value, (char *)destp + destp->ds_attrcopy.vd_offset,
+				value_len);
+			kfree(value);
+		}
+	}
+
+	/* Queue the message asynchronously. */
+
+	error = dm_enqueue_normal_event(vp->v_vfsp, tevp, 0);
+
+	/* Since we had no rights upon entry, we have none to reobtain before
+	   leaving.
+	*/
+
+	dm_evt_rele_tevp(tevp, 1);
+
+	return(error);
+}
+
+
+/* The dm_mount_event_t event is sent in turn to all sessions that have asked
+   for it until one either rejects it or accepts it.  The filesystem is not
+   going anywhere because the mount is blocked until the event is answered.
+*/
+
+int
+dm_send_mount_event(
+	vfs_t		*vfsp,		/* filesystem being mounted */
+	dm_right_t	vfsp_right,
+	bhv_desc_t	*bdp,		/* mounted on directory */
+	dm_right_t	vp_right,
+	bhv_desc_t	*rootbdp,
+	dm_right_t	rootvp_right,
+	char		*name1,		/* mount path */
+	char		*name2)		/* filesystem device name */
+{
+	int		error;
+	dm_tokevent_t	*tevp = NULL;
+	dm_tokdata_t	*tdp1 = NULL;	/* filesystem handle for event */
+	dm_tokdata_t	*tdp2 = NULL;	/* file handle for mounted-on dir. */
+	dm_tokdata_t	*tdp3 = NULL;	/* file handle for root vnode */
+	dm_mount_event_t *mp;
+	size_t		nextoff;
+	vnode_t		*vp = NULL; /* mounted on directory */
+	vnode_t		*rootvp = BHV_TO_VNODE(rootbdp);
+
+	/* Convert the vfsp to a filesystem handle, and vp and rootvp into
+	   file handles.  vp (the mounted-on directory) may not have a handle
+	   if it is a different filesystem type such as EFS which does not
+	   support DMAPI.
+	*/
+
+	if (bdp)
+		vp = BHV_TO_VNODE(bdp);
+
+	tdp1 = dm_vfs_data(vfsp, rootvp, vfsp_right);
+	if (tdp1 == NULL)
+		goto out_nomem;
+
+	if ((vp == NULL) || dm_check_dmapi_vp(vp)) {
+		vp = NULL;	/* assume we are mounting on a non-XFS fs */
+	} else {
+		tdp2 = dm_vp_data(vp, vp_right, /* reference held */ 1);
+		if (tdp2 == NULL)
+			goto out_nomem;
+	}
+
+	tdp3 = dm_vp_data(rootvp, rootvp_right, /* reference held */ 1);
+	if (tdp3 == NULL)
+		goto out_nomem;
+
+	/* Calculate the size of the event in bytes, create an event structure
+	   for it, and insert the handles into the event.
+	*/
+
+	tevp = dm_evt_create_tevp(DM_EVENT_MOUNT,
+			HANDLE_SIZE(tdp1) + (vp ? HANDLE_SIZE(tdp2) : 0) +
+			HANDLE_SIZE(tdp3) + strlen(name1) + 1 +
+			strlen(name2) + 1, (void **)&mp);
+	if (tevp == NULL)
+		goto out_nomem;
+
+	dm_add_handle_to_event(tevp, tdp1);
+	if (vp)
+		dm_add_handle_to_event(tevp, tdp2);
+	dm_add_handle_to_event(tevp, tdp3);
+
+	/* Now fill in all the dm_mount_event_t fields. */
+
+	mp->me_handle1.vd_offset = sizeof(*mp);
+	mp->me_handle1.vd_length = HANDLE_SIZE(tdp1);
+	bcopy(&tdp1->td_handle, (char *) mp + mp->me_handle1.vd_offset,
+			mp->me_handle1.vd_length);
+	nextoff = GETNEXTOFF(mp->me_handle1);
+
+	if (vp) {
+		mp->me_handle2.vd_offset = nextoff;
+		mp->me_handle2.vd_length = HANDLE_SIZE(tdp2);
+		bcopy(&tdp2->td_handle, (char *)mp + mp->me_handle2.vd_offset,
+			mp->me_handle2.vd_length);
+		nextoff = GETNEXTOFF(mp->me_handle2);
+	}
+
+	mp->me_name1.vd_offset = nextoff;
+	mp->me_name1.vd_length = strlen(name1) + 1;
+	bcopy(name1, (char *)mp + mp->me_name1.vd_offset, mp->me_name1.vd_length);
+	nextoff = GETNEXTOFF(mp->me_name1);
+
+	mp->me_name2.vd_offset = nextoff;
+	mp->me_name2.vd_length = strlen(name2) + 1;
+	bcopy(name2, (char *)mp + mp->me_name2.vd_offset, mp->me_name2.vd_length);
+	nextoff = GETNEXTOFF(mp->me_name2);
+
+	mp->me_roothandle.vd_offset = nextoff;
+	mp->me_roothandle.vd_length = HANDLE_SIZE(tdp3);
+	bcopy(&tdp3->td_handle, (char *)mp + mp->me_roothandle.vd_offset,
+			mp->me_roothandle.vd_length);
+
+	mp->me_mode = (vfsp->vfs_flag & VFS_RDONLY ? DM_MOUNT_RDONLY : 0);
+
+	/* Queue the message and wait for the reply. */
+
+	error = dm_enqueue_mount_event(vfsp, tevp);
+
+	/* If no errors occurred, we must leave with the same rights we had
+	   upon entry.	If errors occurred, we must leave with no rights.
+	*/
+
+	dm_evt_rele_tevp(tevp, error);
+
+	return(error);
+
+out_nomem:
+	if (tevp)
+		kfree(tevp);
+	if (tdp1)
+		kmem_cache_free(dm_tokdata_cachep, tdp1);
+	if (tdp2)
+		kmem_cache_free(dm_tokdata_cachep, tdp2);
+	if (tdp3)
+		kmem_cache_free(dm_tokdata_cachep, tdp3);
+	return ENOMEM;
+}
+
+
+/* Generate a DM_EVENT_UNMOUNT event and wait for a reply.  The 'retcode'
+   field indicates whether this is a successful or unsuccessful unmount.
+   If successful, the filesystem is already unmounted, and any pending handle
+   reference to the filesystem will be failed.	If the unmount was
+   unsuccessful, then the filesystem will be placed back into full service.
+
+   The DM_EVENT_UNMOUNT event should really be asynchronous, because the
+   application has no control over whether or not the unmount succeeds.	 (The
+   DMAPI spec defined it that way because asynchronous events aren't always
+   guaranteed to be delivered.)
+
+   Since the filesystem is already unmounted in the successful case, the
+   DM_EVENT_UNMOUNT event can't make available any vnode to be used in
+   subsequent sid/hanp/hlen/token calls by the application.  The event will
+   hang around until the application does a DM_RESP_CONTINUE, but the handle
+   within the event is unusable by the application.
+*/
+
+void
+dm_send_unmount_event(
+	vfs_t		*vfsp,
+	vnode_t		*vp,		/* NULL if unmount successful */
+	dm_right_t	vfsp_right,
+	mode_t		mode,
+	int		retcode,	/* errno, if unmount failed */
+	int		flags)
+{
+	dm_namesp_event_t	*np;
+	dm_tokevent_t	*tevp;
+	dm_tokdata_t	*tdp1;
+
+	/* If the unmount failed, put the filesystem back into full service,
+	   allowing blocked handle references to finish.  If it succeeded, put
+	   the filesystem into the DM_STATE_UNMOUNTED state and fail all
+	   blocked DM_NO_TOKEN handle accesses.
+	*/
+
+	if (retcode != 0) {	/* unmount was unsuccessful */
+		dm_change_fsys_entry(vfsp, DM_STATE_MOUNTED);
+	} else {
+		dm_change_fsys_entry(vfsp, DM_STATE_UNMOUNTED);
+	}
+
+	/* If the event wasn't in the filesystem dm_eventset_t, just remove
+	   the filesystem from the list of DMAPI filesystems and return.
+	*/
+
+	if (flags & DM_FLAGS_UNWANTED) {
+		if (retcode == 0)
+			dm_remove_fsys_entry(vfsp);
+		return;
+	}
+
+	/* Calculate the size of the event in bytes and allocate zeroed memory
+	   for it.
+	*/
+
+	tdp1 = dm_vfs_data(vfsp, vp, vfsp_right);
+	if (tdp1 == NULL)
+		return;
+
+	tevp = dm_evt_create_tevp(DM_EVENT_UNMOUNT, HANDLE_SIZE(tdp1),
+		(void **)&np);
+	if (tevp == NULL) {
+		kmem_cache_free(dm_tokdata_cachep, tdp1);
+		return;
+	}
+
+	dm_add_handle_to_event(tevp, tdp1);
+
+	/* Now copy in all the dm_namesp_event_t specific fields. */
+
+	np->ne_handle1.vd_offset = sizeof(*np);
+	np->ne_handle1.vd_length = HANDLE_SIZE(tdp1);
+	bcopy(&tdp1->td_handle, (char *) np + np->ne_handle1.vd_offset,
+			np->ne_handle1.vd_length);
+	np->ne_mode = mode;
+	np->ne_retcode = retcode;
+
+	/* Since DM_EVENT_UNMOUNT is effectively asynchronous, queue the
+	   message and ignore any error return.
+	*/
+
+	(void)dm_enqueue_normal_event(vfsp, tevp, flags);
+
+	if (retcode == 0)
+		dm_remove_fsys_entry(vfsp);
+
+	dm_evt_rele_tevp(tevp, 0);
+}
+
+
+/* Generate the given namespace event and wait for a reply (if synchronous) or
+   until the event has been queued (asynchronous).  The caller must guarantee
+   that at least one vnode within the filesystem has had its reference count
+   bumped so that the filesystem can't disappear while the event is
+   outstanding.
+*/
+
+int
+dm_send_namesp_event(
+	dm_eventtype_t	event,
+	bhv_desc_t	*bdp1,
+	dm_right_t	vp1_right,
+	bhv_desc_t	*bdp2,
+	dm_right_t	vp2_right,
+	char		*name1,
+	char		*name2,
+	mode_t		mode,
+	int		retcode,
+	int		flags)
+{
+	dm_namesp_event_t	*np;
+	dm_tokevent_t	*tevp;
+	dm_tokdata_t	*tdp1 = NULL;	/* primary handle for event */
+	dm_tokdata_t	*tdp2 = NULL;	/* additional handle for event */
+	vfs_t		*sidvfsp;	/* vfs the event must be registered on */
+	size_t		nextoff;
+	int		error;
+	vnode_t		*vp1;
+
+	vp1 = BHV_TO_VNODE(bdp1);
+	sidvfsp = vp1->v_vfsp;
+
+	switch (event) {
+	case DM_EVENT_PREUNMOUNT:
+		/*
+		 *  PREUNMOUNT - Send the file system handle in handle1,
+		 *  and the handle for the root dir in the second.  Otherwise
+		 *  it's a normal sync message; i.e. succeeds or fails
+		 *  depending on the app's return code.
+		 *	bdp1 and bdp2 are both the root dir of mounted FS
+		 *	vp1_right is the filesystem right.
+		 *	vp2_right is the root inode right.
+		 */
+
+		tdp1 = dm_vfs_data(sidvfsp, vp1, vp1_right);
+		if (tdp1 == NULL)
+			return ENOMEM;
+		tdp2 = dm_vp_data(BHV_TO_VNODE(bdp2), vp2_right, /* reference held */ 1);
+		if (tdp2 == NULL) {
+			kmem_cache_free(dm_tokdata_cachep, tdp1);
+			return ENOMEM;
+		}
+
+		if (flags & DM_FLAGS_UNWANTED) {
+			/* free the handles to avoid leaking them */
+			kmem_cache_free(dm_tokdata_cachep, tdp1);
+			kmem_cache_free(dm_tokdata_cachep, tdp2);
+			dm_change_fsys_entry(sidvfsp, DM_STATE_UNMOUNTING);
+			return(0);
+		}
+		break;
+
+	case DM_EVENT_NOSPACE:
+		/* vp1_right is the filesystem right. */
+
+		tdp1 = dm_vfs_data(sidvfsp, vp1, vp1_right);
+		if (tdp1 == NULL)
+			return ENOMEM;
+		tdp2 = dm_vp_data(BHV_TO_VNODE(bdp2), vp2_right, /* reference held */ 1); /* additional info - not in the spec */
+		if (tdp2 == NULL) {
+			kmem_cache_free(dm_tokdata_cachep, tdp1);
+			return ENOMEM;
+		}
+		break;
+
+	default:
+		/* All other events only pass in vnodes and don't require any
+		   special cases.
+		*/
+
+		tdp1 = dm_vp_data(vp1, vp1_right, /* reference held */ 1);
+		if (tdp1 == NULL)
+			return ENOMEM;
+		if (bdp2) {
+			tdp2 = dm_vp_data(BHV_TO_VNODE(bdp2), vp2_right, /* reference held */ 1);
+			if (tdp2 == NULL) {
+				kmem_cache_free(dm_tokdata_cachep, tdp1);
+				return ENOMEM;
+			}
+		}
+	}
+
+	/* Calculate the size of the event in bytes and allocate zeroed memory
+	   for it.
+	*/
+
+	tevp = dm_evt_create_tevp(event,
+		HANDLE_SIZE(tdp1) + (bdp2 ? HANDLE_SIZE(tdp2) : 0) +
+		(name1 ? strlen(name1) + 1 : 0) +
+		(name2 ? strlen(name2) + 1 : 0), (void **)&np);
+	if (tevp == NULL) {
+		if (tdp1)
+			kmem_cache_free(dm_tokdata_cachep, tdp1);
+		if (tdp2)
+			kmem_cache_free(dm_tokdata_cachep, tdp2);
+		return(ENOMEM);
+	}
+
+	dm_add_handle_to_event(tevp, tdp1);
+	if (bdp2)
+		dm_add_handle_to_event(tevp, tdp2);
+
+	/* Now copy in all the dm_namesp_event_t specific fields. */
+
+	np->ne_handle1.vd_offset = sizeof(*np);
+	np->ne_handle1.vd_length = HANDLE_SIZE(tdp1);
+	bcopy(&tdp1->td_handle, (char *) np + np->ne_handle1.vd_offset,
+			np->ne_handle1.vd_length);
+	nextoff = GETNEXTOFF(np->ne_handle1);
+	if (bdp2) {
+		np->ne_handle2.vd_offset = nextoff;
+		np->ne_handle2.vd_length = HANDLE_SIZE(tdp2);
+		bcopy(&tdp2->td_handle, (char *)np + np->ne_handle2.vd_offset,
+				np->ne_handle2.vd_length);
+		nextoff = GETNEXTOFF(np->ne_handle2);
+	}
+	if (name1) {
+		np->ne_name1.vd_offset = nextoff;
+		np->ne_name1.vd_length = strlen(name1) + 1;
+		bcopy(name1, (char *)np + np->ne_name1.vd_offset,
+				np->ne_name1.vd_length);
+		nextoff = GETNEXTOFF(np->ne_name1);
+	}
+	if (name2) {
+		np->ne_name2.vd_offset = nextoff;
+		np->ne_name2.vd_length = strlen(name2) + 1;
+		bcopy(name2, (char *)np + np->ne_name2.vd_offset,
+				np->ne_name2.vd_length);
+	}
+	np->ne_mode = mode;
+	np->ne_retcode = retcode;
+
+	/* Queue the message and wait for the reply. */
+
+	error = dm_enqueue_normal_event(sidvfsp, tevp, flags);
+
+	/* If no errors occurred, we must leave with the same rights we had
+	   upon entry.	If errors occurred, we must leave with no rights.
+	*/
+
+	dm_evt_rele_tevp(tevp, error);
+
+	if (!error && event == DM_EVENT_PREUNMOUNT) {
+		dm_change_fsys_entry(sidvfsp, DM_STATE_UNMOUNTING);
+	}
+
+	return(error);
+}
+
+
+/*
+ *  Send a message of type "DM_EVENT_USER".  Since no vnode is involved, we
+ *  don't have to worry about rights here.
+ */
+
+int
+dm_send_msg(
+	dm_sessid_t	targetsid,
+	dm_msgtype_t	msgtype,		/* SYNC or ASYNC */
+	size_t		buflen,
+	void		*bufp)
+{
+	dm_tokevent_t	*tevp;
+	int		sync;
+	void		*msgp;
+	int		error;
+
+	if (buflen > DM_MAX_MSG_DATA)
+		return(E2BIG);
+	if (msgtype == DM_MSGTYPE_ASYNC) {
+		sync = 0;
+	} else if (msgtype == DM_MSGTYPE_SYNC) {
+		sync = 1;
+	} else {
+		return(EINVAL);
+	}
+
+	tevp = dm_evt_create_tevp(DM_EVENT_USER, buflen, (void **)&msgp);
+	if (tevp == NULL)
+		return ENOMEM;
+
+	if (buflen && copy_from_user(msgp, bufp, buflen)) {
+		dm_evt_rele_tevp(tevp, 0);
+		return(EFAULT);
+	}
+
+	/* Enqueue the request and wait for the reply. */
+
+	error = dm_enqueue_sendmsg_event(targetsid, tevp, sync);
+
+	/* Destroy the tevp and return the reply.  (dm_pending is not
+	   supported here.)
+	*/
+
+	dm_evt_rele_tevp(tevp, error);
+
+	return(error);
+}
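+
+/* Illustrative sketch (not part of this patch): from user space the
+   call above is reached through the libdm wrapper of the same name,
+   e.g. to post an asynchronous message to another session:
+
+	char	msg[] = "backup-done";
+
+	if (dm_send_msg(targetsid, DM_MSGTYPE_ASYNC, sizeof(msg), msg))
+		... handle the error ...
+
+   A DM_MSGTYPE_SYNC message would instead block until the receiving
+   session responds with dm_respond_event().
+*/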
+
+
+/*
+ *  Create a pending message of type "DM_EVENT_USER" and return a token
+ *  for it.  Since no vnode is involved, we don't have to worry about
+ *  rights here.
+ */
+
+int
+dm_create_userevent(
+	dm_sessid_t	sid,
+	size_t		msglen,
+	void		*msgdatap,
+	dm_token_t	*tokenp)		/* return token created */
+{
+	dm_tokevent_t	*tevp;
+	dm_token_t	token;
+	int		error;
+	void		*msgp;
+
+	if (msglen > DM_MAX_MSG_DATA)
+		return(E2BIG);
+
+	tevp = dm_evt_create_tevp(DM_EVENT_USER, msglen, (void **)&msgp);
+	if (tevp == NULL)
+		return(ENOMEM);
+
+	if (msglen && copy_from_user(msgp, msgdatap, msglen)) {
+		dm_evt_rele_tevp(tevp, 0);
+		return(EFAULT);
+	}
+
+	/* Queue the message.  If that didn't work, free the tevp structure. */
+
+	if ((error = dm_enqueue_user_event(sid, tevp, &token)) != 0)
+		dm_evt_rele_tevp(tevp, 0);
+
+	if (!error && copy_to_user(tokenp, &token, sizeof(token)))
+		error = EFAULT;
+
+	return(error);
+}
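+
+/* Illustrative sketch (not part of this patch): an application can use
+   the call above to materialize a token for later use, assuming the
+   standard libdm wrappers:
+
+	dm_token_t	token;
+
+	if (dm_create_userevent(sid, msglen, msgdatap, &token))
+		... handle the error ...
+	... use the token in sid/hanp/hlen/token calls ...
+	dm_respond_event(sid, token, DM_RESP_CONTINUE, 0, 0, NULL);
+*/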
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_handle.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_handle.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_handle.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_handle.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+int
+dm_create_by_handle(
+	dm_sessid_t	sid,
+	void		*dirhanp,
+	size_t		dirhlen,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, dirhanp, dirhlen, token, DM_TDT_DIR,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->create_by_handle(tdp->td_vp, tdp->td_right,
+		hanp, hlen, cname);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_mkdir_by_handle(
+	dm_sessid_t	sid,
+	void		*dirhanp,
+	size_t		dirhlen,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, dirhanp, dirhlen, token, DM_TDT_DIR,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->mkdir_by_handle(tdp->td_vp, tdp->td_right,
+		hanp, hlen, cname);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_symlink_by_handle(
+	dm_sessid_t	sid,
+	void		*dirhanp,
+	size_t		dirhlen,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname,
+	char		*path)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, dirhanp, dirhlen, token, DM_TDT_DIR,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->symlink_by_handle(tdp->td_vp, tdp->td_right,
+		hanp, hlen, cname, path);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_hole.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_hole.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_hole.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_hole.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+int
+dm_get_allocinfo_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	*offp,
+	u_int		nelem,
+	dm_extent_t	*extentp,
+	u_int		*nelemp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->get_allocinfo_rvp(tdp->td_vp, tdp->td_right,
+		offp, nelem, extentp, nelemp, rvp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_probe_hole(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	off,
+	dm_size_t	len,
+	dm_off_t	*roffp,
+	dm_size_t	*rlenp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->probe_hole(tdp->td_vp, tdp->td_right,
+		off, len, roffp, rlenp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_punch_hole(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	off,
+	dm_size_t	len)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->punch_hole(tdp->td_vp, tdp->td_right, off, len);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_io.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_io.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_io.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_io.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+int
+dm_read_invis_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_off_t	off,
+	dm_size_t	len,
+	void		*bufp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->read_invis_rvp(tdp->td_vp, tdp->td_right,
+		off, len, bufp, rvp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_write_invis_rvp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	int		flags,
+	dm_off_t	off,
+	dm_size_t	len,
+	void		*bufp,
+	int		*rvp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->write_invis_rvp(tdp->td_vp, tdp->td_right,
+		flags, off, len, bufp, rvp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_sync_by_handle (
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->sync_by_handle(tdp->td_vp, tdp->td_right);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_get_dioinfo (
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_dioinfo_t	*diop)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->get_dioinfo(tdp->td_vp, tdp->td_right, diop);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_kern.h linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_kern.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_kern.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_kern.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,563 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef __DMAPI_KERN_H__
+#define __DMAPI_KERN_H__
+
+
+union sys_dmapi_uarg {
+	void *p;
+	__u64 u;
+};
+typedef union sys_dmapi_uarg sys_dmapi_u;
+
+struct sys_dmapi_args {
+	sys_dmapi_u uarg1, uarg2, uarg3, uarg4, uarg5, uarg6, uarg7, uarg8,
+		uarg9, uarg10, uarg11;
+};
+typedef struct sys_dmapi_args sys_dmapi_args_t;
+
+#define DM_Uarg(uap,i)	uap->uarg##i.u
+#define DM_Parg(uap,i)	uap->uarg##i.p
+
+#ifdef __KERNEL__
+
+struct xfs_handle_t;
+
+/* The first group of definitions and prototypes define the filesystem's
+   interface into the DMAPI code.
+*/
+
+
+/* Definitions used for the flags field on dm_send_data_event(),
+   dm_send_unmount_event(), and dm_send_namesp_event() calls.
+*/
+
+#define DM_FLAGS_NDELAY		0x001	/* return EAGAIN after dm_pending() */
+#define DM_FLAGS_UNWANTED	0x002	/* event not in fsys dm_eventset_t */
+
+/* Possible code levels reported by dm_code_level(). */
+
+#define DM_CLVL_INIT	0	/* DMAPI prior to X/Open compliance */
+#define DM_CLVL_XOPEN	1	/* X/Open compliant DMAPI */
+
+
+/* Prototypes used outside of the DMI module/directory. */
+
+int		dm_send_data_event(
+			dm_eventtype_t	event,
+			struct bhv_desc *bdp,
+			dm_right_t	vp_right,
+			off_t		off,
+			size_t		len,
+			int		flags);
+
+int		dm_send_destroy_event(
+			struct bhv_desc *bdp,
+			dm_right_t	vp_right);
+
+int		dm_send_mount_event(
+			struct vfs	*vfsp,
+			dm_right_t	vfsp_right,
+			struct bhv_desc *bdp,
+			dm_right_t	vp_right,
+			struct bhv_desc *rootbdp,
+			dm_right_t	rootvp_right,
+			char		*name1,
+			char		*name2);
+
+int		dm_send_namesp_event(
+			dm_eventtype_t	event,
+			struct bhv_desc *bdp1,
+			dm_right_t	vp1_right,
+			struct bhv_desc *bdp2,
+			dm_right_t	vp2_right,
+			char		*name1,
+			char		*name2,
+			mode_t		mode,
+			int		retcode,
+			int		flags);
+
+void		dm_send_unmount_event(
+			struct vfs	*vfsp,
+			struct vnode	*vp,
+			dm_right_t	vfsp_right,
+			mode_t		mode,
+			int		retcode,
+			int		flags);
+
+int		dm_code_level(void);
+
+int		dm_vp_to_handle (
+			struct vnode	*vp,
+			xfs_handle_t	*handlep);
+
+/* The following prototypes and definitions are used by DMAPI as its
+   interface into the filesystem code.	Communication between DMAPI and the
+   filesystem is established as follows:
+   1. DMAPI uses the VFS_DMAPI_FSYS_VECTOR to ask for the addresses
+      of all the functions within the filesystem that it may need to call.
+   2. The filesystem returns an array of function name/address pairs which
+      DMAPI builds into a function vector.
+   The VFS_DMAPI_FSYS_VECTOR call is made only once for a particular
+   filesystem type.  From then on, DMAPI uses its function vector to call the
+   filesystem functions directly.  Functions in the array which DMAPI doesn't
+   recognize are ignored.  A dummy function which returns ENOSYS is used for
+   any function that DMAPI needs but which was not provided by the filesystem.
+   If XFS doesn't recognize the VFS_DMAPI_FSYS_VECTOR, DMAPI assumes that XFS
+   lacks the X/Open support code; in this case DMAPI falls back to the XFS
+   code originally bundled within DMAPI.
+
+   The goal of this interface is to allow incremental changes to be made to
+   both the filesystem and DMAPI while minimizing inter-patch dependencies,
+   and to eventually allow DMAPI to support multiple filesystem types at the
+   same time should that become necessary.
+*/
+
+typedef enum {
+	DM_FSYS_CLEAR_INHERIT		=  0,
+	DM_FSYS_CREATE_BY_HANDLE	=  1,
+	DM_FSYS_DOWNGRADE_RIGHT		=  2,
+	DM_FSYS_GET_ALLOCINFO_RVP	=  3,
+	DM_FSYS_GET_BULKALL_RVP		=  4,
+	DM_FSYS_GET_BULKATTR_RVP	=  5,
+	DM_FSYS_GET_CONFIG		=  6,
+	DM_FSYS_GET_CONFIG_EVENTS	=  7,
+	DM_FSYS_GET_DESTROY_DMATTR	=  8,
+	DM_FSYS_GET_DIOINFO		=  9,
+	DM_FSYS_GET_DIRATTRS_RVP	= 10,
+	DM_FSYS_GET_DMATTR		= 11,
+	DM_FSYS_GET_EVENTLIST		= 12,
+	DM_FSYS_GET_FILEATTR		= 13,
+	DM_FSYS_GET_REGION		= 14,
+	DM_FSYS_GETALL_DMATTR		= 15,
+	DM_FSYS_GETALL_INHERIT		= 16,
+	DM_FSYS_INIT_ATTRLOC		= 17,
+	DM_FSYS_MKDIR_BY_HANDLE		= 18,
+	DM_FSYS_PROBE_HOLE		= 19,
+	DM_FSYS_PUNCH_HOLE		= 20,
+	DM_FSYS_READ_INVIS_RVP		= 21,
+	DM_FSYS_RELEASE_RIGHT		= 22,
+	DM_FSYS_REMOVE_DMATTR		= 23,
+	DM_FSYS_REQUEST_RIGHT		= 24,
+	DM_FSYS_SET_DMATTR		= 25,
+	DM_FSYS_SET_EVENTLIST		= 26,
+	DM_FSYS_SET_FILEATTR		= 27,
+	DM_FSYS_SET_INHERIT		= 28,
+	DM_FSYS_SET_REGION		= 29,
+	DM_FSYS_SYMLINK_BY_HANDLE	= 30,
+	DM_FSYS_SYNC_BY_HANDLE		= 31,
+	DM_FSYS_UPGRADE_RIGHT		= 32,
+	DM_FSYS_WRITE_INVIS_RVP		= 33,
+	DM_FSYS_OBJ_REF_HOLD		= 34,
+	DM_FSYS_MAX			= 35
+} dm_fsys_switch_t;
+
+
+#define DM_FSYS_OBJ	0x1		/* object refers to a fsys handle */
+
+
+/*
+ *  Prototypes for filesystem-specific functions.
+ */
+
+typedef int	(*dm_fsys_clear_inherit_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_attrname_t	*attrnamep);
+
+typedef int	(*dm_fsys_create_by_handle_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			void		*hanp,
+			size_t		hlen,
+			char		*cname);
+
+typedef int	(*dm_fsys_downgrade_right_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type);		/* DM_FSYS_OBJ or zero */
+
+typedef int	(*dm_fsys_get_allocinfo_rvp_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_off_t	*offp,
+			u_int		nelem,
+			dm_extent_t	*extentp,
+			u_int		*nelemp,
+			int		*rvalp);
+
+typedef int	(*dm_fsys_get_bulkall_rvp_t)(
+			vnode_t		*vp,		/* root vnode */
+			dm_right_t	right,
+			u_int		mask,
+			dm_attrname_t	*attrnamep,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvalp);
+
+typedef int	(*dm_fsys_get_bulkattr_rvp_t)(
+			vnode_t		*vp,		/* root vnode */
+			dm_right_t	right,
+			u_int		mask,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvalp);
+
+typedef int	(*dm_fsys_get_config_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_config_t	flagname,
+			dm_size_t	*retvalp);
+
+typedef int	(*dm_fsys_get_config_events_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		nelem,
+			dm_eventset_t	*eventsetp,
+			u_int		*nelemp);
+
+typedef int	(*dm_fsys_get_destroy_dmattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_attrname_t	*attrnamep,
+			char		**valuepp,
+			int		*vlenp);
+
+typedef int	(*dm_fsys_get_dioinfo_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_dioinfo_t	*diop);
+
+typedef int	(*dm_fsys_get_dirattrs_rvp_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		mask,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvalp);
+
+typedef int	(*dm_fsys_get_dmattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_attrname_t	*attrnamep,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp);
+
+typedef int	(*dm_fsys_get_eventlist_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type,
+			u_int		nelem,
+			dm_eventset_t	*eventsetp,	/* in kernel space! */
+			u_int		*nelemp);		/* in kernel space! */
+
+typedef int	(*dm_fsys_get_fileattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		mask,
+			dm_stat_t	*statp);
+
+typedef int	(*dm_fsys_get_region_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		nelem,
+			dm_region_t	*regbufp,
+			u_int		*nelemp);
+
+typedef int	(*dm_fsys_getall_dmattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp);
+
+typedef int	(*dm_fsys_getall_inherit_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		nelem,
+			dm_inherit_t	*inheritbufp,
+			u_int		*nelemp);
+
+typedef int	(*dm_fsys_init_attrloc_t)(
+			vnode_t		*vp,	/* sometimes root vnode */
+			dm_right_t	right,
+			dm_attrloc_t	*locp);
+
+typedef int	(*dm_fsys_mkdir_by_handle_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			void		*hanp,
+			size_t		hlen,
+			char		*cname);
+
+typedef int	(*dm_fsys_probe_hole_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_off_t	off,
+			dm_size_t	len,
+			dm_off_t	*roffp,
+			dm_size_t	*rlenp);
+
+typedef int	(*dm_fsys_punch_hole_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_off_t	off,
+			dm_size_t	len);
+
+typedef int	(*dm_fsys_read_invis_rvp_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_off_t	off,
+			dm_size_t	len,
+			void		*bufp,
+			int		*rvp);
+
+typedef int	(*dm_fsys_release_right_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type);
+
+typedef int	(*dm_fsys_remove_dmattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			int		setdtime,
+			dm_attrname_t	*attrnamep);
+
+typedef int	(*dm_fsys_request_right_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type,	/* DM_FSYS_OBJ or zero */
+			u_int		flags,
+			dm_right_t	newright);
+
+typedef int	(*dm_fsys_set_dmattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_attrname_t	*attrnamep,
+			int		setdtime,
+			size_t		buflen,
+			void		*bufp);
+
+typedef int	(*dm_fsys_set_eventlist_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type,
+			dm_eventset_t	*eventsetp,	/* in kernel space! */
+			u_int		maxevent);
+
+typedef int	(*dm_fsys_set_fileattr_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		mask,
+			dm_fileattr_t	*attrp);
+
+typedef int	(*dm_fsys_set_inherit_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			dm_attrname_t	*attrnamep,
+			mode_t		mode);
+
+typedef int	(*dm_fsys_set_region_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		nelem,
+			dm_region_t	*regbufp,
+			dm_boolean_t	*exactflagp);
+
+typedef int	(*dm_fsys_symlink_by_handle_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			void		*hanp,
+			size_t		hlen,
+			char		*cname,
+			char		*path);
+
+typedef int	(*dm_fsys_sync_by_handle_t)(
+			vnode_t		*vp,
+			dm_right_t	right);
+
+typedef int	(*dm_fsys_upgrade_right_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			u_int		type);		/* DM_FSYS_OBJ or zero */
+
+typedef int	(*dm_fsys_write_invis_rvp_t)(
+			vnode_t		*vp,
+			dm_right_t	right,
+			int		flags,
+			dm_off_t	off,
+			dm_size_t	len,
+			void		*bufp,
+			int		*rvp);
+
+typedef void	(*dm_fsys_obj_ref_hold_t)(
+			vnode_t		*vp);
+
+
+/* Structure definitions used by the VFS_DMAPI_FSYS_VECTOR call. */
+
+typedef struct {
+	dm_fsys_switch_t  func_no;	/* function number */
+	union {
+		dm_fsys_clear_inherit_t clear_inherit;
+		dm_fsys_create_by_handle_t create_by_handle;
+		dm_fsys_downgrade_right_t downgrade_right;
+		dm_fsys_get_allocinfo_rvp_t get_allocinfo_rvp;
+		dm_fsys_get_bulkall_rvp_t get_bulkall_rvp;
+		dm_fsys_get_bulkattr_rvp_t get_bulkattr_rvp;
+		dm_fsys_get_config_t get_config;
+		dm_fsys_get_config_events_t get_config_events;
+		dm_fsys_get_destroy_dmattr_t get_destroy_dmattr;
+		dm_fsys_get_dioinfo_t get_dioinfo;
+		dm_fsys_get_dirattrs_rvp_t get_dirattrs_rvp;
+		dm_fsys_get_dmattr_t get_dmattr;
+		dm_fsys_get_eventlist_t get_eventlist;
+		dm_fsys_get_fileattr_t get_fileattr;
+		dm_fsys_get_region_t get_region;
+		dm_fsys_getall_dmattr_t getall_dmattr;
+		dm_fsys_getall_inherit_t getall_inherit;
+		dm_fsys_init_attrloc_t init_attrloc;
+		dm_fsys_mkdir_by_handle_t mkdir_by_handle;
+		dm_fsys_probe_hole_t probe_hole;
+		dm_fsys_punch_hole_t punch_hole;
+		dm_fsys_read_invis_rvp_t read_invis_rvp;
+		dm_fsys_release_right_t release_right;
+		dm_fsys_remove_dmattr_t remove_dmattr;
+		dm_fsys_request_right_t request_right;
+		dm_fsys_set_dmattr_t set_dmattr;
+		dm_fsys_set_eventlist_t set_eventlist;
+		dm_fsys_set_fileattr_t set_fileattr;
+		dm_fsys_set_inherit_t set_inherit;
+		dm_fsys_set_region_t set_region;
+		dm_fsys_symlink_by_handle_t symlink_by_handle;
+		dm_fsys_sync_by_handle_t sync_by_handle;
+		dm_fsys_upgrade_right_t upgrade_right;
+		dm_fsys_write_invis_rvp_t write_invis_rvp;
+		dm_fsys_obj_ref_hold_t obj_ref_hold;
+	} u_fc;
+} fsys_function_vector_t;
+
+struct dm_fcntl_vector {
+	int	code_level;
+	int	count;		/* Number of functions in the vector */
+	fsys_function_vector_t *vecp;
+};
+typedef struct dm_fcntl_vector dm_fcntl_vector_t;
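+
+/* A minimal sketch of the filesystem side of the handshake described above.
+   The handler name and the "example_*" functions are hypothetical (they
+   would have the dm_fsys_get_fileattr_t and dm_fsys_punch_hole_t
+   signatures); the real XFS entries and vfs-op wiring may differ.  Entries
+   DMAPI does not recognize are ignored, and anything the filesystem omits
+   keeps DMAPI's ENOSYS stub:
+
+	static fsys_function_vector_t example_vec[2];
+
+	int example_dmapi_fsys_vector(dm_fcntl_vector_t *vecrq)
+	{
+		example_vec[0].func_no = DM_FSYS_GET_FILEATTR;
+		example_vec[0].u_fc.get_fileattr = example_get_fileattr;
+		example_vec[1].func_no = DM_FSYS_PUNCH_HOLE;
+		example_vec[1].u_fc.punch_hole = example_punch_hole;
+
+		vecrq->code_level = DM_CLVL_XOPEN;
+		vecrq->count = 2;
+		vecrq->vecp = example_vec;
+		return 0;
+	}
+*/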
+
+struct dm_fcntl_mapevent {
+	size_t	length;			/* length of transfer */
+	dm_eventtype_t	max_event;	/* Maximum (WRITE or READ) event */
+	int	error;			/* returned error code */
+};
+typedef struct dm_fcntl_mapevent dm_fcntl_mapevent_t;
+
+#endif	/* __KERNEL__ */
+
+
+/* The following definitions are needed both by the kernel and by the
+   library routines.
+*/
+
+#define DM_MAX_HANDLE_SIZE	56	/* maximum size for a file handle */
+
+
+/*
+ *  Opcodes for dmapi ioctl.
+ */
+
+#define DM_CLEAR_INHERIT	1
+#define DM_CREATE_BY_HANDLE	2
+#define DM_CREATE_SESSION	3
+#define DM_CREATE_USEREVENT	4
+#define DM_DESTROY_SESSION	5
+#define DM_DOWNGRADE_RIGHT	6
+#define DM_FD_TO_HANDLE		7
+#define DM_FIND_EVENTMSG	8
+#define DM_GET_ALLOCINFO	9
+#define DM_GET_BULKALL		10
+#define DM_GET_BULKATTR		11
+#define DM_GET_CONFIG		12
+#define DM_GET_CONFIG_EVENTS	13
+#define DM_GET_DIOINFO		14
+#define DM_GET_DIRATTRS		15
+#define DM_GET_DMATTR		16
+#define DM_GET_EVENTLIST	17
+#define DM_GET_EVENTS		18
+#define DM_GET_FILEATTR		19
+#define DM_GET_MOUNTINFO	20
+#define DM_GET_REGION		21
+#define DM_GETALL_DISP		22
+#define DM_GETALL_DMATTR	23
+#define DM_GETALL_INHERIT	24
+#define DM_GETALL_SESSIONS	25
+#define DM_GETALL_TOKENS	26
+#define DM_INIT_ATTRLOC		27
+#define DM_MKDIR_BY_HANDLE	28
+#define DM_MOVE_EVENT		29
+#define DM_OBJ_REF_HOLD		30
+#define DM_OBJ_REF_QUERY	31
+#define DM_OBJ_REF_RELE		32
+#define DM_PATH_TO_FSHANDLE	33
+#define DM_PATH_TO_HANDLE	34
+#define DM_PENDING		35
+#define DM_PROBE_HOLE		36
+#define DM_PUNCH_HOLE		37
+#define DM_QUERY_RIGHT		38
+#define DM_QUERY_SESSION	39
+#define DM_READ_INVIS		40
+#define DM_RELEASE_RIGHT	41
+#define DM_REMOVE_DMATTR	42
+#define DM_REQUEST_RIGHT	43
+#define DM_RESPOND_EVENT	44
+#define DM_SEND_MSG		45
+#define DM_SET_DISP		46
+#define DM_SET_DMATTR		47
+#define DM_SET_EVENTLIST	48
+#define DM_SET_FILEATTR		49
+#define DM_SET_INHERIT		50
+#define DM_SET_REGION		51
+#define DM_SET_RETURN_ON_DESTROY 52
+#define DM_SYMLINK_BY_HANDLE	53
+#define DM_SYNC_BY_HANDLE	54
+#define DM_UPGRADE_RIGHT	55
+#define DM_WRITE_INVIS		56
+#define DM_OPEN_BY_HANDLE	57
+
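+/* A user-space sketch of how a library routine might marshal its arguments
+   into sys_dmapi_args and issue one of the opcodes above.  The argument
+   ordering (taken from the dm_query_right() kernel prototype), the "fd"
+   opened on the dmapi device, and the error convention are illustrative
+   assumptions, not the actual library implementation:
+
+	sys_dmapi_args_t args, *uap = &args;
+
+	memset(&args, 0, sizeof(args));
+	DM_Uarg(uap, 1) = sid;
+	DM_Parg(uap, 2) = hanp;
+	DM_Uarg(uap, 3) = hlen;
+	DM_Uarg(uap, 4) = token;
+	DM_Parg(uap, 5) = rightp;
+	if (ioctl(fd, DM_QUERY_RIGHT, &args) < 0)
+		return errno;
+*/
+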
+#endif /* __DMAPI_KERN_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_mountinfo.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_mountinfo.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_mountinfo.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_mountinfo.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+#ifdef linux
+/* XXX */
+#define vfsmax 1
+#endif
+
+typedef struct	{
+	int	support_type;
+	char	name[16];
+	dm_fsys_vector_t	*vptr;
+} dm_vector_map_t;
+
+/* Values for the support_type field. */
+
+#define DM_SUPPORT_UNKNOWN	0
+#define DM_SUPPORT_AVAIL	1
+
+
+dm_vector_map_t *dm_fsys_map = NULL;
+
+
+int
+dm_code_level(void)
+{
+	return(DM_CLVL_XOPEN);	/* initial X/Open compliant release */
+}
+
+
+/* Dummy routine which is stored in each function vector slot for which the
+   filesystem provides no function of its own.  If an application calls the
+   function, it simply gets ENOSYS.
+*/
+
+static int
+dm_enosys(void)
+{
+	return(ENOSYS);		/* function not supported by filesystem */
+}
+
+
+/* dm_query_fsys_for_vector() asks a filesystem for its list of supported
+   DMAPI functions, and builds a dm_vector_map_t structure based upon the
+   reply.  We ignore functions supported by the filesystem which we do not
+   know about, and we substitute the subroutine 'dm_enosys' for each function
+   we know about but the filesystem does not support.
+*/
+
+static void
+dm_query_fsys_for_vector(
+	vnode_t		*vp)
+{
+	dm_vector_map_t *map;
+	fsys_function_vector_t	*vecp;
+	dm_fsys_vector_t  *vptr;
+	dm_fcntl_vector_t	vecrq;
+	struct vfs	*vfsp = vp->v_vfsp;
+	int		fstype;
+	int		error;
+	int		i;
+
+	/* XXX fstype = vfsp->vfs_fstype */
+	fstype = 0;
+	map = &dm_fsys_map[fstype];
+
+	/* Clear out any information left from a previous filesystem that was
+	   in this slot and initialize it for the new filesystem.
+	*/
+
+	if (map->vptr) {
+		kfree(map->vptr);
+		map->vptr = NULL;
+	}
+
+	/* XXX	strcpy(map->name, vfssw[fstype].vsw_name); */
+	strcpy(map->name, XFS_NAME);
+
+	map->support_type = DM_SUPPORT_AVAIL;
+
+	/* Next allocate a function vector and initialize all fields with a
+	   dummy function that returns ENOSYS.
+	*/
+
+	vptr = map->vptr = kmalloc(sizeof(*map->vptr), GFP_KERNEL);
+	if (vptr == NULL) {
+		printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+		return;
+	}
+
+	strncpy(vptr->fsys_name, map->name, sizeof(vptr->fsys_name));
+	vptr->code_level = 0;
+	vptr->clear_inherit = (dm_fsys_clear_inherit_t)dm_enosys;
+	vptr->create_by_handle = (dm_fsys_create_by_handle_t)dm_enosys;
+	vptr->downgrade_right = (dm_fsys_downgrade_right_t)dm_enosys;
+	vptr->get_allocinfo_rvp = (dm_fsys_get_allocinfo_rvp_t)dm_enosys;
+	vptr->get_bulkall_rvp = (dm_fsys_get_bulkall_rvp_t)dm_enosys;
+	vptr->get_bulkattr_rvp = (dm_fsys_get_bulkattr_rvp_t)dm_enosys;
+	vptr->get_config = (dm_fsys_get_config_t)dm_enosys;
+	vptr->get_config_events = (dm_fsys_get_config_events_t)dm_enosys;
+	vptr->get_destroy_dmattr = (dm_fsys_get_destroy_dmattr_t)dm_enosys;
+	vptr->get_dioinfo = (dm_fsys_get_dioinfo_t)dm_enosys;
+	vptr->get_dirattrs_rvp = (dm_fsys_get_dirattrs_rvp_t)dm_enosys;
+	vptr->get_dmattr = (dm_fsys_get_dmattr_t)dm_enosys;
+	vptr->get_eventlist = (dm_fsys_get_eventlist_t)dm_enosys;
+	vptr->get_fileattr = (dm_fsys_get_fileattr_t)dm_enosys;
+	vptr->get_region = (dm_fsys_get_region_t)dm_enosys;
+	vptr->getall_dmattr = (dm_fsys_getall_dmattr_t)dm_enosys;
+	vptr->getall_inherit = (dm_fsys_getall_inherit_t)dm_enosys;
+	vptr->init_attrloc = (dm_fsys_init_attrloc_t)dm_enosys;
+	vptr->mkdir_by_handle = (dm_fsys_mkdir_by_handle_t)dm_enosys;
+	vptr->probe_hole = (dm_fsys_probe_hole_t)dm_enosys;
+	vptr->punch_hole = (dm_fsys_punch_hole_t)dm_enosys;
+	vptr->read_invis_rvp = (dm_fsys_read_invis_rvp_t)dm_enosys;
+	vptr->release_right = (dm_fsys_release_right_t)dm_enosys;
+	vptr->request_right = (dm_fsys_request_right_t)dm_enosys;
+	vptr->remove_dmattr = (dm_fsys_remove_dmattr_t)dm_enosys;
+	vptr->set_dmattr = (dm_fsys_set_dmattr_t)dm_enosys;
+	vptr->set_eventlist = (dm_fsys_set_eventlist_t)dm_enosys;
+	vptr->set_fileattr = (dm_fsys_set_fileattr_t)dm_enosys;
+	vptr->set_inherit = (dm_fsys_set_inherit_t)dm_enosys;
+	vptr->set_region = (dm_fsys_set_region_t)dm_enosys;
+	vptr->symlink_by_handle = (dm_fsys_symlink_by_handle_t)dm_enosys;
+	vptr->sync_by_handle = (dm_fsys_sync_by_handle_t)dm_enosys;
+	vptr->upgrade_right = (dm_fsys_upgrade_right_t)dm_enosys;
+	vptr->write_invis_rvp = (dm_fsys_write_invis_rvp_t)dm_enosys;
+	vptr->obj_ref_hold = (dm_fsys_obj_ref_hold_t)dm_enosys;
+
+	/* Issue a VFS_DMAPI_FSYS_VECTOR to the filesystem in order to obtain
+	   its vector of filesystem-specific DMAPI routines.
+	*/
+
+	vecrq.count = 0;
+	vecrq.vecp = NULL;
+
+	VFS_DMAPI_FSYS_VECTOR(vfsp, &vecrq, error);
+
+	/* If the call failed or returned no functions, then the filesystem
+	   simply does not support DMAPI, so we give up with all functions
+	   still set to ENOSYS.
+	*/
+
+	if (error || vecrq.count == 0)
+		return;
+
+	/* The request succeeded and we were given a vector which we need to
+	   map to our current level.  Overlay the dummy function with every
+	   filesystem function we understand.
+	*/
+
+	vptr->code_level = vecrq.code_level;
+	vecp = vecrq.vecp;
+	for (i = 0; i < vecrq.count; i++) {
+		switch (vecp[i].func_no) {
+		case DM_FSYS_CLEAR_INHERIT:
+			vptr->clear_inherit = vecp[i].u_fc.clear_inherit;
+			break;
+		case DM_FSYS_CREATE_BY_HANDLE:
+			vptr->create_by_handle = vecp[i].u_fc.create_by_handle;
+			break;
+		case DM_FSYS_DOWNGRADE_RIGHT:
+			vptr->downgrade_right = vecp[i].u_fc.downgrade_right;
+			break;
+		case DM_FSYS_GET_ALLOCINFO_RVP:
+			vptr->get_allocinfo_rvp = vecp[i].u_fc.get_allocinfo_rvp;
+			break;
+		case DM_FSYS_GET_BULKALL_RVP:
+			vptr->get_bulkall_rvp = vecp[i].u_fc.get_bulkall_rvp;
+			break;
+		case DM_FSYS_GET_BULKATTR_RVP:
+			vptr->get_bulkattr_rvp = vecp[i].u_fc.get_bulkattr_rvp;
+			break;
+		case DM_FSYS_GET_CONFIG:
+			vptr->get_config = vecp[i].u_fc.get_config;
+			break;
+		case DM_FSYS_GET_CONFIG_EVENTS:
+			vptr->get_config_events = vecp[i].u_fc.get_config_events;
+			break;
+		case DM_FSYS_GET_DESTROY_DMATTR:
+			vptr->get_destroy_dmattr = vecp[i].u_fc.get_destroy_dmattr;
+			break;
+		case DM_FSYS_GET_DIOINFO:
+			vptr->get_dioinfo = vecp[i].u_fc.get_dioinfo;
+			break;
+		case DM_FSYS_GET_DIRATTRS_RVP:
+			vptr->get_dirattrs_rvp = vecp[i].u_fc.get_dirattrs_rvp;
+			break;
+		case DM_FSYS_GET_DMATTR:
+			vptr->get_dmattr = vecp[i].u_fc.get_dmattr;
+			break;
+		case DM_FSYS_GET_EVENTLIST:
+			vptr->get_eventlist = vecp[i].u_fc.get_eventlist;
+			break;
+		case DM_FSYS_GET_FILEATTR:
+			vptr->get_fileattr = vecp[i].u_fc.get_fileattr;
+			break;
+		case DM_FSYS_GET_REGION:
+			vptr->get_region = vecp[i].u_fc.get_region;
+			break;
+		case DM_FSYS_GETALL_DMATTR:
+			vptr->getall_dmattr = vecp[i].u_fc.getall_dmattr;
+			break;
+		case DM_FSYS_GETALL_INHERIT:
+			vptr->getall_inherit = vecp[i].u_fc.getall_inherit;
+			break;
+		case DM_FSYS_INIT_ATTRLOC:
+			vptr->init_attrloc = vecp[i].u_fc.init_attrloc;
+			break;
+		case DM_FSYS_MKDIR_BY_HANDLE:
+			vptr->mkdir_by_handle = vecp[i].u_fc.mkdir_by_handle;
+			break;
+		case DM_FSYS_PROBE_HOLE:
+			vptr->probe_hole = vecp[i].u_fc.probe_hole;
+			break;
+		case DM_FSYS_PUNCH_HOLE:
+			vptr->punch_hole = vecp[i].u_fc.punch_hole;
+			break;
+		case DM_FSYS_READ_INVIS_RVP:
+			vptr->read_invis_rvp = vecp[i].u_fc.read_invis_rvp;
+			break;
+		case DM_FSYS_RELEASE_RIGHT:
+			vptr->release_right = vecp[i].u_fc.release_right;
+			break;
+		case DM_FSYS_REMOVE_DMATTR:
+			vptr->remove_dmattr = vecp[i].u_fc.remove_dmattr;
+			break;
+		case DM_FSYS_REQUEST_RIGHT:
+			vptr->request_right = vecp[i].u_fc.request_right;
+			break;
+		case DM_FSYS_SET_DMATTR:
+			vptr->set_dmattr = vecp[i].u_fc.set_dmattr;
+			break;
+		case DM_FSYS_SET_EVENTLIST:
+			vptr->set_eventlist = vecp[i].u_fc.set_eventlist;
+			break;
+		case DM_FSYS_SET_FILEATTR:
+			vptr->set_fileattr = vecp[i].u_fc.set_fileattr;
+			break;
+		case DM_FSYS_SET_INHERIT:
+			vptr->set_inherit = vecp[i].u_fc.set_inherit;
+			break;
+		case DM_FSYS_SET_REGION:
+			vptr->set_region = vecp[i].u_fc.set_region;
+			break;
+		case DM_FSYS_SYMLINK_BY_HANDLE:
+			vptr->symlink_by_handle = vecp[i].u_fc.symlink_by_handle;
+			break;
+		case DM_FSYS_SYNC_BY_HANDLE:
+			vptr->sync_by_handle = vecp[i].u_fc.sync_by_handle;
+			break;
+		case DM_FSYS_UPGRADE_RIGHT:
+			vptr->upgrade_right = vecp[i].u_fc.upgrade_right;
+			break;
+		case DM_FSYS_WRITE_INVIS_RVP:
+			vptr->write_invis_rvp = vecp[i].u_fc.write_invis_rvp;
+			break;
+		case DM_FSYS_OBJ_REF_HOLD:
+			vptr->obj_ref_hold = vecp[i].u_fc.obj_ref_hold;
+			break;
+		default:		/* ignore ones we don't understand */
+			break;
+		}
+	}
+}
+
+
+/* Given a vnode pointer, dm_fsys_vector() returns a pointer to the DMAPI
+   function vector to be used for the corresponding vnode.  There is one possible
+   function vector for each filesystem type, although currently XFS is the
+   only filesystem that actually supports DMAPI.
+*/
+
+dm_fsys_vector_t *
+dm_fsys_vector(
+	vnode_t		*vp)
+{
+	dm_vector_map_t *map;
+	int		fstype;
+
+	/* XXX fstype = vp->v_vfsp->vfs_fstype */
+	fstype = 0;
+
+	/* If this is the first call, initialize the filesystem function
+	   vector map.
+	*/
+
+	if (dm_fsys_map == NULL) {
+		int	size = vfsmax * sizeof(*dm_fsys_map);
+
+		dm_fsys_map = (dm_vector_map_t *)kmalloc(size, GFP_KERNEL);
+		if (dm_fsys_map == NULL) {
+			printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+			return NULL;
+		}
+		memset(dm_fsys_map, 0, size);
+	}
+	map = &dm_fsys_map[fstype];
+
+	/* If a new filesystem has been dynamically loaded into a slot
+	   previously held by another filesystem, then treat it as a
+	   DM_SUPPORT_UNKNOWN.
+	*/
+
+	/* XXX if (strcmp(map->name, vfssw[fstype].vsw_name)) */
+	if (strcmp(map->name, XFS_NAME))
+		map->support_type = DM_SUPPORT_UNKNOWN;
+
+	/* If we don't yet know what the filesystem supports, ask it. */
+
+	if (map->support_type == DM_SUPPORT_UNKNOWN)
+		dm_query_fsys_for_vector(vp);
+
+	/* Now return the function vector. */
+
+	return(map->vptr);
+}
+
+
+void
+dm_fsys_vector_free(void)
+{
+	dm_vector_map_t *map;
+	int i;
+
+	if (dm_fsys_map) {
+		for (i = 0; i < vfsmax; i++){
+			map = &dm_fsys_map[i];
+			if (map->vptr)
+				kfree(map->vptr);
+		}
+		kfree(dm_fsys_map);
+	}
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_private.h linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_private.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_private.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_private.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef _DMAPI_PRIVATE_H
+#define _DMAPI_PRIVATE_H
+
+#include "xfs.h"
+
+#ifdef CONFIG_PROC_FS
+#define DMAPI_PROCFS		"fs/xfs_dmapi_v2" /* DMAPI device in /proc. */
+#define DMAPI_DBG_PROCFS	"fs/xfs_dmapi_d" /* DMAPI debugging dir */
+#endif
+
+extern struct kmem_cache_s	*dm_fsreg_cachep;
+extern struct kmem_cache_s	*dm_tokdata_cachep;
+extern struct kmem_cache_s	*dm_session_cachep;
+
+
+
+typedef struct dm_tokdata {
+	struct dm_tokdata *td_next;
+	struct dm_tokevent *td_tevp;	/* pointer to owning tevp */
+	int		td_app_ref;	/* # app threads currently active */
+	dm_right_t	td_orig_right;	/* original right held when created */
+	dm_right_t	td_right;	/* current right held for this handle */
+	short		td_flags;
+	short		td_type;	/* object type */
+	int		td_vcount;	/* # of current application VN_HOLDs */
+	vnode_t		*td_vp;		/* vnode pointer */
+	xfs_handle_t	td_handle;	/* handle for vp or vfsp */
+} dm_tokdata_t;
+
+/* values for td_type */
+
+#define DM_TDT_NONE	0x00		/* td_handle is empty */
+#define DM_TDT_VFS	0x01		/* td_handle points to a vfs */
+#define DM_TDT_REG	0x02		/* td_handle points to a file */
+#define DM_TDT_DIR	0x04		/* td_handle points to a directory */
+#define DM_TDT_LNK	0x08		/* td_handle points to a symlink */
+#define DM_TDT_OTH	0x10		/* some other object eg. pipe, socket */
+
+#define DM_TDT_VNO	(DM_TDT_REG|DM_TDT_DIR|DM_TDT_LNK|DM_TDT_OTH)
+#define DM_TDT_ANY	(DM_TDT_VFS|DM_TDT_REG|DM_TDT_DIR|DM_TDT_LNK|DM_TDT_OTH)
+
+/* values for td_flags */
+
+#define DM_TDF_ORIG	0x0001		/* part of the original event */
+#define DM_TDF_EVTREF	0x0002		/* event thread holds vnode reference */
+#define DM_TDF_STHREAD	0x0004		/* only one app can use this handle */
+#define DM_TDF_RIGHT	0x0008		/* vcount bumped for dm_request_right */
+#define DM_TDF_HOLD	0x0010		/* vcount bumped for dm_obj_ref_hold */
+
+
+/* Because some events contain __u64 fields, we force te_msg and te_event
+   to always be 8-byte aligned.  In order to send more than one message in
+   a single dm_get_events() call, we also ensure that each message's length
+   is rounded up to a multiple of 8 bytes.
+*/
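+
+/* Concretely: a message whose raw length is 52 bytes would be padded to 56
+   ((52 + 7) & ~7 == 56) before the next message is appended, so every
+   te_msg in a multi-message dm_get_events() buffer stays 8-byte aligned.
+   (Illustrative arithmetic; the rounding happens wherever messages are
+   packed into the user's buffer.)
+*/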
+
+typedef struct dm_tokevent {
+	struct dm_tokevent  *te_next;
+	struct dm_tokevent  *te_hashnext; /* hash chain */
+	lock_t		te_lock;	/* lock for all fields but te_*next.
+					 * te_next and te_hashnext are
+					 * protected by the session lock.
+					 */
+	short		te_flags;
+	short		te_allocsize;	/* alloc'ed size of this structure */
+	sv_t		te_evt_queue;	/* queue waiting for dm_respond_event */
+	sv_t		te_app_queue;	/* queue waiting for handle access */
+	int		te_evt_ref;	/* number of event procs using token */
+	int		te_app_ref;	/* number of app procs using token */
+	int		te_app_slp;	/* number of app procs sleeping */
+	int		te_reply;	/* return errno for sync messages */
+	dm_tokdata_t	*te_tdp;	/* list of handle/right pairs */
+	union {
+		__u64		align;	/* force alignment of te_msg */
+		dm_eventmsg_t	te_msg;		/* user visible part */
+	} te_u;
+	__u64		te_event;	/* start of dm_xxx_event_t message */
+} dm_tokevent_t;
+
+#define te_msg	te_u.te_msg
+
+/* values for te_flags */
+
+#define DM_TEF_LOCKED	0x0001		/* event "locked" by dm_get_events() */
+#define DM_TEF_INTERMED 0x0002		/* a dm_pending reply was received */
+#define DM_TEF_FINAL	0x0004		/* dm_respond_event has been received */
+#ifdef __sgi
+#define DM_TEF_HASHED	0x0010		/* event is on hash chain  */
+#endif
+
+
+#ifdef __sgi
+#ifdef DEBUG
+#define DM_SHASH_DEBUG
+#endif
+
+typedef struct dm_sesshash {
+	dm_tokevent_t	*h_next;	/* ptr to chain of tokevents */
+#ifdef DM_SHASH_DEBUG
+	int		maxlength;
+	int		curlength;
+	int		num_adds;
+	int		num_dels;
+	int		dup_hits;
+#endif
+} dm_sesshash_t;
+#endif
+
+
+typedef struct dm_eventq {
+	dm_tokevent_t	*eq_head;
+	dm_tokevent_t	*eq_tail;
+	int		eq_count;	/* size of queue */
+} dm_eventq_t;
+
+
+typedef struct dm_session {
+	struct dm_session	*sn_next;	/* sessions linkage */
+	dm_sessid_t	sn_sessid;	/* user-visible session number */
+	u_int		sn_flags;
+	lock_t		sn_qlock;	/* lock for newq/delq related fields */
+	sv_t		sn_readerq;	/* waiting for message on sn_newq */
+	sv_t		sn_writerq;	/* waiting for room on sn_newq */
+	u_int		sn_readercnt;	/* count of waiting readers */
+	u_int		sn_writercnt;	/* count of waiting writers */
+	dm_eventq_t	sn_newq;	/* undelivered event queue */
+	dm_eventq_t	sn_delq;	/* delivered event queue */
+	dm_eventq_t	sn_evt_writerq; /* events of thrds in sn_writerq */
+#ifdef __sgi
+	dm_sesshash_t	*sn_sesshash;	/* buckets for tokevent hash chains */
+#ifdef DM_SHASH_DEBUG
+	int		sn_buckets_in_use;
+	int		sn_max_buckets_in_use;
+#endif
+#endif
+	char		sn_info[DM_SESSION_INFO_LEN];	/* user-supplied info */
+} dm_session_t;
+
+/* values for sn_flags */
+
+#define DM_SN_WANTMOUNT 0x0001		/* session wants to get mount events */
+
+
+typedef enum {
+	DM_STATE_MOUNTING,
+	DM_STATE_MOUNTED,
+	DM_STATE_UNMOUNTING,
+	DM_STATE_UNMOUNTED
+} dm_fsstate_t;
+
+
+typedef struct dm_fsreg {
+	struct dm_fsreg *fr_next;
+	vfs_t		*fr_vfsp;	/* filesystem pointer */
+	dm_tokevent_t	*fr_tevp;
+	fsid_t		fr_fsid;	/* filesystem ID */
+	void		*fr_msg;	/* dm_mount_event_t for filesystem */
+	int		fr_msgsize;	/* size of dm_mount_event_t */
+	dm_fsstate_t	fr_state;
+	sv_t		fr_dispq;
+	int		fr_dispcnt;
+	dm_eventq_t	fr_evt_dispq;	/* events of thrds in fr_dispq */
+	sv_t		fr_queue;	/* queue for hdlcnt/vfscnt/unmount */
+	lock_t		fr_lock;
+	int		fr_hdlcnt;	/* threads blocked during unmount */
+	int		fr_vfscnt;	/* threads in VFS_VGET or VFS_ROOT */
+	int		fr_unmount;	/* if non-zero, umount is sleeping */
+	dm_attrname_t	fr_rattr;	/* dm_set_return_on_destroy attribute */
+	dm_session_t	*fr_sessp [DM_EVENT_MAX];
+} dm_fsreg_t;
+
+
+
+
+/* events valid in dm_set_disp() when called with a filesystem handle. */
+
+#define DM_VALID_DISP_EVENTS		( \
+	(1 << DM_EVENT_PREUNMOUNT)	| \
+	(1 << DM_EVENT_UNMOUNT)		| \
+	(1 << DM_EVENT_NOSPACE)		| \
+	(1 << DM_EVENT_DEBUT)		| \
+	(1 << DM_EVENT_CREATE)		| \
+	(1 << DM_EVENT_POSTCREATE)	| \
+	(1 << DM_EVENT_REMOVE)		| \
+	(1 << DM_EVENT_POSTREMOVE)	| \
+	(1 << DM_EVENT_RENAME)		| \
+	(1 << DM_EVENT_POSTRENAME)	| \
+	(1 << DM_EVENT_LINK)		| \
+	(1 << DM_EVENT_POSTLINK)	| \
+	(1 << DM_EVENT_SYMLINK)		| \
+	(1 << DM_EVENT_POSTSYMLINK)	| \
+	(1 << DM_EVENT_READ)		| \
+	(1 << DM_EVENT_WRITE)		| \
+	(1 << DM_EVENT_TRUNCATE)	| \
+	(1 << DM_EVENT_ATTRIBUTE)	| \
+	(1 << DM_EVENT_DESTROY)		)
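+
+/* For example, the dm_set_disp() path can presumably reject an unsupported
+   event set with a single test (a sketch, not necessarily the exact kernel
+   code):
+
+	if (*eventsetp & ~(dm_eventset_t)DM_VALID_DISP_EVENTS)
+		return(EINVAL);
+*/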
+
+
+/* isolate the read/write/trunc events of a dm_tokevent_t */
+
+#define DM_EVENT_RDWRTRUNC(tevp)			(  \
+	((tevp)->te_msg.ev_type == DM_EVENT_READ)	|| \
+	((tevp)->te_msg.ev_type == DM_EVENT_WRITE)	|| \
+	((tevp)->te_msg.ev_type == DM_EVENT_TRUNCATE)	)
+
+
+/*
+ *  Global handle hack isolation.
+ */
+
+#define DM_GLOBALHAN(hanp, hlen)	(((hanp) == DM_GLOBAL_HANP) && \
+					 ((hlen) == DM_GLOBAL_HLEN))
+
+
+#define DM_MAX_MSG_DATA		3960
+
+
+
+/* Supported filesystem function vector functions. */
+
+
+typedef struct {
+	int				code_level;
+	char				fsys_name[16];
+	dm_fsys_clear_inherit_t		clear_inherit;
+	dm_fsys_create_by_handle_t	create_by_handle;
+	dm_fsys_downgrade_right_t	downgrade_right;
+	dm_fsys_get_allocinfo_rvp_t	get_allocinfo_rvp;
+	dm_fsys_get_bulkall_rvp_t	get_bulkall_rvp;
+	dm_fsys_get_bulkattr_rvp_t	get_bulkattr_rvp;
+	dm_fsys_get_config_t		get_config;
+	dm_fsys_get_config_events_t	get_config_events;
+	dm_fsys_get_destroy_dmattr_t	get_destroy_dmattr;
+	dm_fsys_get_dioinfo_t		get_dioinfo;
+	dm_fsys_get_dirattrs_rvp_t	get_dirattrs_rvp;
+	dm_fsys_get_dmattr_t		get_dmattr;
+	dm_fsys_get_eventlist_t		get_eventlist;
+	dm_fsys_get_fileattr_t		get_fileattr;
+	dm_fsys_get_region_t		get_region;
+	dm_fsys_getall_dmattr_t		getall_dmattr;
+	dm_fsys_getall_inherit_t	getall_inherit;
+	dm_fsys_init_attrloc_t		init_attrloc;
+	dm_fsys_mkdir_by_handle_t	mkdir_by_handle;
+	dm_fsys_probe_hole_t		probe_hole;
+	dm_fsys_punch_hole_t		punch_hole;
+	dm_fsys_read_invis_rvp_t	read_invis_rvp;
+	dm_fsys_release_right_t		release_right;
+	dm_fsys_remove_dmattr_t		remove_dmattr;
+	dm_fsys_request_right_t		request_right;
+	dm_fsys_set_dmattr_t		set_dmattr;
+	dm_fsys_set_eventlist_t		set_eventlist;
+	dm_fsys_set_fileattr_t		set_fileattr;
+	dm_fsys_set_inherit_t		set_inherit;
+	dm_fsys_set_region_t		set_region;
+	dm_fsys_symlink_by_handle_t	symlink_by_handle;
+	dm_fsys_sync_by_handle_t	sync_by_handle;
+	dm_fsys_upgrade_right_t		upgrade_right;
+	dm_fsys_write_invis_rvp_t	write_invis_rvp;
+	dm_fsys_obj_ref_hold_t		obj_ref_hold;
+} dm_fsys_vector_t;
+
+
+extern	dm_session_t	*dm_sessions;		/* head of session list */
+extern	dm_fsreg_t	*dm_registers;
+extern	lock_t		dm_reg_lock;		/* lock for registration list */
+
+/*
+ *  Kernel only prototypes.
+ */
+
+int		dm_find_session_and_lock(
+			dm_sessid_t	sid,
+			dm_session_t	**sessionpp,
+			unsigned long	*lcp);
+
+int		dm_find_msg_and_lock(
+			dm_sessid_t	sid,
+			dm_token_t	token,
+			dm_tokevent_t	**tevpp,
+			unsigned long	*lcp);
+
+dm_tokevent_t * dm_evt_create_tevp(
+			dm_eventtype_t	event,
+			int		variable_size,
+			void		**msgpp);
+
+int		dm_app_get_tdp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			short		types,
+			dm_right_t	right,
+			dm_tokdata_t	**tdpp);
+
+int		dm_get_config_tdp(
+			void		*hanp,
+			size_t		hlen,
+			dm_tokdata_t	**tdpp);
+
+void		dm_app_put_tdp(
+			dm_tokdata_t	*tdp);
+
+void		dm_put_tevp(
+			dm_tokevent_t	*tevp,
+			dm_tokdata_t	*tdp);
+
+void		dm_evt_rele_tevp(
+			dm_tokevent_t	*tevp,
+			int		droprights);
+
+int		dm_enqueue_normal_event(
+			vfs_t		*vfsp,
+			dm_tokevent_t	*tevp,
+			int		flags);
+
+int		dm_enqueue_mount_event(
+			vfs_t		*vfsp,
+			dm_tokevent_t	*tevp);
+
+int		dm_enqueue_sendmsg_event(
+			dm_sessid_t	targetsid,
+			dm_tokevent_t	*tevp,
+			int		synch);
+
+int		dm_enqueue_user_event(
+			dm_sessid_t	sid,
+			dm_tokevent_t	*tevp,
+			dm_token_t	*tokenp);
+
+int		dm_obj_ref_query_rvp(
+			dm_sessid_t	sid,
+			dm_token_t	token,
+			void		*hanp,
+			size_t		hlen,
+			int		*rvp);
+
+int		dm_read_invis_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_off_t	off,
+			dm_size_t	len,
+			void		*bufp,
+			int		*rvp);
+
+int		dm_write_invis_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			int		flags,
+			dm_off_t	off,
+			dm_size_t	len,
+			void		*bufp,
+			int		*rvp);
+
+int		dm_get_bulkattr_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			u_int		mask,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvp);
+
+int		dm_get_bulkall_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			u_int		mask,
+			dm_attrname_t	*attrnamep,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvp);
+
+int		dm_get_dirattrs_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			u_int		mask,
+			dm_attrloc_t	*locp,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp,
+			int		*rvp);
+
+int		dm_get_allocinfo_rvp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_off_t	*offp,
+			u_int		nelem,
+			dm_extent_t	*extentp,
+			u_int		*nelemp,
+			int		*rvp);
+
+int		dm_waitfor_destroy_attrname(
+			vfs_t		*vfsp,
+			dm_attrname_t	*attrnamep);
+
+void		dm_clear_fsreg(
+			dm_session_t	*s);
+
+int		dm_add_fsys_entry(
+			vfs_t		*vfsp,
+			dm_tokevent_t	*tevp);
+
+void		dm_change_fsys_entry(
+			vfs_t		*vfsp,
+			dm_fsstate_t	newstate);
+
+void		dm_remove_fsys_entry(
+			vfs_t		*vfsp);
+
+dm_fsys_vector_t *dm_fsys_vector(
+			vnode_t		*vp);
+
+void		dm_fsys_vector_free(void);
+
+int		dm_waitfor_disp_session(
+			vfs_t		*vfsp,
+			dm_tokevent_t	*tevp,
+			dm_session_t	**sessionpp,
+			unsigned long	*lcp);
+
+vnode_t *	dm_handle_to_vp(
+			xfs_handle_t	*handlep,
+			short		*typep);
+
+int		dm_check_dmapi_vp(
+			vnode_t		*vp);
+
+dm_tokevent_t * dm_find_mount_tevp_and_lock(
+			fsid_t		*fsidp,
+			unsigned long	*lcp);
+
+int		dm_path_to_hdl(
+			char		*path,
+			void		*hanp,
+			size_t		*hlenp);
+
+int		dm_path_to_fshdl(
+			char		*path,
+			void		*hanp,
+			size_t		*hlenp);
+
+int		dm_fd_to_hdl(
+			int		fd,
+			void		*hanp,
+			size_t		*hlenp);
+
+int		dm_upgrade_right(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token);
+
+int		dm_downgrade_right(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token);
+
+int		dm_request_right(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			u_int		flags,
+			dm_right_t	right);
+
+int		dm_release_right(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token);
+
+int		dm_query_right(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_right_t	*rightp);
+
+
+int		dm_set_eventlist(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_eventset_t	*eventsetp,
+			u_int		maxevent);
+
+int		dm_obj_ref_hold(
+			dm_sessid_t	sid,
+			dm_token_t	token,
+			void		*hanp,
+			size_t		hlen);
+
+int		dm_obj_ref_rele(
+			dm_sessid_t	sid,
+			dm_token_t	token,
+			void		*hanp,
+			size_t		hlen);
+
+int		dm_get_eventlist(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			u_int		nelem,
+			dm_eventset_t	*eventsetp,
+			u_int		*nelemp);
+
+
+int		dm_set_disp(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_eventset_t	*eventsetp,
+			u_int		maxevent);
+
+
+int		dm_set_return_on_destroy(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			dm_attrname_t	*attrnamep,
+			dm_boolean_t	enable);
+
+
+int		dm_get_mountinfo(
+			dm_sessid_t	sid,
+			void		*hanp,
+			size_t		hlen,
+			dm_token_t	token,
+			size_t		buflen,
+			void		*bufp,
+			size_t		*rlenp);
+
+void		dm_link_event(
+			dm_tokevent_t	*tevp,
+			dm_eventq_t	*queue);
+
+void		dm_unlink_event(
+			dm_tokevent_t	*tevp,
+			dm_eventq_t	*queue);
+
+int		dm_open_by_handle_rvp(
+			unsigned int	fd,
+			void		*hanp,
+			size_t		hlen,
+			int		mode,
+			int		*rvp);
+
+int		dm_copyin_handle(
+			void		*hanp,
+			size_t		hlen,
+			xfs_handle_t	*handlep);
+
+#endif	/* _DMAPI_PRIVATE_H */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_region.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_region.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_region.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_region.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+int
+dm_get_region(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		nelem,
+	dm_region_t	*regbufp,
+	u_int		*nelemp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->get_region(tdp->td_vp, tdp->td_right,
+		nelem, regbufp, nelemp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+
+int
+dm_set_region(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		nelem,
+	dm_region_t	*regbufp,
+	dm_boolean_t	*exactflagp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_REG,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->set_region(tdp->td_vp, tdp->td_right,
+		nelem, regbufp, exactflagp);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
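+
+/* Both routines above follow the canonical shape of a DMAPI entry point:
+   resolve the handle and token to a tokdata holding the required right,
+   take the behavior-chain read lock, dispatch through the filesystem's
+   function vector, then unlock and release the tokdata.
+*/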
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_register.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_register.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_register.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_register.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,1555 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+
+dm_fsreg_t	*dm_registers;	/* head of filesystem registration list */
+int		dm_fsys_cnt;	/* number of filesystems on dm_registers list */
+lock_t		dm_reg_lock = SPIN_LOCK_UNLOCKED;/* lock for dm_registers */
+
+
+
+#ifdef CONFIG_PROC_FS
+static int
+fsreg_read_pfs(char *buffer, char **start, off_t offset,
+		 int count, int *eof, void *data)
+{
+	int len;
+	int i;
+	dm_fsreg_t	*fsrp = (dm_fsreg_t*)data;
+	char		statebuf[30];
+
+#define CHKFULL if(len >= count) break;
+#define ADDBUF(a,b)	len += sprintf(buffer + len, a, b); CHKFULL;
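+
+/* Note: each macro expands to two statements; CHKFULL's "break" relies on
+   the enclosing single-pass while(1) loop below to stop formatting as soon
+   as the output buffer is full.
+*/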
+
+	switch (fsrp->fr_state) {
+	case DM_STATE_MOUNTING:		sprintf(statebuf, "mounting"); break;
+	case DM_STATE_MOUNTED:		sprintf(statebuf, "mounted"); break;
+	case DM_STATE_UNMOUNTING:	sprintf(statebuf, "unmounting"); break;
+	case DM_STATE_UNMOUNTED:	sprintf(statebuf, "unmounted"); break;
+	default:
+		sprintf(statebuf, "unknown:%d", (int)fsrp->fr_state);
+		break;
+	}
+
+	len=0;
+	while(1){
+		ADDBUF("fsrp=0x%p\n", fsrp);
+		ADDBUF("fr_next=0x%p\n", fsrp->fr_next);
+		ADDBUF("fr_vfsp=0x%p\n", fsrp->fr_vfsp);
+		ADDBUF("fr_tevp=0x%p\n", fsrp->fr_tevp);
+		ADDBUF("fr_fsid=%c\n", '?');
+		ADDBUF("fr_msg=0x%p\n", fsrp->fr_msg);
+		ADDBUF("fr_msgsize=%d\n", fsrp->fr_msgsize);
+		ADDBUF("fr_state=%s\n", statebuf);
+		ADDBUF("fr_dispq=%c\n", '?');
+		ADDBUF("fr_dispcnt=%d\n", fsrp->fr_dispcnt);
+
+		ADDBUF("fr_evt_dispq.eq_head=0x%p\n", fsrp->fr_evt_dispq.eq_head);
+		ADDBUF("fr_evt_dispq.eq_tail=0x%p\n", fsrp->fr_evt_dispq.eq_tail);
+		ADDBUF("fr_evt_dispq.eq_count=%d\n", fsrp->fr_evt_dispq.eq_count);
+
+		ADDBUF("fr_queue=%c\n", '?');
+		ADDBUF("fr_lock=%c\n", '?');
+		ADDBUF("fr_hdlcnt=%d\n", fsrp->fr_hdlcnt);
+		ADDBUF("fr_vfscnt=%d\n", fsrp->fr_vfscnt);
+		ADDBUF("fr_unmount=%d\n", fsrp->fr_unmount);
+
+		len += sprintf(buffer + len, "fr_rattr=");
+		CHKFULL;
+		for(i = 0; i < DM_ATTR_NAME_SIZE; ++i){
+			ADDBUF("%c", fsrp->fr_rattr.an_chars[i]);
+		}
+		CHKFULL;
+		len += sprintf(buffer + len, "\n");
+		CHKFULL;
+
+		for(i = 0; i < DM_EVENT_MAX; i++){
+			if( fsrp->fr_sessp[i] != NULL ){
+				ADDBUF("fr_sessp[%d]=", i);
+				ADDBUF("0x%p\n", fsrp->fr_sessp[i]);
+			}
+		}
+		CHKFULL;
+
+		break;
+	}
+
+	if (offset >= len) {
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)
+		return count;
+	*eof = 1;
+
+	return len;
+}
+#endif
+
+
+/* Returns a pointer to the filesystem structure for the filesystem
+   referenced by fsidp.  The caller is responsible for obtaining dm_reg_lock
+   before calling this routine.
+*/
+
+static dm_fsreg_t *
+dm_find_fsreg(
+	fsid_t		*fsidp)
+{
+	dm_fsreg_t	*fsrp;
+
+	for (fsrp = dm_registers; fsrp; fsrp = fsrp->fr_next) {
+		if (!bcmp(&fsrp->fr_fsid, fsidp, sizeof(*fsidp)))
+			break;
+	}
+	return(fsrp);
+}
+
+
+/* Given a fsid_t, dm_find_fsreg_and_lock() finds the dm_fsreg_t structure
+   for that filesystem if one exists, and returns a pointer to the structure
+   after obtaining its 'fr_lock' so that the caller can safely modify the
+   dm_fsreg_t.	The caller is responsible for releasing 'fr_lock'.
+*/
+
+static dm_fsreg_t *
+dm_find_fsreg_and_lock(
+	fsid_t		*fsidp,
+	unsigned long	*lcp)		/* address of returned lock cookie */
+{
+	dm_fsreg_t	*fsrp;
+
+	for (;;) {
+		*lcp = mutex_spinlock(&dm_reg_lock);
+
+		if ((fsrp = dm_find_fsreg(fsidp)) == NULL) {
+			mutex_spinunlock(&dm_reg_lock, *lcp);
+			return(NULL);
+		}
+		if (spin_trylock(&fsrp->fr_lock)) {
+			nested_spinunlock(&dm_reg_lock);
+			return(fsrp);	/* success */
+		}
+
+		/* If the second lock is not available, drop the first and
+		   start over.	This gives the CPU a chance to process any
+		   interrupts, and also allows processes which want a fr_lock
+		   for a different filesystem to proceed.
+		*/
+
+		mutex_spinunlock(&dm_reg_lock, *lcp);
+	}
+}
+
+
+/* dm_add_fsys_entry() is called when a DM_EVENT_MOUNT event is about to be
+   sent.  It creates a dm_fsreg_t structure for the filesystem and stores a
+   pointer to a copy of the mount event within that structure so that it is
+   available for subsequent dm_get_mountinfo() calls.
+*/
+
+int
+dm_add_fsys_entry(
+	vfs_t		*vfsp,
+	dm_tokevent_t	*tevp)
+{
+	dm_fsreg_t	*fsrp;
+	int		msgsize;
+	void		*msg;
+	unsigned long	lc;			/* lock cookie */
+
+	/* Allocate and initialize a dm_fsreg_t structure for the filesystem. */
+
+	msgsize = tevp->te_allocsize - offsetof(dm_tokevent_t, te_event);
+	msg = kmalloc(msgsize, GFP_KERNEL);
+	if (msg == NULL) {
+		printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+		return ENOMEM;
+	}
+	bcopy(&tevp->te_event, msg, msgsize);
+
+	fsrp = kmem_cache_alloc(dm_fsreg_cachep, SLAB_KERNEL);
+	if (fsrp == NULL) {
+		kfree(msg);
+		printk("%s/%d: kmem_cache_alloc(dm_fsreg_cachep) returned NULL\n", __FUNCTION__, __LINE__);
+		return ENOMEM;
+	}
+	memset(fsrp, 0, sizeof(*fsrp));
+
+	fsrp->fr_vfsp = vfsp;
+	fsrp->fr_tevp = tevp;
+	fsrp->fr_fsid = *vfsp->vfs_altfsid;
+	fsrp->fr_msg = msg;
+	fsrp->fr_msgsize = msgsize;
+	fsrp->fr_state = DM_STATE_MOUNTING;
+	sv_init(&fsrp->fr_dispq, SV_DEFAULT, "fr_dispq");
+	sv_init(&fsrp->fr_queue, SV_DEFAULT, "fr_queue");
+	spinlock_init(&fsrp->fr_lock, "fr_lock");
+
+	/* If no other mounted DMAPI filesystem already has this same
+	   fsid_t, then add this filesystem to the list.
+	*/
+
+	lc = mutex_spinlock(&dm_reg_lock);
+
+	if (!dm_find_fsreg(vfsp->vfs_altfsid)) {
+		fsrp->fr_next = dm_registers;
+		dm_registers = fsrp;
+		dm_fsys_cnt++;
+#ifdef CONFIG_PROC_FS
+		{
+		char buf[100];
+		struct proc_dir_entry *entry;
+
+		sprintf(buf, DMAPI_DBG_PROCFS "/fsreg/0x%p", fsrp);
+		entry = create_proc_read_entry(buf, 0, 0, fsreg_read_pfs, fsrp);
+		if (entry)
+			entry->owner = THIS_MODULE;
+		}
+#endif
+		mutex_spinunlock(&dm_reg_lock, lc);
+		return(0);
+	}
+
+	/* A fsid_t collision occurred, so prevent this new filesystem from
+	   mounting.
+	*/
+
+	mutex_spinunlock(&dm_reg_lock, lc);
+
+	sv_destroy(&fsrp->fr_dispq);
+	sv_destroy(&fsrp->fr_queue);
+	spinlock_destroy(&fsrp->fr_lock);
+	kfree(msg);
+	kmem_cache_free(dm_fsreg_cachep, fsrp);
+	return(EBUSY);
+}
+
+
+/* dm_change_fsys_entry() is called whenever a filesystem's mount state is
+   about to change.  The state is changed to DM_STATE_MOUNTED after a
+   successful DM_EVENT_MOUNT event or after a failed unmount.  It is changed
+   to DM_STATE_UNMOUNTING after a successful DM_EVENT_PREUNMOUNT event.
+   Finally, the state is changed to DM_STATE_UNMOUNTED after a successful
+   unmount.  It stays in this state until the DM_EVENT_UNMOUNT event is
+   queued, at which point the filesystem entry is removed.
+*/
+
+void
+dm_change_fsys_entry(
+	vfs_t		*vfsp,
+	dm_fsstate_t	newstate)
+{
+	dm_fsreg_t	*fsrp;
+	int		seq_error;
+	unsigned long	lc;			/* lock cookie */
+
+	/* Find the filesystem referenced by the vfsp's fsid_t.	 This should
+	   always succeed.
+	*/
+
+	if ((fsrp = dm_find_fsreg_and_lock(vfsp->vfs_altfsid, &lc)) == NULL) {
+		panic("dm_change_fsys_entry: can't find DMAPI fsrp for "
+			"vfsp %p\n", vfsp);
+	}
+
+	/* Make sure that the new state is acceptable given the current state
+	   of the filesystem.  Any error here is a major DMAPI/filesystem
+	   screwup.
+	*/
+
+	seq_error = 0;
+	switch (newstate) {
+	case DM_STATE_MOUNTED:
+		if (fsrp->fr_state != DM_STATE_MOUNTING &&
+		    fsrp->fr_state != DM_STATE_UNMOUNTING) {
+			seq_error++;
+		}
+		break;
+	case DM_STATE_UNMOUNTING:
+		if (fsrp->fr_state != DM_STATE_MOUNTED)
+			seq_error++;
+		break;
+	case DM_STATE_UNMOUNTED:
+		if (fsrp->fr_state != DM_STATE_UNMOUNTING)
+			seq_error++;
+		break;
+	default:
+		seq_error++;
+		break;
+	}
+	if (seq_error) {
+		panic("dm_change_fsys_entry: DMAPI sequence error: old state "
+			"%d, new state %d, fsrp %p\n", fsrp->fr_state,
+			newstate, fsrp);
+	}
+
+	/* If the old state was DM_STATE_UNMOUNTING, then processes could be
+	   sleeping in dm_handle_to_vp() waiting for their DM_NO_TOKEN handles
+	   to be translated to vnodes.	Wake them up so that they either
+	   continue (new state is DM_STATE_MOUNTED) or fail (new state is
+	   DM_STATE_UNMOUNTED).
+	*/
+
+	if (fsrp->fr_state == DM_STATE_UNMOUNTING) {
+		if (fsrp->fr_hdlcnt)
+			sv_broadcast(&fsrp->fr_queue);
+	}
+
+	/* Change the filesystem's mount state to its new value. */
+
+	fsrp->fr_state = newstate;
+	fsrp->fr_tevp = NULL;		/* not valid after DM_STATE_MOUNTING */
+
+	/* If the new state is DM_STATE_UNMOUNTING, wait until any application
+	   threads currently in the process of making VFS_VGET and VFS_ROOT
+	   calls are done before we let this unmount thread continue the
+	   unmount.  (We want to make sure that the unmount will see these
+	   vnode references during its scan.)
+	*/
+
+	if (newstate == DM_STATE_UNMOUNTING) {
+		while (fsrp->fr_vfscnt) {
+			fsrp->fr_unmount++;
+			sv_wait(&fsrp->fr_queue, 1, &fsrp->fr_lock, lc);
+			lc = mutex_spinlock(&fsrp->fr_lock);
+			fsrp->fr_unmount--;
+		}
+	}
+
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+}
+
+
+/* dm_remove_fsys_entry() gets called after a failed mount or after an
+   DM_EVENT_UNMOUNT event has been queued.  (The filesystem entry must stay
+   until the DM_EVENT_UNMOUNT reply is queued so that the event can use the
+   'fr_sessp' list to see which session to send the event to.)
+*/
+
+void
+dm_remove_fsys_entry(
+	vfs_t		*vfsp)
+{
+	dm_fsreg_t	**fsrpp;
+	dm_fsreg_t	*fsrp;
+	unsigned long	lc;			/* lock cookie */
+
+	/* Find the filesystem referenced by the vfsp's fsid_t and dequeue
+	   it after verifying that the fr_state shows a filesystem that is
+	   either mounting or unmounted.
+	*/
+
+	lc = mutex_spinlock(&dm_reg_lock);
+
+	fsrpp = &dm_registers;
+	while ((fsrp = *fsrpp) != NULL) {
+		if (!bcmp(&fsrp->fr_fsid, vfsp->vfs_altfsid, sizeof(fsrp->fr_fsid)))
+			break;
+		fsrpp = &fsrp->fr_next;
+	}
+	if (fsrp == NULL) {
+		mutex_spinunlock(&dm_reg_lock, lc);
+		panic("dm_remove_fsys_entry: can't find DMAPI fsrp for "
+			"vfsp %p\n", vfsp);
+	}
+
+	nested_spinlock(&fsrp->fr_lock);
+
+	/* Verify that it makes sense to remove this entry. */
+
+	if (fsrp->fr_state != DM_STATE_MOUNTING &&
+	    fsrp->fr_state != DM_STATE_UNMOUNTED) {
+		nested_spinunlock(&fsrp->fr_lock);
+		mutex_spinunlock(&dm_reg_lock, lc);
+		panic("dm_remove_fsys_entry: DMAPI sequence error: old state "
+			"%d, fsrp %p\n", fsrp->fr_state, fsrp);
+	}
+
+	*fsrpp = fsrp->fr_next;
+	dm_fsys_cnt--;
+
+	nested_spinunlock(&dm_reg_lock);
+
+	/* Since the filesystem is about to finish unmounting, we must be sure
+	   that no vnodes are being referenced within the filesystem before we
+	   let this event thread continue.  If the filesystem is currently in
+	   state DM_STATE_MOUNTING, then we know by definition that there can't
+	   be any references.  If the filesystem is DM_STATE_UNMOUNTED, then
+	   any application threads referencing handles with DM_NO_TOKEN should
+	   have already been awakened by dm_change_fsys_entry and should be
+	   long gone by now.  Just in case they haven't yet left, sleep here
+	   until they are really gone.
+	*/
+
+	while (fsrp->fr_hdlcnt) {
+		fsrp->fr_unmount++;
+		sv_wait(&fsrp->fr_queue, 1, &fsrp->fr_lock, lc);
+		lc = mutex_spinlock(&fsrp->fr_lock);
+		fsrp->fr_unmount--;
+	}
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+
+	/* Release all memory. */
+
+#ifdef CONFIG_PROC_FS
+	{
+	char buf[100];
+	sprintf(buf, DMAPI_DBG_PROCFS "/fsreg/0x%p", fsrp);
+	remove_proc_entry(buf, NULL);
+	}
+#endif
+	sv_destroy(&fsrp->fr_dispq);
+	sv_destroy(&fsrp->fr_queue);
+	spinlock_destroy(&fsrp->fr_lock);
+	kfree(fsrp->fr_msg);
+	kmem_cache_free(dm_fsreg_cachep, fsrp);
+}
+
+
+/* Get a vnode for the object referenced by handlep.  We cannot use
+   altgetvfs() because it fails if the VFS_OFFLINE bit is set, which means
+   that any call to dm_handle_to_vp() while a umount is in progress would
+   return an error, even if the umount can't possibly succeed because users
+   are in the filesystem.  The requests would start to fail as soon as the
+   umount begins, even before the application receives the DM_EVENT_PREUNMOUNT
+   event.
+
+   dm_handle_to_vp() emulates the behavior of lookup() while an unmount is
+   in progress.	 Any call to dm_handle_to_vp() while the filesystem is in the
+   DM_STATE_UNMOUNTING state will block.  If the unmount eventually succeeds,
+   the requests will wake up and fail.	If the unmount fails, the requests will
+   wake up and complete normally.
+
+   While a filesystem is in state DM_STATE_MOUNTING, dm_handle_to_vp() will
+   fail all requests.  Per the DMAPI spec, the only handles in the filesystem
+   which are valid during a mount event are the handles within the event
+   itself.
+*/
+
+vnode_t *
+dm_handle_to_vp(
+	xfs_handle_t	*handlep,
+	short		*typep)
+{
+	dm_fsreg_t	*fsrp;
+	vnode_t		*vp;
+	short		type;
+	unsigned long	lc;			/* lock cookie */
+	int		error;
+	fid_t		*fidp;
+
+	if ((fsrp = dm_find_fsreg_and_lock((fsid_t*)&handlep->ha_fsid, &lc)) == NULL)
+		return(NULL);
+
+	fidp = (fid_t*)&handlep->ha_fid;
+	/* If mounting, and we are not asking for a filesystem handle,
+	 * then fail the request.  (fid_len==0 for fshandle)
+	 */
+	if ((fsrp->fr_state == DM_STATE_MOUNTING) &&
+	    (fidp->fid_len != 0)) {
+		mutex_spinunlock(&fsrp->fr_lock, lc);
+		return(NULL);
+	}
+
+	for (;;) {
+		if (fsrp->fr_state == DM_STATE_MOUNTING)
+			break;
+		if (fsrp->fr_state == DM_STATE_MOUNTED)
+			break;
+		if (fsrp->fr_state == DM_STATE_UNMOUNTED) {
+			if (fsrp->fr_unmount && fsrp->fr_hdlcnt == 0)
+				sv_broadcast(&fsrp->fr_queue);
+			mutex_spinunlock(&fsrp->fr_lock, lc);
+			return(NULL);
+		}
+
+		/* Must be DM_STATE_UNMOUNTING. */
+
+		fsrp->fr_hdlcnt++;
+		sv_wait(&fsrp->fr_queue, 1, &fsrp->fr_lock, lc);
+		lc = mutex_spinlock(&fsrp->fr_lock);
+		fsrp->fr_hdlcnt--;
+	}
+
+	fsrp->fr_vfscnt++;
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+
+	/* Now that the mutex is released, wait until we have access to the
+	   vnode.
+	*/
+
+	if (fidp->fid_len == 0) {	/* filesystem handle */
+		VFS_ROOT(fsrp->fr_vfsp, &vp, error);
+	} else {				/* file object handle */
+		VFS_VGET(fsrp->fr_vfsp, &vp, fidp, error);
+	}
+
+	lc = mutex_spinlock(&fsrp->fr_lock);
+
+	fsrp->fr_vfscnt--;
+	if (fsrp->fr_unmount && fsrp->fr_vfscnt == 0)
+		sv_broadcast(&fsrp->fr_queue);
+
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+	if (error || vp == NULL)
+		return(NULL);
+
+	if (fidp->fid_len == 0) {
+		type = DM_TDT_VFS;
+	} else if (vp->v_type == VREG) {
+		type = DM_TDT_REG;
+	} else if (vp->v_type == VDIR) {
+		type = DM_TDT_DIR;
+	} else if (vp->v_type == VLNK) {
+		type = DM_TDT_LNK;
+	} else {
+		type = DM_TDT_OTH;
+	}
+	*typep = type;
+	return(vp);
+}
+
+
+int
+dm_vp_to_handle(
+	vnode_t		*vp,
+	xfs_handle_t	*handlep)
+{
+	int		error;
+	struct	fid	fid;
+	int		hsize;
+
+	if (vp->v_vfsp->vfs_altfsid == NULL)
+		return(EINVAL);
+
+	VOP_FID2(vp, &fid, error);
+	if (error)
+		return(error);
+
+	bcopy (vp->v_vfsp->vfs_altfsid, &handlep->ha_fsid, sizeof(fsid_t));
+	bcopy(&fid, &handlep->ha_fid, fid.fid_len + sizeof fid.fid_len);
+	hsize = XFS_HSIZE(*handlep);
+	bzero ((char *)handlep + hsize, sizeof(*handlep) - hsize);
+	return(0);
+}
+
+
+/* Given a vnode, check if that vnode resides in a filesystem that supports
+   DMAPI.  Returns zero if the vnode is in a DMAPI filesystem, otherwise
+   returns an errno.
+*/
+
+int
+dm_check_dmapi_vp(
+	vnode_t		*vp)
+{
+	xfs_handle_t	handle;
+	/* REFERENCED */
+	dm_fsreg_t	*fsrp;
+	int		error;
+	unsigned long	lc;			/* lock cookie */
+
+	if ((error = dm_vp_to_handle(vp, &handle)) != 0)
+		return(error);
+
+	if ((fsrp = dm_find_fsreg_and_lock((fsid_t*)&handle.ha_fsid, &lc)) == NULL)
+		return(EBADF);
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+	return(0);
+}
+
+
+/* Return a pointer to the DM_EVENT_MOUNT event while a mount is still in
+   progress.  This is only called by dm_get_config and dm_get_config_events
+   which need to access the filesystem during a mount but which don't have
+   a session and token to use.
+*/
+
+dm_tokevent_t *
+dm_find_mount_tevp_and_lock(
+	fsid_t		*fsidp,
+	unsigned long	*lcp)		/* address of returned lock cookie */
+{
+	dm_fsreg_t	*fsrp;
+
+	if ((fsrp = dm_find_fsreg_and_lock(fsidp, lcp)) == NULL)
+		return(NULL);
+
+	if (!fsrp->fr_tevp || fsrp->fr_state != DM_STATE_MOUNTING) {
+		mutex_spinunlock(&fsrp->fr_lock, *lcp);
+		return(NULL);
+	}
+	nested_spinlock(&fsrp->fr_tevp->te_lock);
+	nested_spinunlock(&fsrp->fr_lock);
+	return(fsrp->fr_tevp);
+}
+
+
+/* Wait interruptibly until a session registers disposition for 'event' in
+   filesystem 'vfsp'.  Upon successful exit, both the filesystem's dm_fsreg_t
+   structure and the session's dm_session_t structure are locked.  The caller
+   is responsible for unlocking both structures using the returned cookies.
+
+   Warning: The locks can be dropped in any order, but the 'lc2p' cookie MUST
+   BE USED FOR THE FIRST UNLOCK, and the 'lc1p' cookie must be used for the
+   second unlock.  If this is not done, the CPU will be interruptible while
+   holding a mutex, which could deadlock the machine!
+*/
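+
+/* For example, dm_waitfor_destroy_attrname below releases the two locks as:
+
+	mutex_spinunlock(&s->sn_qlock, lc2);	(second cookie, first unlock)
+	mutex_spinunlock(&fsrp->fr_lock, lc1);	(first cookie, second unlock)
+*/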
+
+static int
+dm_waitfor_disp(
+	vfs_t		*vfsp,
+	dm_tokevent_t	*tevp,
+	dm_fsreg_t	**fsrpp,
+	unsigned long	*lc1p,		/* addr of first returned lock cookie */
+	dm_session_t	**sessionpp,
+	unsigned long	*lc2p)		/* addr of 2nd returned lock cookie */
+{
+	dm_eventtype_t	event = tevp->te_msg.ev_type;
+	dm_session_t	*s;
+	dm_fsreg_t	*fsrp;
+
+	if ((fsrp = dm_find_fsreg_and_lock(vfsp->vfs_altfsid, lc1p)) == NULL)
+		return(ENOENT);
+
+	/* If no session is registered for this event in the specified
+	   filesystem, then sleep interruptibly until one does.
+	*/
+
+	for (;;) {
+		int	rc = 0;
+
+		/* The dm_find_session_and_lock() call is needed because a
+		   session that is in the process of being removed might still
+		   be in the dm_fsreg_t structure but won't be in the
+		   dm_sessions list.
+		*/
+
+		if ((s = fsrp->fr_sessp[event]) != NULL &&
+		    dm_find_session_and_lock(s->sn_sessid, &s, lc2p) == 0) {
+			break;
+		}
+
+		/* No one is currently registered.  DM_EVENT_UNMOUNT events
+		   don't wait for anyone to register because the unmount is
+		   already past the point of no return.
+		*/
+
+		if (event == DM_EVENT_UNMOUNT) {
+			mutex_spinunlock(&fsrp->fr_lock, *lc1p);
+			return(ENOENT);
+		}
+
+		/* Wait until a session registers for disposition of this
+		   event.
+		*/
+
+		fsrp->fr_dispcnt++;
+		dm_link_event(tevp, &fsrp->fr_evt_dispq);
+
+		sv_wait_sig(&fsrp->fr_dispq, 1, &fsrp->fr_lock, *lc1p);
+		rc = signal_pending(current);
+
+		*lc1p = mutex_spinlock(&fsrp->fr_lock);
+		fsrp->fr_dispcnt--;
+		dm_unlink_event(tevp, &fsrp->fr_evt_dispq);
+		if (rc) {		/* if signal was received */
+			mutex_spinunlock(&fsrp->fr_lock, *lc1p);
+			return(EINTR);
+		}
+	}
+	*sessionpp = s;
+	*fsrpp = fsrp;
+	return(0);
+}
+
+
+/* Returns the session pointer for the session registered for an event
+   in the given vfsp.  If successful, the session is locked upon return.  The
+   caller is responsible for releasing the lock.  If no session is currently
+   registered for the event, dm_waitfor_disp_session() will sleep interruptibly
+   until a registration occurs.
+*/
+
+int
+dm_waitfor_disp_session(
+	vfs_t		*vfsp,
+	dm_tokevent_t	*tevp,
+	dm_session_t	**sessionpp,
+	unsigned long	*lcp)
+{
+	dm_fsreg_t	*fsrp;
+	unsigned long	lc2;
+	int		error;
+
+	if (tevp->te_msg.ev_type < 0 || tevp->te_msg.ev_type >= DM_EVENT_MAX)
+		return(EIO);
+
+	error = dm_waitfor_disp(vfsp, tevp, &fsrp, lcp, sessionpp, &lc2);
+	if (!error)
+		mutex_spinunlock(&fsrp->fr_lock, lc2);	/* rev. cookie order*/
+	return(error);
+}
+
+
+/* Find the session registered for the DM_EVENT_DESTROY event on the specified
+   filesystem, sleeping if necessary until registration occurs.	 Once found,
+   copy the session's return-on-destroy attribute name, if any, back to the
+   caller.
+*/
+
+int
+dm_waitfor_destroy_attrname(
+	vfs_t		*vfsp,
+	dm_attrname_t	*attrnamep)
+{
+	dm_tokevent_t	*tevp;
+	dm_session_t	*s;
+	dm_fsreg_t	*fsrp;
+	int		error;
+	unsigned long	lc1;		/* first lock cookie */
+	unsigned long	lc2;		/* second lock cookie */
+	void		*msgp;
+
+	tevp = dm_evt_create_tevp(DM_EVENT_DESTROY, 1, (void**)&msgp);
+	if (tevp == NULL)
+		return(ENOMEM);
+	error = dm_waitfor_disp(vfsp, tevp, &fsrp, &lc1, &s, &lc2);
+	if (!error) {
+		*attrnamep = fsrp->fr_rattr;		/* attribute or zeros */
+		mutex_spinunlock(&s->sn_qlock, lc2);	/* rev. cookie order */
+		mutex_spinunlock(&fsrp->fr_lock, lc1);
+	}
+	dm_evt_rele_tevp(tevp, 0);
+	return(error);
+}
+
+
+/* Unregisters the session for the disposition of all events on all
+   filesystems.	 This routine is not called until the session has been
+   dequeued from the session list and its session lock has been dropped,
+   but before the actual structure is freed, so it is safe to grab the
+   'dm_reg_lock' here.	If dm_waitfor_disp_session() happens to be called
+   by another thread, it won't find this session on the session list and
+   will wait until a new session registers.
+*/
+
+void
+dm_clear_fsreg(
+	dm_session_t	*s)
+{
+	dm_fsreg_t	*fsrp;
+	int		event;
+	unsigned long	lc;			/* lock cookie */
+
+	lc = mutex_spinlock(&dm_reg_lock);
+
+	for (fsrp = dm_registers; fsrp != NULL; fsrp = fsrp->fr_next) {
+		nested_spinlock(&fsrp->fr_lock);
+		for (event = 0; event < DM_EVENT_MAX; event++) {
+			if (fsrp->fr_sessp[event] != s)
+				continue;
+			fsrp->fr_sessp[event] = NULL;
+			if (event == DM_EVENT_DESTROY)
+				bzero(&fsrp->fr_rattr, sizeof(fsrp->fr_rattr));
+		}
+		nested_spinunlock(&fsrp->fr_lock);
+	}
+
+	mutex_spinunlock(&dm_reg_lock, lc);
+}
+
+
+/*
+ *  Return the handle for the object named by path.
+ */
+
+int
+dm_path_to_hdl(
+	char		*path,		/* any path name */
+	void		*hanp,		/* user's data buffer */
+	size_t		*hlenp)		/* set to size of data copied */
+{
+	/* REFERENCED */
+	dm_fsreg_t	*fsrp;
+	xfs_handle_t	handle;
+	vnode_t		*vp;
+	size_t		hlen;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+	struct nameidata nd;
+	struct inode *inode;
+	size_t		len;
+	char		*name;
+
+	/* XXX get things straightened out so getname() works here? */
+	len = strnlen_user(path, 2000);
+	if (len == 0)		/* fault while reading the user string */
+		return(EFAULT);
+	name = kmalloc(len, GFP_KERNEL);
+	if (name == NULL) {
+		printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+		return(ENOMEM);
+	}
+	if (copy_from_user(name, path, len)) {
+		kfree(name);
+		return(EFAULT);
+	}
+
+	error = 0;
+	if (path_init(name, LOOKUP_POSITIVE, &nd))
+		error = path_walk(name, &nd);
+	kfree(name);
+	if (error)
+		return error;
+
+	ASSERT(nd.dentry);
+	ASSERT(nd.dentry->d_inode);
+	inode = igrab(nd.dentry->d_inode);
+	path_release(&nd);
+
+	if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
+		/* we're not in XFS anymore, Toto */
+		iput(inode);
+		return EINVAL;
+	}
+
+	/* we need the vnode */
+	vp = LINVFS_GET_VP(inode);
+	error = dm_vp_to_handle(vp, &handle);
+	iput(inode);
+	if (error)
+		return(error);
+
+	if ((fsrp = dm_find_fsreg_and_lock((fsid_t*)&handle.ha_fsid, &lc)) == NULL)
+		return(EBADF);
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+
+	hlen = XFS_HSIZE(handle);
+
+	if (copy_to_user(hanp, &handle, (int)hlen))
+		return(EFAULT);
+	return(put_user(hlen, hlenp));
+}
+
+
+/*
+ *  Return the handle for the file system containing the object named by path.
+ */
+
+int
+dm_path_to_fshdl(
+	char		*path,		/* any path name */
+	void		*hanp,		/* user's data buffer */
+	size_t		*hlenp)		/* set to size of data copied */
+{
+	/* REFERENCED */
+	dm_fsreg_t	*fsrp;
+	xfs_handle_t	handle;
+	vnode_t		*vp;
+	size_t		hlen;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+	struct nameidata nd;
+	struct inode *inode;
+	size_t		len;
+	char		*name;
+
+	/* XXX get things straightened out so getname() works here? */
+	len = strnlen_user(path, 2000);
+	if (len == 0)		/* fault while reading the user string */
+		return(EFAULT);
+	name = kmalloc(len, GFP_KERNEL);
+	if (name == NULL) {
+		printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+		return(ENOMEM);
+	}
+	if (copy_from_user(name, path, len)) {
+		kfree(name);
+		return(EFAULT);
+	}
+
+	error = 0;
+	if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &nd))
+		error = path_walk(name, &nd);
+	kfree(name);
+	if (error)
+		return error;
+
+	ASSERT(nd.dentry);
+	ASSERT(nd.dentry->d_inode);
+
+	inode = igrab(nd.dentry->d_inode);
+	path_release(&nd);
+
+	if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
+		/* we're not in XFS anymore, Toto */
+		iput(inode);
+		return EINVAL;
+	}
+
+	/* we need the vnode */
+	vp = LINVFS_GET_VP(inode);
+	error = dm_vp_to_handle(vp, &handle);
+	iput(inode);
+
+	if (error)
+		return(error);
+
+	if ((fsrp = dm_find_fsreg_and_lock((fsid_t*)&handle.ha_fsid, &lc)) == NULL)
+		return(EBADF);
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+
+	hlen = FSHSIZE;
+	if (copy_to_user(hanp, &handle, (int)hlen))
+		return(EFAULT);
+	return(put_user(hlen, hlenp));
+}
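+
+/* Note: dm_path_to_hdl above and dm_fd_to_hdl below copy a full object
+   handle of XFS_HSIZE(handle) bytes out to the user, while dm_path_to_fshdl
+   copies out only the first FSHSIZE bytes, i.e. the filesystem portion of
+   the handle.
+*/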
+
+
+int
+dm_fd_to_hdl(
+	int		fd,		/* any file descriptor */
+	void		*hanp,		/* user's data buffer */
+	size_t		*hlenp)		/* set to size of data copied */
+{
+	/* REFERENCED */
+	dm_fsreg_t	*fsrp;
+	xfs_handle_t	handle;
+	size_t		hlen;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+	struct file *filep = fget(fd);
+
+	if (!filep)
+		return(EBADF);
+	if ((error = dm_vp_to_handle(LINVFS_GET_VP(filep->f_dentry->d_inode), &handle)) != 0) {
+		fput(filep);
+		return(error);
+	}
+
+	if ((fsrp = dm_find_fsreg_and_lock((fsid_t*)&handle.ha_fsid, &lc)) == NULL) {
+		fput(filep);
+		return(EBADF);
+	}
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+	fput(filep);		/* drop the fget() reference on all paths */
+
+	hlen = XFS_HSIZE(handle);
+	if (copy_to_user(hanp, &handle, (int)hlen))
+		return(EFAULT);
+	return(put_user(hlen, hlenp));
+}
+
+
+/* Enable events on an object. */
+
+int
+dm_set_eventlist(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_eventset_t	*eventsetp,
+	u_int		maxevent)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_eventset_t	eventset;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	if (copy_from_user(&eventset, eventsetp, sizeof(eventset)))
+		return(EFAULT);
+
+	/* Do some minor sanity checking. */
+
+	if (maxevent == 0 || maxevent > DM_EVENT_MAX)
+		return(EINVAL);
+
+	/* Access the specified object. */
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->set_eventlist(tdp->td_vp, tdp->td_right,
+		(tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0),
+		&eventset, maxevent);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+/* Return the list of enabled events for an object. */
+
+int
+dm_get_eventlist(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		nelem,
+	dm_eventset_t	*eventsetp,
+	u_int		*nelemp)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	dm_eventset_t	eventset;
+	u_int		elem;
+	int		error;
+
+	if (nelem == 0)
+		return(EINVAL);
+
+	/* Access the specified object. */
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	/* Get the object's event list. */
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->get_eventlist(tdp->td_vp, tdp->td_right,
+		(tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0),
+		nelem, &eventset, &elem);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	dm_app_put_tdp(tdp);
+
+	if (error)
+		return(error);
+
+	if (copy_to_user(eventsetp, &eventset, sizeof(eventset)))
+		return(EFAULT);
+	if (put_user(elem, nelemp))
+		return(EFAULT);
+	return(0);
+}
+
+
+/* Register for disposition of events.	The handle must either be the
+   global handle or must be the handle of a file system.  The list of events
+   is pointed to by eventsetp.
+*/
+
+int
+dm_set_disp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_eventset_t	*eventsetp,
+	u_int		maxevent)
+{
+	dm_session_t	*s;
+	dm_fsreg_t	*fsrp;
+	dm_tokdata_t	*tdp;
+	dm_eventset_t	eventset;
+	int		error;
+	unsigned long	lc1;		/* first lock cookie */
+	unsigned long	lc2;		/* second lock cookie */
+	u_int		i;
+
+	/* Copy in and validate the event mask.	 Only the lower maxevent bits
+	   are meaningful, so clear any bits set above maxevent.
+	*/
+
+	if (maxevent == 0 || maxevent > DM_EVENT_MAX)
+		return(EINVAL);
+	if (copy_from_user(&eventset, eventsetp, sizeof(eventset)))
+		return(EFAULT);
+	eventset &= (1 << maxevent) - 1;
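+	/* For example, maxevent == 3 yields the mask 0x7 here, so only
+	   event bits 0 through 2 can remain set in 'eventset'.
+	*/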
+
+	/* If the caller specified the global handle, then the only valid token
+	   is DM_NO_TOKEN, and the only valid event in the event mask is
+	   DM_EVENT_MOUNT.  If it is set, add the session to the list of
+	   sessions that want to receive mount events.	If it is clear, remove
+	   the session from the list.  Since DM_EVENT_MOUNT events never block
+	   waiting for a session to register, there is no one to wake up if we
+	   do add the session to the list.
+	*/
+
+	if (DM_GLOBALHAN(hanp, hlen)) {
+		if (token != DM_NO_TOKEN)
+			return(EINVAL);
+		if ((error = dm_find_session_and_lock(sid, &s, &lc1)) != 0)
+			return(error);
+		if (eventset == 0) {
+			s->sn_flags &= ~DM_SN_WANTMOUNT;
+			error = 0;
+		} else if (eventset == 1 << DM_EVENT_MOUNT) {
+			s->sn_flags |= DM_SN_WANTMOUNT;
+			error = 0;
+		} else {
+			error = EINVAL;
+		}
+		mutex_spinunlock(&s->sn_qlock, lc1);
+		return(error);
+	}
+
+	/* Since it's not the global handle, it had better be a filesystem
+	   handle.  Verify that the first 'maxevent' events in the event list
+	   are all valid for a filesystem handle.
+	*/
+
+	if (eventset & ~DM_VALID_DISP_EVENTS)
+		return(EINVAL);
+
+	/* Verify that the session is valid, that the handle is a filesystem
+	   handle, and that the filesystem is capable of sending events.  (If
+	   a dm_fsreg_t structure exists, then the filesystem can issue events.)
+	*/
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	fsrp = dm_find_fsreg_and_lock((fsid_t*)&tdp->td_handle.ha_fsid, &lc1);
+	if (fsrp == NULL) {
+		dm_app_put_tdp(tdp);
+		return(EINVAL);
+	}
+
+	/* Now that we own 'fsrp->fr_lock', get the lock on the session so that
+	   it can't disappear while we add it to the filesystem's event mask.
+	*/
+
+	if ((error = dm_find_session_and_lock(sid, &s, &lc2)) != 0) {
+		mutex_spinunlock(&fsrp->fr_lock, lc1);
+		dm_app_put_tdp(tdp);
+		return(error);
+	}
+
+	/* Update the event disposition array for this filesystem, adding
+	   and/or removing the session as appropriate.	If this session is
+	   dropping registration for DM_EVENT_DESTROY, or is overriding some
+	   other session's registration for DM_EVENT_DESTROY, then also clear
+	   any attr-on-destroy attribute name.
+	*/
+
+	for (i = 0; i < DM_EVENT_MAX; i++) {
+		if (DMEV_ISSET(i, eventset)) {
+			if (i == DM_EVENT_DESTROY && fsrp->fr_sessp[i] != s)
+				bzero(&fsrp->fr_rattr, sizeof(fsrp->fr_rattr));
+			fsrp->fr_sessp[i] = s;
+		} else if (fsrp->fr_sessp[i] == s) {
+			if (i == DM_EVENT_DESTROY)
+				bzero(&fsrp->fr_rattr, sizeof(fsrp->fr_rattr));
+			fsrp->fr_sessp[i] = NULL;
+		}
+	}
+	mutex_spinunlock(&s->sn_qlock, lc2);	/* reverse cookie order */
+
+	/* Wake up all processes waiting for a disposition on this filesystem
+	   in case any of them happen to be waiting for an event which we just
+	   added.
+	*/
+
+	if (fsrp->fr_dispcnt)
+		sv_broadcast(&fsrp->fr_dispq);
+
+	mutex_spinunlock(&fsrp->fr_lock, lc1);
+
+	dm_app_put_tdp(tdp);
+	return(0);
+}
+
+
+/*
+ *	Register a specific attribute name with a filesystem.  The value of
+ *	the attribute is to be returned with an asynchronous destroy event.
+ */
+
+int
+dm_set_return_on_destroy(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_attrname_t	*attrnamep,
+	dm_boolean_t	enable)
+{
+	dm_attrname_t	attrname;
+	dm_tokdata_t	*tdp;
+	dm_fsreg_t	*fsrp;
+	dm_session_t	*s;
+	int		error;
+	unsigned long	lc1;		/* first lock cookie */
+	unsigned long	lc2;		/* second lock cookie */
+
+	/* If a dm_attrname_t is provided, copy it in and validate it. */
+
+	if (enable && copy_from_user(&attrname, attrnamep, sizeof(attrname)))
+		return(EFAULT);
+
+	/* Validate the filesystem handle and use it to get the filesystem's
+	   disposition structure.
+	*/
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+
+	fsrp = dm_find_fsreg_and_lock((fsid_t*)&tdp->td_handle.ha_fsid, &lc1);
+	if (fsrp == NULL) {
+		dm_app_put_tdp(tdp);
+		return(EINVAL);
+	}
+
+	/* Now that we own 'fsrp->fr_lock', get the lock on the session so that
+	   it can't disappear while we add it to the filesystem's event mask.
+	*/
+
+	if ((error = dm_find_session_and_lock(sid, &s, &lc2)) != 0) {
+		mutex_spinunlock(&fsrp->fr_lock, lc1);
+		dm_app_put_tdp(tdp);
+		return(error);
+	}
+
+	/* A caller cannot disable return-on-destroy if he is not registered
+	   for DM_EVENT_DESTROY.  Enabling return-on-destroy is an implicit
+	   dm_set_disp() for DM_EVENT_DESTROY; we wake up all processes
+	   waiting for a disposition in case any was waiting for a
+	   DM_EVENT_DESTROY event.
+	*/
+
+	error = 0;
+	if (enable) {
+		fsrp->fr_sessp[DM_EVENT_DESTROY] = s;
+		fsrp->fr_rattr = attrname;
+		if (fsrp->fr_dispcnt)
+			sv_broadcast(&fsrp->fr_dispq);
+	} else if (fsrp->fr_sessp[DM_EVENT_DESTROY] != s) {
+		error = EINVAL;
+	} else {
+		bzero(&fsrp->fr_rattr, sizeof(fsrp->fr_rattr));
+	}
+	mutex_spinunlock(&s->sn_qlock, lc2);	/* reverse cookie order */
+	mutex_spinunlock(&fsrp->fr_lock, lc1);
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_get_mountinfo(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp)
+{
+	dm_fsreg_t	*fsrp;
+	dm_tokdata_t	*tdp;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+
+	/* Make sure that the caller's buffer is 8-byte aligned. */
+
+	if (((__psint_t)bufp & (sizeof(__u64) - 1)) != 0)
+		return(EFAULT);
+
+	/* Verify that the handle is a filesystem handle, and that the
+	   filesystem is capable of sending events.  If not, return an error.
+	*/
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_VFS,
+		DM_RIGHT_SHARED, &tdp);
+	if (error != 0)
+		return(error);
+
+	/* Find the filesystem entry.  This should always succeed as the
+	   dm_app_get_tdp call created a filesystem reference.	Once we find
+	   the entry, drop the lock.  The mountinfo message is never modified,
+	   the filesystem entry can't disappear, and we don't want to hold a
+	   spinlock while doing copyout calls.
+	*/
+
+	fsrp = dm_find_fsreg_and_lock((fsid_t*)&tdp->td_handle.ha_fsid, &lc);
+	if (fsrp == NULL) {
+		dm_app_put_tdp(tdp);
+		return(EINVAL);
+	}
+	mutex_spinunlock(&fsrp->fr_lock, lc);
+
+	/* Copy the message into the user's buffer and update his 'rlenp'. */
+
+	if (put_user(fsrp->fr_msgsize, rlenp)) {
+		error = EFAULT;
+	} else if (fsrp->fr_msgsize > buflen) { /* user buffer not big enough */
+		error = E2BIG;
+	} else if (copy_to_user(bufp, fsrp->fr_msg, fsrp->fr_msgsize)) {
+		error = EFAULT;
+	} else {
+		error = 0;
+	}
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_getall_disp(
+	dm_sessid_t	sid,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp)
+{
+	dm_session_t	*s;		/* pointer to session given by sid */
+	unsigned long	lc1;		/* first lock cookie */
+	unsigned long	lc2;		/* second lock cookie */
+	int		totalsize;
+	int		msgsize;
+	int		fsyscnt;
+	dm_dispinfo_t	*prevmsg;
+	dm_fsreg_t	*fsrp;
+	int		error;
+	char		*kbuf;
+
+	/* Because the dm_getall_disp structure contains a __u64 field,
+	   make sure that the buffer provided by the caller is aligned so
+	   that he can read such fields successfully.
+	*/
+
+	if (((__psint_t)bufp & (sizeof(__u64) - 1)) != 0)
+		return(EFAULT);
+
+	/* Compute the size of a dm_dispinfo structure, rounding up to an
+	   8-byte boundary so that any subsequent structures will also be
+	   aligned.
+	*/
+
+	msgsize = (sizeof(dm_dispinfo_t) + FSHSIZE + sizeof(__u64) - 1) &
+		~(sizeof(__u64) - 1);
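+	/* For instance, if sizeof(dm_dispinfo_t) + FSHSIZE came to 44 bytes,
+	   the expression above would round msgsize up to 48.
+	*/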
+
+	/* Loop until we can get the right amount of temp space, being careful
+	   not to hold a mutex during the allocation.  Usually only one trip.
+	*/
+
+	for (;;) {
+		if ((fsyscnt = dm_fsys_cnt) == 0) {
+			if (put_user(0, rlenp))
+				return(EFAULT);
+			return(0);
+		}
+		kbuf = kmalloc(fsyscnt * msgsize, GFP_KERNEL);
+		if (kbuf == NULL) {
+			printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+			return ENOMEM;
+		}
+
+		lc1 = mutex_spinlock(&dm_reg_lock);
+		if (fsyscnt == dm_fsys_cnt)
+			break;
+
+		mutex_spinunlock(&dm_reg_lock, lc1);
+		kfree(kbuf);
+	}
+
+	/* Find the indicated session and lock it. */
+
+	if ((error = dm_find_session_and_lock(sid, &s, &lc2)) != 0) {
+		mutex_spinunlock(&dm_reg_lock, lc1);
+		kfree(kbuf);
+		return(error);
+	}
+
+	/* Create a dm_dispinfo structure for each filesystem in which
+	   this session has at least one event selected for disposition.
+	*/
+
+	totalsize = 0;		/* total bytes to transfer to the user */
+	prevmsg = NULL;
+
+	for (fsrp = dm_registers; fsrp; fsrp = fsrp->fr_next) {
+		dm_dispinfo_t	*disp;
+		int		event;
+		int		found;
+
+		disp = (dm_dispinfo_t *)(kbuf + totalsize);
+
+		DMEV_ZERO(disp->di_eventset);
+
+		for (event = 0, found = 0; event < DM_EVENT_MAX; event++) {
+			if (fsrp->fr_sessp[event] != s)
+				continue;
+			DMEV_SET(event, disp->di_eventset);
+			found++;
+		}
+		if (!found)
+			continue;
+
+		disp->_link = 0;
+		disp->di_fshandle.vd_offset = sizeof(dm_dispinfo_t);
+		disp->di_fshandle.vd_length = FSHSIZE;
+
+		bcopy(&fsrp->fr_fsid,
+			(char *)disp + disp->di_fshandle.vd_offset,
+			disp->di_fshandle.vd_length);
+
+		if (prevmsg)
+			prevmsg->_link = msgsize;
+
+		prevmsg = disp;
+		totalsize += msgsize;
+	}
+	mutex_spinunlock(&s->sn_qlock, lc2);	/* reverse cookie order */
+	mutex_spinunlock(&dm_reg_lock, lc1);
+
+	if (put_user(totalsize, rlenp)) {
+		error = EFAULT;
+	} else if (totalsize > buflen) {	/* no more room */
+		error = E2BIG;
+	} else if (totalsize && copy_to_user(bufp, kbuf, totalsize)) {
+		error = EFAULT;
+	} else {
+		error = 0;
+	}
+
+	kfree(kbuf);
+	return(error);
+}
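+
+/* The buffer filled in by dm_getall_disp is a chain of dm_dispinfo_t
+   records, each immediately followed by its FSHSIZE-byte filesystem
+   handle.  Each record's _link field holds the byte offset to the next
+   record; the final record keeps _link == 0.
+*/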
+
+int
+dm_open_by_handle_rvp(
+	unsigned int	fd,
+	void		*hanp,
+	size_t		hlen,
+	int		flags,
+	int		*rvp)
+{
+	xfs_handle_t	handle;
+	int		error;
+	vnode_t		*vp;
+	short		td_type;
+	struct dentry	*dentry;
+	struct inode	*inodep;
+	int		new_fd;
+	struct file	*mfilp;
+	struct file	*filp;
+	struct list_head *lp;
+
+	if ((error = dm_copyin_handle(hanp, hlen, &handle)) != 0) {
+		return(error);
+	}
+
+	if ((vp = dm_handle_to_vp(&handle, &td_type)) == NULL) {
+		return(EBADF);
+	}
+	inodep = LINVFS_GET_IP(vp);
+	if ((td_type == DM_TDT_VFS) || (td_type == DM_TDT_OTH)) {
+		iput(inodep);
+		return(EBADF);
+	}
+
+	if ((new_fd = get_unused_fd()) < 0) {
+		iput(inodep);
+		return(EMFILE);
+	}
+
+	/* Now to find a dentry.  If possible, get a well-connected one. */
+	spin_lock(&dcache_lock);
+	for (lp = inodep->i_dentry.next; lp != &inodep->i_dentry; lp = lp->next) {
+		dentry = list_entry(lp, struct dentry, d_alias);
+		if (!(dentry->d_flags & DCACHE_NFSD_DISCONNECTED)) {
+			dget_locked(dentry);
+			dentry->d_vfs_flags |= DCACHE_REFERENCED;
+			spin_unlock(&dcache_lock);
+			iput(inodep);
+			goto found;
+		}
+	}
+	spin_unlock(&dcache_lock);
+
+	/* ELSE didn't find dentry.  Create anonymous dcache entry. */
+	dentry = d_alloc_root(inodep);
+	if (dentry == NULL) {
+		iput(inodep);
+		put_unused_fd(new_fd);
+		return(ENOMEM);
+	}
+	/* keep nfsd happy. */
+	dentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
+
+found:
+
+	if (inodep->i_ino != dentry->d_inode->i_ino) {
+		dput(dentry);
+		put_unused_fd(new_fd);
+		return(EINVAL);
+	}
+
+	mfilp = fget(fd);
+	if (!mfilp) {
+		dput(dentry);
+		put_unused_fd(new_fd);
+		return(EBADF);
+	}
+
+	mntget(mfilp->f_vfsmnt);
+
+	/* Create file pointer */
+	filp = dentry_open(dentry, mfilp->f_vfsmnt, flags);
+	if (IS_ERR(filp)) {
+		put_unused_fd(new_fd);
+		fput(mfilp);
+		return -PTR_ERR(filp);
+	}
+
+	if (td_type == DM_TDT_REG)
+		filp->f_mode |= FINVIS;
+	fd_install(new_fd, filp);
+	fput(mfilp);
+	*rvp = new_fd;
+	return 0;
+}
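+
+/* Note that dm_open_by_handle_rvp marks descriptors for regular files with
+   FINVIS, so that I/O through the returned fd is treated as DMAPI invisible
+   I/O (compare dm_read_invis) rather than generating further data events.
+*/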
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_right.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_right.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_right.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_right.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,1268 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "dmapi_private.h"
+
+
+#define DM_FG_STHREAD		0x001	/* keep other threads from using tdp */
+#define DM_FG_MUSTEXIST		0x002	/* handle must exist in the event */
+#define DM_FG_DONTADD		0x004	/* don't add handle if not in event */
+
+/* Get a handle of the form (void *, size_t) from user space and convert it to
+   a handle_t.	Do as much validation of the result as possible; any error
+   other than a bad address should return EBADF per the DMAPI spec.
+*/
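+
+/* The handle layout assumed below: an xfs_handle_t is { ha_fsid, ha_fid },
+   where ha_fid begins with a fid_len field.  A filesystem handle is the
+   ha_fsid portion alone (hlen == sizeof(handlep->ha_fsid)); a file object
+   handle also carries a fid whose fid_len must match the number of fid
+   bytes the caller actually supplied.
+*/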
+
+int
+dm_copyin_handle(
+	void		*hanp,		/* input,  handle data */
+	size_t		hlen,		/* input,  size of handle data */
+	xfs_handle_t	*handlep)	/* output, copy of data */
+{
+	u_short		len;
+	fid_t		*fidp;
+
+	fidp = (fid_t*)&handlep->ha_fid;
+
+	if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
+		return(EBADF);
+
+	if (copy_from_user(handlep, hanp, hlen))
+		return(EFAULT);
+
+	if (hlen < sizeof(*handlep))
+		bzero((char *)handlep + hlen, sizeof(*handlep) - hlen);
+
+	if (hlen == sizeof(handlep->ha_fsid))
+		return(0);	/* FS handle, nothing more to check */
+
+	len = hlen - sizeof(handlep->ha_fsid) - sizeof(fidp->fid_len);
+
+	if (fidp->fid_len != len ||
+	    *((short *) fidp->fid_data)) {
+		return(EBADF);
+	}
+	return(0);
+}
+
+/* Allocate and initialize a tevp structure.  Called from both application and
+   event threads.
+*/
+
+static dm_tokevent_t *
+dm_init_tevp(
+	int		ev_size,	/* size of event structure */
+	int		var_size)	/* size of variable-length data */
+{
+	dm_tokevent_t	*tevp;
+	int		msgsize;
+
+	/* Calculate the size of the event in bytes and allocate memory for it.
+	   Zero all but the variable portion of the message, which will
+	   eventually be overlaid by the caller with data.
+	*/
+
+	msgsize = offsetof(dm_tokevent_t, te_event) + ev_size + var_size;
+	tevp = kmalloc(msgsize, GFP_KERNEL);
+	if (tevp == NULL) {
+		printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	bzero(tevp, msgsize - var_size);
+
+	/* Now initialize all the non-zero fields. */
+
+	spinlock_init(&tevp->te_lock, "te_lock");
+	sv_init(&tevp->te_evt_queue, SV_DEFAULT, "te_evt_queue");
+	sv_init(&tevp->te_app_queue, SV_DEFAULT, "te_app_queue");
+	tevp->te_allocsize = msgsize;
+	tevp->te_msg.ev_type = DM_EVENT_INVALID;
+	tevp->te_flags = 0;
+
+	return(tevp);
+}
+
+
+/* Given the event type and the number of bytes of variable length data that
+   will follow the event, dm_evt_create_tevp() creates a dm_tokevent_t
+   structure to hold the event and initializes all the common event fields.
+
+   No locking is required for this routine because the caller is an event
+   thread, and is therefore the only thread that can see the event.
+*/
+
+dm_tokevent_t *
+dm_evt_create_tevp(
+	dm_eventtype_t	event,
+	int		variable_size,
+	void		**msgpp)
+{
+	dm_tokevent_t	*tevp;
+	int		evsize;
+
+	switch (event) {
+	case DM_EVENT_READ:
+	case DM_EVENT_WRITE:
+	case DM_EVENT_TRUNCATE:
+		evsize = sizeof(dm_data_event_t);
+		break;
+
+	case DM_EVENT_DESTROY:
+		evsize = sizeof(dm_destroy_event_t);
+		break;
+
+	case DM_EVENT_MOUNT:
+		evsize = sizeof(dm_mount_event_t);
+		break;
+
+	case DM_EVENT_PREUNMOUNT:
+	case DM_EVENT_UNMOUNT:
+	case DM_EVENT_NOSPACE:
+	case DM_EVENT_CREATE:
+	case DM_EVENT_REMOVE:
+	case DM_EVENT_RENAME:
+	case DM_EVENT_SYMLINK:
+	case DM_EVENT_LINK:
+	case DM_EVENT_POSTCREATE:
+	case DM_EVENT_POSTREMOVE:
+	case DM_EVENT_POSTRENAME:
+	case DM_EVENT_POSTSYMLINK:
+	case DM_EVENT_POSTLINK:
+	case DM_EVENT_ATTRIBUTE:
+	case DM_EVENT_DEBUT:		/* currently not supported */
+	case DM_EVENT_CLOSE:		/* currently not supported */
+		evsize = sizeof(dm_namesp_event_t);
+		break;
+
+	case DM_EVENT_CANCEL:		/* currently not supported */
+		evsize = sizeof(dm_cancel_event_t);
+		break;
+
+	case DM_EVENT_USER:
+		evsize = 0;
+		break;
+
+	default:
+		panic("dm_create_tevp: called with unknown event type %d\n",
+			event);
+	}
+
+	/* Allocate and initialize an event structure of the correct size. */
+
+	tevp = dm_init_tevp(evsize, variable_size);
+	if (tevp == NULL)
+		return NULL;
+	tevp->te_evt_ref = 1;
+
+	/* Fields ev_token, ev_sequence, and _link are all filled in when the
+	   event is queued onto a session.  Initialize all other fields here.
+	*/
+
+	tevp->te_msg.ev_type = event;
+	tevp->te_msg.ev_data.vd_offset = offsetof(dm_tokevent_t, te_event) -
+		offsetof(dm_tokevent_t, te_msg);
+	tevp->te_msg.ev_data.vd_length = evsize + variable_size;
+
+	/* Give the caller a pointer to the event-specific structure. */
+
+	*msgpp = ((char *)&tevp->te_msg + tevp->te_msg.ev_data.vd_offset);
+	return(tevp);
+}
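+
+/* A minimal sketch of the event-thread usage pattern (see
+   dm_waitfor_destroy_attrname for a live caller):
+
+	void		*msgp;
+	dm_tokevent_t	*tevp;
+
+	tevp = dm_evt_create_tevp(DM_EVENT_DESTROY, 1, (void**)&msgp);
+	...fill in the event-specific structure through msgp...
+	dm_evt_rele_tevp(tevp, 0);
+*/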
+
+
+/* Given a pointer to an event (tevp) and a pointer to a handle_t, look for a
+   tdp structure within the event which contains the handle_t.	Either verify
+   that the event contains the tdp, or optionally add the tdp to the
+   event.  Called only from application threads.
+
+   On entry, tevp->te_lock is held; it is dropped prior to return.
+*/
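+
+/* In summary, per the code below: DM_FG_MUSTEXIST fails with EACCES if the
+   handle is not already part of the event; DM_FG_DONTADD returns success
+   with a NULL *tdpp rather than adding the handle; DM_FG_STHREAD grants
+   the caller temporary single-threaded access to the tdp.
+*/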
+
+static int
+dm_app_lookup_tdp(
+	xfs_handle_t	*handlep,	/* the handle we are looking for */
+	dm_tokevent_t	*tevp,		/* the event to search for the handle */
+	unsigned long	*lcp,		/* address of active lock cookie */
+	short		types,		/* acceptable object types */
+	dm_right_t	right,		/* minimum right the object must have */
+	u_int		flags,
+	dm_tokdata_t	**tdpp)		/* if ! NULL, pointer to matching tdp */
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	vnode_t		*vp;
+	int		error;
+
+	/* Bump the tevp application reference counter so that the event
+	   can't disappear in case we have to drop the lock for a while.
+	*/
+
+	tevp->te_app_ref++;
+	*tdpp = NULL;		/* assume failure */
+
+	for (;;) {
+		/* Look for a matching tdp in the tevp. */
+
+		for (tdp = tevp->te_tdp; tdp; tdp = tdp->td_next) {
+			if (XFS_HANDLE_CMP(&tdp->td_handle, handlep) == 0)
+				break;
+		}
+
+		/* If the tdp exists, but either we need single-thread access
+		   to the handle and can't get it, or some other thread already
+		   has single-thread access, then sleep until we can try again.
+		*/
+
+		if (tdp != NULL && tdp->td_app_ref &&
+		    ((flags & DM_FG_STHREAD) ||
+		     (tdp->td_flags & DM_TDF_STHREAD))) {
+			tevp->te_app_slp++;
+			sv_wait(&tevp->te_app_queue, 1,
+				&tevp->te_lock, *lcp);
+			*lcp = mutex_spinlock(&tevp->te_lock);
+			tevp->te_app_slp--;
+			continue;
+		}
+
+		if (tdp != NULL &&
+		    (tdp->td_vcount > 0 || tdp->td_flags & DM_TDF_EVTREF)) {
+			/* We have an existing tdp with a non-zero vnode
+			   reference count.  If it's the wrong type, return
+			   an appropriate errno.
+			*/
+
+			if (!(tdp->td_type & types)) {
+				mutex_spinunlock(&tevp->te_lock, *lcp);
+				dm_put_tevp(tevp, NULL); /* no destroy events */
+				return(EOPNOTSUPP);
+			}
+
+			/* If the current access right isn't high enough,
+			   complain.
+			*/
+
+			if (tdp->td_right < right) {
+				mutex_spinunlock(&tevp->te_lock, *lcp);
+				dm_put_tevp(tevp, NULL); /* no destroy events */
+				return(EACCES);
+			}
+
+			/* The handle is acceptable.  Increment the tdp
+			   application and vnode references and mark the tdp
+			   as single-threaded if necessary.
+			*/
+
+			tdp->td_app_ref++;
+			if (flags & DM_FG_STHREAD)
+				tdp->td_flags |= DM_TDF_STHREAD;
+			tdp->td_vcount++;
+
+			fsys_vector = dm_fsys_vector(tdp->td_vp);
+			(void)fsys_vector->obj_ref_hold(tdp->td_vp);
+
+			mutex_spinunlock(&tevp->te_lock, *lcp);
+			*tdpp = tdp;
+			return(0);
+		}
+
+		/* If the tdp is not in the tevp or does not have a vnode
+		   reference, check to make sure it is okay to add/update it.
+		*/
+
+		if (flags & DM_FG_MUSTEXIST) {
+			mutex_spinunlock(&tevp->te_lock, *lcp);
+			dm_put_tevp(tevp, NULL);	/* no destroy events */
+			return(EACCES);		/* i.e. an insufficient right */
+		}
+		if (flags & DM_FG_DONTADD) {
+			tevp->te_app_ref--;
+			mutex_spinunlock(&tevp->te_lock, *lcp);
+			return(0);
+		}
+
+		/* If a tdp structure doesn't yet exist, create one and link
+		   it into the tevp.  Drop the lock while we are doing this as
+		   the allocation can sleep.  Once we have the memory, make
+		   sure that another thread didn't simultaneously add the same
+		   handle to the same event.  If so, toss ours and start over.
+		*/
+
+		if (tdp == NULL) {
+			dm_tokdata_t	*tmp;
+
+			mutex_spinunlock(&tevp->te_lock, *lcp);
+
+			tdp = kmem_cache_alloc(dm_tokdata_cachep, SLAB_KERNEL);
+			if (tdp == NULL){
+				printk("%s/%d: kmem_cache_alloc(dm_tokdata_cachep) returned NULL\n", __FUNCTION__, __LINE__);
+				return(ENOMEM);
+			}
+			memset(tdp, 0, sizeof(*tdp));
+
+			*lcp = mutex_spinlock(&tevp->te_lock);
+
+			for (tmp = tevp->te_tdp; tmp; tmp = tmp->td_next) {
+				if (XFS_HANDLE_CMP(&tmp->td_handle, handlep) == 0)
+					break;
+			}
+			if (tmp) {
+				kmem_cache_free(dm_tokdata_cachep, tdp);
+				continue;
+			}
+
+			tdp->td_next = tevp->te_tdp;
+			tevp->te_tdp = tdp;
+			tdp->td_tevp = tevp;
+			tdp->td_handle = *handlep;
+		}
+
+		/* Temporarily single-thread access to the tdp so that other
+		   threads don't touch it while we are filling the rest of the
+		   fields in.
+		*/
+
+		tdp->td_app_ref = 1;
+		tdp->td_flags |= DM_TDF_STHREAD;
+
+		/* Drop the spinlock while we access, validate, and obtain the
+		   proper rights to the object.	 This can take a very long time
+		   if the vnode is not in memory, if the filesystem is
+		   unmounting,	or if the request_right() call should block
+		   because some other tdp or kernel thread is holding a right.
+		*/
+
+		mutex_spinunlock(&tevp->te_lock, *lcp);
+
+		if ((vp = dm_handle_to_vp(handlep, &tdp->td_type)) == NULL) {
+			error = EBADF;
+		} else {
+			tdp->td_vcount = 1;
+			tdp->td_vp = vp;
+
+			/* The handle is usable.  Check that the type of the
+			   object matches one of the types that the caller
+			   will accept.
+			*/
+
+			if (!(types & tdp->td_type)) {
+				error = EOPNOTSUPP;
+			} else if (right > DM_RIGHT_NULL) {
+				/* Attempt to get the rights required by the
+				   caller.  If rights can't be obtained, return
+				   an error.
+				*/
+
+				VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+				fsys_vector = dm_fsys_vector(tdp->td_vp);
+				error = fsys_vector->request_right(tdp->td_vp,
+					DM_RIGHT_NULL,
+					(tdp->td_type == DM_TDT_VFS ?
+					DM_FSYS_OBJ : 0),
+					DM_RR_WAIT, right);
+				VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+				if (!error) {
+					tdp->td_right = right;
+				}
+			} else {
+				error = 0;
+			}
+		}
+		if (error != 0) {
+			dm_put_tevp(tevp, tdp); /* destroy event risk, although tiny */
+			return(error);
+		}
+
+		*lcp = mutex_spinlock(&tevp->te_lock);
+
+		/* Wake up any threads which may have seen our tdp while we
+		   were filling it in.
+		*/
+
+		if (!(flags & DM_FG_STHREAD)) {
+			tdp->td_flags &= ~DM_TDF_STHREAD;
+			if (tevp->te_app_slp)
+				sv_broadcast(&tevp->te_app_queue);
+		}
+
+		mutex_spinunlock(&tevp->te_lock, *lcp);
+		*tdpp = tdp;
+		return(0);
+	}
+}
+
+
+/* dm_app_get_tdp_by_token() is called whenever the application request
+   contains a session ID and contains a token other than DM_NO_TOKEN.
+   Most of the callers provide a right that is either DM_RIGHT_SHARED or
+   DM_RIGHT_EXCL, but a few of the callers such as dm_obj_ref_hold() may
+   specify a right of DM_RIGHT_NULL.
+*/
+
+static int
+dm_app_get_tdp_by_token(
+	dm_sessid_t	sid,		/* an existing session ID */
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,		/* an existing token */
+	short		types,		/* acceptable object types */
+	dm_right_t	right,		/* minimum right the object must have */
+	u_int		flags,
+	dm_tokdata_t	**tdpp)
+{
+	dm_tokevent_t	*tevp;
+	xfs_handle_t	handle;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+
+	if (right < DM_RIGHT_NULL || right > DM_RIGHT_EXCL)
+		return(EINVAL);
+
+	if ((error = dm_copyin_handle(hanp, hlen, &handle)) != 0)
+		return(error);
+
+	/* Find and lock the event which corresponds to the specified
+	   session/token pair.
+	*/
+
+	if ((error = dm_find_msg_and_lock(sid, token, &tevp, &lc)) != 0)
+		return(error);
+
+	return(dm_app_lookup_tdp(&handle, tevp, &lc, types,
+		right, flags, tdpp));
+}
+
+
+/* Function dm_app_get_tdp() must ONLY be called from routines associated with
+   application calls, e.g. dm_read_invis, dm_set_disp, etc.  It must not be
+   called by a thread responsible for generating an event such as
+   dm_send_data_event()!
+
+   dm_app_get_tdp() is the interface used by all application calls other than
+   dm_get_events, dm_respond_event, dm_get_config, dm_get_config_events, and by
+   the dm_obj_ref_* and dm_*_right families of requests.
+
+   dm_app_get_tdp() converts a sid/hanp/hlen/token quad into a tdp pointer,
+   increments the number of active application threads in the event, and
+   increments the number of active application threads using the tdp.  The
+   'right' parameter must be either DM_RIGHT_SHARED or DM_RIGHT_EXCL.  The
+   token may either be DM_NO_TOKEN, or can be a token received in a synchronous
+   event.
+*/
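+
+/* The typical application-thread calling sequence, as used for example by
+   dm_set_eventlist:
+
+	error = dm_app_get_tdp(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_EXCL, &tdp);
+	if (error != 0)
+		return(error);
+	...operate on tdp->td_vp...
+	dm_app_put_tdp(tdp);
+*/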
+
+int
+dm_app_get_tdp(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	short		types,
+	dm_right_t	right,		/* minimum right */
+	dm_tokdata_t	**tdpp)
+{
+	dm_session_t	*s;
+	xfs_handle_t	handle;
+	dm_tokevent_t	*tevp;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+
+	ASSERT(right >= DM_RIGHT_SHARED);
+
+	/* If a token other than DM_NO_TOKEN is specified, find the event on
+	   this session which owns the token and increment its reference count.
+	*/
+
+	if (token != DM_NO_TOKEN) {	/* look up existing tokevent struct */
+		return(dm_app_get_tdp_by_token(sid, hanp, hlen, token, types,
+			right, DM_FG_MUSTEXIST, tdpp));
+	}
+
+	/* The token is DM_NO_TOKEN.  In this case we only want to verify that
+	   the session ID is valid, and do not need to continue holding the
+	   session lock after we know that to be true.
+	*/
+
+	if ((error = dm_copyin_handle(hanp, hlen, &handle)) != 0)
+		return(error);
+
+	if ((error = dm_find_session_and_lock(sid, &s, &lc)) != 0)
+		return(error);
+	mutex_spinunlock(&s->sn_qlock, lc);
+
+	/* When DM_NO_TOKEN is used, we simply block until we can obtain the
+	   right that we want (since the tevp contains no tdp structures).
+	   When blocking is eventually supported, it will occur within
+	   fsys_vector->request_right().
+	*/
+
+	tevp = dm_init_tevp(0, 0);
+	if (tevp == NULL)
+		return(ENOMEM);
+	lc = mutex_spinlock(&tevp->te_lock);
+
+	return(dm_app_lookup_tdp(&handle, tevp, &lc, types, right, 0, tdpp));
+}
+
+
+/* dm_get_config_tdp() is only called by dm_get_config() and
+   dm_get_config_events(), which have neither a session ID nor a token.
+   Both of these calls are supposed to work even if the filesystem is in the
+   process of being mounted, as long as the caller only uses handles within
+   the mount event.
+*/
+
+int
+dm_get_config_tdp(
+	void		*hanp,
+	size_t		hlen,
+	dm_tokdata_t	**tdpp)
+{
+	xfs_handle_t	handle;
+	dm_tokevent_t	*tevp;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+
+	if ((error = dm_copyin_handle(hanp, hlen, &handle)) != 0)
+		return(error);
+
+	tevp = dm_init_tevp(0, 0);
+	if (tevp == NULL)
+		return(ENOMEM);
+	lc = mutex_spinlock(&tevp->te_lock);
+
+	/* Try to use the handle provided by the caller and assume DM_NO_TOKEN.
+	   This will fail if the filesystem is in the process of being mounted.
+	*/
+
+	error = dm_app_lookup_tdp(&handle, tevp, &lc, DM_TDT_ANY,
+		DM_RIGHT_NULL, 0, tdpp);
+
+	if (!error) {
+		return(0);
+	}
+
+	/* Perhaps the filesystem is still mounting, in which case we need to
+	   see if this is one of the handles in the DM_EVENT_MOUNT tevp.
+	*/
+
+	if ((tevp = dm_find_mount_tevp_and_lock((fsid_t*)&handle.ha_fsid, &lc)) == NULL)
+		return(EBADF);
+
+	return(dm_app_lookup_tdp(&handle, tevp, &lc, DM_TDT_ANY,
+		DM_RIGHT_NULL, DM_FG_MUSTEXIST, tdpp));
+}
+
+
+/* dm_put_tdp() is called to release any right held on the vnode, and to
+   VN_RELE() all references held on the vnode.	It is the caller's
+   responsibility to ensure that no other application threads are using the
+   tdp, and if necessary to unlink the tdp from the tevp before calling
+   this routine and to free the tdp afterwards.
+*/
+
+static void
+dm_put_tdp(
+	dm_tokdata_t	*tdp)
+{
+	ASSERT(tdp->td_app_ref <= 1);
+
+	/* If the application thread is holding a right, or if the event
+	   thread had a right but it has disappeared because of a dm_pending
+	   or Cntl-C, then we need to release it here.
+	*/
+
+	if (tdp->td_right != DM_RIGHT_NULL) {
+		dm_fsys_vector_t *fsys_vector;
+
+		VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+		fsys_vector = dm_fsys_vector(tdp->td_vp);
+		(void)fsys_vector->release_right(tdp->td_vp, tdp->td_right,
+			(tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0));
+		VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+		tdp->td_right = DM_RIGHT_NULL;
+	}
+
+	/* Given that we wouldn't be here if there was still an event thread,
+	   this VN_RELE loop has the potential of generating a DM_EVENT_DESTROY
+	   event if some other thread has unlinked the file.
+	*/
+
+	while (tdp->td_vcount > 0) {
+		iput(LINVFS_GET_IP(tdp->td_vp));
+		tdp->td_vcount--;
+	}
+
+	tdp->td_flags &= ~(DM_TDF_HOLD|DM_TDF_RIGHT);
+	tdp->td_vp = NULL;
+}
+
+
+/* Function dm_put_tevp() must ONLY be called from routines associated with
+   application threads, e.g. dm_read_invis, dm_get_events, etc.	 It must not be
+   called by a thread responsible for generating an event, such as
+   dm_send_data_event.
+
+   PLEASE NOTE: It is possible for this routine to generate DM_EVENT_DESTROY
+   events, because its calls to dm_put_tdp drop vnode references, and another
+   thread may have already unlinked a file whose vnode we are de-referencing.
+   This sets the stage for various types of deadlock if the thread calling
+   dm_put_tevp is the same thread that calls dm_respond_event!	In particular,
+   the dm_send_destroy_event routine needs to obtain the dm_reg_lock,
+   dm_session_lock, and sn_qlock in order to queue the destroy event.  No
+   caller of dm_put_tevp can hold any of these locks!
+
+   Other possible deadlocks are that dm_send_destroy_event could block waiting
+   for a thread to register for the event using	 dm_set_disp() and/or
+   dm_set_return_on_destroy, or it could block because the session's sn_newq
+   is at the dm_max_queued_msgs event limit.  The only safe solution
+   (unimplemented) is to have a separate kernel thread for each filesystem
+   whose only job is to do the vnode-dereferencing.  That way dm_respond_event
+   will not block, so the application can keep calling dm_get_events to read
+   events even if the filesystem thread should block.  (If the filesystem
+   thread blocks, so will all subsequent destroy events for the same
+   filesystem.)
+*/
+
+void
+dm_put_tevp(
+	dm_tokevent_t	*tevp,
+	dm_tokdata_t	*tdp)
+{
+	int		free_tdp = 0;
+	unsigned long	lc;		/* lock cookie */
+
+	lc = mutex_spinlock(&tevp->te_lock);
+
+	if (tdp != NULL) {
+		if (tdp->td_vcount > 1 || (tdp->td_flags & DM_TDF_EVTREF)) {
+			ASSERT(tdp->td_app_ref > 0);
+
+			iput(LINVFS_GET_IP(tdp->td_vp));
+			tdp->td_vcount--;
+		} else {
+			ASSERT(tdp->td_app_ref == 1);
+
+			/* The vnode reference count is either already at
+			   zero (e.g. a failed dm_handle_to_vp() call in
+			   dm_app_lookup_tdp()) or is going to zero.  We can't
+			   hold the lock while we decrement the count because
+			   we could potentially end up being busy for a long
+			   time in VOP_INACTIVATE.  Use single-threading to
+			   lock others out while we clean house.
+			*/
+
+			tdp->td_flags |= DM_TDF_STHREAD;
+
+			/* WARNING - A destroy event is possible here if we are
+			   giving up the last reference on a vnode which has
+			   been previously unlinked by some other thread!
+			*/
+
+			mutex_spinunlock(&tevp->te_lock, lc);
+			dm_put_tdp(tdp);
+			lc = mutex_spinlock(&tevp->te_lock);
+
+			/* If this tdp is not one of the original tdps in the
+			   event, then remove it from the tevp.
+			*/
+
+			if (!(tdp->td_flags & DM_TDF_ORIG)) {
+				dm_tokdata_t	**tdpp = &tevp->te_tdp;
+
+				while (*tdpp && *tdpp != tdp) {
+					tdpp = &(*tdpp)->td_next;
+				}
+				if (*tdpp == NULL) {
+					panic("dm_remove_tdp_from_tevp: tdp "
+						"%p not in tevp %p\n", tdp,
+						tevp);
+				}
+				*tdpp = tdp->td_next;
+				free_tdp++;
+			}
+		}
+
+		/* If this is the last app thread actively using the tdp, clear
+		   any single-threading and wake up any other app threads who
+		   might be waiting to use this tdp, single-threaded or
+		   otherwise.
+		*/
+
+		if (--tdp->td_app_ref == 0) {
+			if (tdp->td_flags & DM_TDF_STHREAD) {
+				tdp->td_flags &= ~DM_TDF_STHREAD;
+				if (tevp->te_app_slp)
+					sv_broadcast(&tevp->te_app_queue);
+			}
+		}
+
+		if (free_tdp) {
+			kmem_cache_free(dm_tokdata_cachep, tdp);
+		}
+	}
+
+	/* If other application threads are using this token/event, they will
+	   do the cleanup.
+	*/
+
+	if (--tevp->te_app_ref > 0) {
+		mutex_spinunlock(&tevp->te_lock, lc);
+		return;
+	}
+
+	/* If event generation threads are waiting for this thread to go away,
+	   wake them up and let them do the cleanup.
+	*/
+
+	if (tevp->te_evt_ref > 0) {
+		sv_broadcast(&tevp->te_evt_queue);
+		mutex_spinunlock(&tevp->te_lock, lc);
+		return;
+	}
+
+	/* This thread is the last active thread using the token/event.	 No
+	   lock can be held while we disassemble the tevp because we could
+	   potentially end up being busy for a long time in VOP_INACTIVATE.
+	*/
+
+	mutex_spinunlock(&tevp->te_lock, lc);
+
+	/* WARNING - One or more destroy events are possible here if we are
+	   giving up references on vnodes which have been previously unlinked
+	   by other kernel threads!
+	*/
+
+	while ((tdp = tevp->te_tdp) != NULL) {
+		tevp->te_tdp = tdp->td_next;
+		dm_put_tdp(tdp);
+		kmem_cache_free(dm_tokdata_cachep, tdp);
+	}
+	spinlock_destroy(&tevp->te_lock);
+	sv_destroy(&tevp->te_evt_queue);
+	sv_destroy(&tevp->te_app_queue);
+	kfree(tevp);
+}
+
+
+/* No caller of dm_app_put_tdp can hold any of the locks dm_reg_lock,
+   dm_session_lock, or any sn_qlock!  (See dm_put_tevp for details.)
+*/
+
+void
+dm_app_put_tdp(
+	dm_tokdata_t	*tdp)
+{
+	dm_put_tevp(tdp->td_tevp, tdp);
+}
+
+
+/* dm_change_right is only called if the event thread is the one doing the
+   cleanup on a completed event.  It looks at the current rights of a tdp
+   and compares that with the rights it had on the tdp when the event was
+   created.  If different, it reacquires the original rights, then transfers
+   the rights back to being thread-based.
+*/
+
+static void
+dm_change_right(
+	dm_tokdata_t	*tdp)
+{
+#ifdef HAVE_DMAPI_RIGHTS
+	dm_fsys_vector_t *fsys_vector;
+	int		error;
+	u_int		type;
+#endif
+
+	/* If the event doesn't have a vnode reference, if the original right
+	   was DM_RIGHT_NULL, or if the rights were never switched from being
+	   thread-based to tdp-based, then there is nothing to do.
+	*/
+
+	if (!(tdp->td_flags & DM_TDF_EVTREF))
+		return;
+
+	if (tdp->td_orig_right == DM_RIGHT_NULL)
+		return;
+
+	/* DEBUG - Need a check here for event-based rights. */
+
+#ifdef HAVE_DMAPI_RIGHTS
+	/* The "rights" vectors are stubs now anyway.  When they are
+	 * implemented then bhv locking will have to be sorted out.
+	 */
+
+	/* If the current right is not the same as it was when the event was
+	   created, first get back the original right.
+	*/
+
+	if (tdp->td_right != tdp->td_orig_right) {
+		fsys_vector = dm_fsys_vector(tdp->td_vp);
+		type = (tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0);
+
+		switch (tdp->td_orig_right) {
+		case DM_RIGHT_SHARED:
+			if (tdp->td_right == DM_RIGHT_EXCL) {
+				error = fsys_vector->downgrade_right(
+					tdp->td_vp, tdp->td_right, type);
+				if (!error)
+					break;
+				(void)fsys_vector->release_right(tdp->td_vp,
+					tdp->td_right, type);
+			}
+			(void)fsys_vector->request_right(tdp->td_vp,
+				tdp->td_right, type, DM_RR_WAIT,
+				tdp->td_orig_right);
+			break;
+
+		case DM_RIGHT_EXCL:
+			if (tdp->td_right == DM_RIGHT_SHARED) {
+				error = fsys_vector->upgrade_right(tdp->td_vp,
+					tdp->td_right, type);
+				if (!error)
+					break;
+				(void)fsys_vector->release_right(tdp->td_vp,
+					tdp->td_right, type);
+			}
+			(void)fsys_vector->request_right(tdp->td_vp,
+				tdp->td_right, type, DM_RR_WAIT,
+				tdp->td_orig_right);
+			break;
+		case DM_RIGHT_NULL:
+			break;
+		}
+	}
+#endif
+
+	/* We now have back the same level of rights as we had when the event
+	   was generated.  Now transfer the rights from being tdp-based back
+	   to thread-based.
+	*/
+
+	/* DEBUG - Add a call here to transfer rights back to thread-based. */
+
+	/* Finally, update the tdp so that we don't mess with the rights when
+	   we eventually call dm_put_tdp.
+	*/
+
+	tdp->td_right = DM_RIGHT_NULL;
+}
+
+
+/* This routine is only called by event threads.  The calls to dm_put_tdp
+   are not a deadlock risk here because this is an event thread, and it is
+   okay for such a thread to block on an induced destroy event.	 Okay, maybe
+   there is a slight risk; say that the event contains three vnodes all of
+   which have DM_RIGHT_EXCL, and say that we are at the dm_max_queued_msgs
+   limit, and that the first vnode is already unlinked.	 In that case the
+   destroy event will block waiting to be queued, and the application thread
+   could happen to reference one of the other locked vnodes.  Deadlock.
+*/
+
+void
+dm_evt_rele_tevp(
+	dm_tokevent_t	*tevp,
+	int		droprights)	/* non-zero, evt thread loses rights */
+{
+	dm_tokdata_t	*tdp;
+	unsigned long	lc;		/* lock cookie */
+
+	lc = mutex_spinlock(&tevp->te_lock);
+
+	/* If we are here without DM_TEF_FINAL set and with at least one
+	   application reference still remaining, then one of several
+	   possibilities is true:
+	   1. This is an asynchronous event which has been queued but has not
+	      yet been delivered, or which is in the process of being delivered.
+	   2. This is an unmount event (pseudo-asynchronous) yet to be
+	      delivered or in the process of being delivered.
+	   3. This event had DM_FLAGS_NDELAY specified, and the application
+	      has sent a dm_pending() reply for the event.
+	   4. This is a DM_EVENT_READ, DM_EVENT_WRITE, or DM_EVENT_TRUNCATE
+	      event and the user typed a Cntl-C.
+	   In all of these cases, the correct behavior is to leave the
+	   responsibility of releasing any rights to the application threads
+	   when they are done.
+	*/
+
+	if (tevp->te_app_ref > 0 && !(tevp->te_flags & DM_TEF_FINAL)) {
+		tevp->te_evt_ref--;
+		for (tdp = tevp->te_tdp; tdp; tdp = tdp->td_next) {
+			if (tdp->td_flags & DM_TDF_EVTREF) {
+				tdp->td_flags &= ~DM_TDF_EVTREF;
+				if (tdp->td_vcount == 0) {
+					tdp->td_vp = NULL;
+				}
+			}
+		}
+		mutex_spinunlock(&tevp->te_lock, lc);
+		return;		/* not the last thread */
+	}
+
+	/* If the application reference count is non-zero here, that can only
+	   mean that dm_respond_event() has been called, but the application
+	   still has one or more threads in the kernel that haven't let go of
+	   the tevp.  In these cases, the event thread must wait until all
+	   application threads have given up their references, and their
+	   rights to handles within the event.
+	*/
+
+	while (tevp->te_app_ref) {
+		sv_wait(&tevp->te_evt_queue, 1, &tevp->te_lock, lc);
+		lc = mutex_spinlock(&tevp->te_lock);
+	}
+
+	/* This thread is the last active thread using the token/event.	 Reset
+	   the rights of any vnode that was part of the original event back
+	   to their initial values before returning to the filesystem.	The
+	   exception is if the event failed (droprights is non-zero), in which
+	   case we chose to return to the filesystem with all rights released.
+	   Release the rights on any vnode that was not part of the original
+	   event.  Give up all remaining application vnode references
+	   regardless of whether or not the vnode was part of the original
+	   event.
+	*/
+
+	mutex_spinunlock(&tevp->te_lock, lc);
+
+	while ((tdp = tevp->te_tdp) != NULL) {
+		tevp->te_tdp = tdp->td_next;
+		if ((tdp->td_flags & DM_TDF_ORIG) &&
+		    (tdp->td_flags & DM_TDF_EVTREF) &&
+		    (!droprights)) {
+			dm_change_right(tdp);
+		}
+		dm_put_tdp(tdp);
+		kmem_cache_free(dm_tokdata_cachep, tdp);
+	}
+	spinlock_destroy(&tevp->te_lock);
+	sv_destroy(&tevp->te_evt_queue);
+	sv_destroy(&tevp->te_app_queue);
+	kfree(tevp);
+}
+
+
+/* dm_obj_ref_hold() is just a fancy way to get a vnode reference on an object
+   to hold it in kernel memory.
+*/
+
+int
+dm_obj_ref_hold(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_NULL, DM_FG_STHREAD, &tdp);
+
+	/* The tdp is single-threaded, so no mutex lock needed for update. */
+
+	if (error == 0) {
+		if (tdp->td_flags & DM_TDF_HOLD) {	/* if already held */
+			error = EBUSY;
+		} else {
+			tdp->td_flags |= DM_TDF_HOLD;
+			tdp->td_vcount++;
+
+			fsys_vector = dm_fsys_vector(tdp->td_vp);
+			(void)fsys_vector->obj_ref_hold(tdp->td_vp);
+		}
+		dm_app_put_tdp(tdp);
+	}
+	return(error);
+}
+
+
+int
+dm_obj_ref_rele(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen)
+{
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_NULL, DM_FG_MUSTEXIST|DM_FG_STHREAD, &tdp);
+
+	/* The tdp is single-threaded, so no mutex lock needed for update. */
+
+	if (error == 0) {
+		if (!(tdp->td_flags & DM_TDF_HOLD)) {	/* if not held */
+			error = EACCES; /* use the DM_FG_MUSTEXIST errno */
+		} else {
+			tdp->td_flags &= ~DM_TDF_HOLD;
+			iput(LINVFS_GET_IP(tdp->td_vp));
+			tdp->td_vcount--;
+		}
+		dm_app_put_tdp(tdp);
+	}
+	return(error);
+}
+
+
+int
+dm_obj_ref_query_rvp(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	void		*hanp,
+	size_t		hlen,
+	int		*rvp)
+{
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_VNO,
+		DM_RIGHT_NULL, DM_FG_DONTADD|DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	/* If the request is valid but the handle just isn't present in the
+	   event or the hold flag isn't set, return zero, else return one.
+	*/
+
+	if (tdp) {
+		if (tdp->td_flags & DM_TDF_HOLD) {	/* if held */
+			*rvp = 1;
+		} else {
+			*rvp = 0;
+		}
+		dm_app_put_tdp(tdp);
+	} else {
+		*rvp = 0;
+	}
+	return(0);
+}
+
+
+int
+dm_downgrade_right(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_EXCL, DM_FG_MUSTEXIST|DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	/* Attempt the downgrade.  Filesystems which support rights but not
+	   the downgrading of rights will return ENOSYS.
+	*/
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->downgrade_right(tdp->td_vp, tdp->td_right,
+		(tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0));
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	/* The tdp is single-threaded, so no mutex lock needed for update. */
+
+	if (error == 0)
+		tdp->td_right = DM_RIGHT_SHARED;
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_query_right(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	dm_right_t	*rightp)
+{
+	dm_tokdata_t	*tdp;
+	dm_right_t	right;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_NULL, DM_FG_DONTADD|DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	/* Get the current right and copy it to the caller.  The tdp is
+	   single-threaded, so no mutex lock is needed.	 If the tdp is not in
+	   the event we are supposed to return DM_RIGHT_NULL in order to be
+	   compatible with Veritas.
+	*/
+
+	if (tdp) {
+		right = tdp->td_right;
+		dm_app_put_tdp(tdp);
+	} else {
+		right = DM_RIGHT_NULL;
+	}
+	if (copy_to_user(rightp, &right, sizeof(right)))
+		return(EFAULT);
+	return(0);
+}
+
+
+int
+dm_release_right(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_SHARED, DM_FG_MUSTEXIST|DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->release_right(tdp->td_vp, tdp->td_right,
+		(tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0));
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	/* The tdp is single-threaded, so no mutex lock needed for update. */
+
+	if (error == 0) {
+		tdp->td_right = DM_RIGHT_NULL;
+		if (tdp->td_flags & DM_TDF_RIGHT) {
+			tdp->td_flags &= ~DM_TDF_RIGHT;
+			iput(LINVFS_GET_IP(tdp->td_vp));
+			tdp->td_vcount--;
+		}
+	}
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_request_right(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token,
+	u_int		flags,
+	dm_right_t	right)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_NULL, DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->request_right(tdp->td_vp, tdp->td_right,
+		(tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0), flags, right);
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	/* The tdp is single-threaded, so no mutex lock is needed for update.
+
+	   If this is the first dm_request_right call for this vnode, then we
+	   need to bump the vnode reference count for two reasons.  First of
+	   all, it is supposed to be impossible for the file to disappear or
+	   for the filesystem to be unmounted while a right is held on a file;
+	   bumping the file's vnode reference count ensures this.  Second, if
+	   rights are ever actually implemented, it will most likely be done
+	   without changes to the on-disk inode, which means that we can't let
+	   the vnode become unreferenced while a right on it is held.
+	*/
+
+	if (error == 0) {
+		if (!(tdp->td_flags & DM_TDF_RIGHT)) {	/* if first call */
+			tdp->td_flags |= DM_TDF_RIGHT;
+			tdp->td_vcount++;
+			(void)fsys_vector->obj_ref_hold(tdp->td_vp);
+		}
+		tdp->td_right = right;
+	}
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
+
+
+int
+dm_upgrade_right(
+	dm_sessid_t	sid,
+	void		*hanp,
+	size_t		hlen,
+	dm_token_t	token)
+{
+	dm_fsys_vector_t *fsys_vector;
+	dm_tokdata_t	*tdp;
+	int		error;
+
+	error = dm_app_get_tdp_by_token(sid, hanp, hlen, token, DM_TDT_ANY,
+		DM_RIGHT_SHARED, DM_FG_MUSTEXIST|DM_FG_STHREAD, &tdp);
+	if (error != 0)
+		return(error);
+
+	/* If the object already has the DM_RIGHT_EXCL right, no need to
+	   attempt an upgrade.
+	*/
+
+	if (tdp->td_right == DM_RIGHT_EXCL) {
+		dm_app_put_tdp(tdp);
+		return(0);
+	}
+
+	/* Attempt the upgrade.	 Filesystems which support rights but not
+	   the upgrading of rights will return ENOSYS.
+	*/
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(tdp->td_vp));
+	fsys_vector = dm_fsys_vector(tdp->td_vp);
+	error = fsys_vector->upgrade_right(tdp->td_vp, tdp->td_right,
+		(tdp->td_type == DM_TDT_VFS ? DM_FSYS_OBJ : 0));
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(tdp->td_vp));
+
+	/* The tdp is single-threaded, so no mutex lock needed for update. */
+
+	if (error == 0)
+		tdp->td_right = DM_RIGHT_EXCL;
+
+	dm_app_put_tdp(tdp);
+	return(error);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_session.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_session.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_session.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_session.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,1539 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include "dmapi_private.h"
+
+dm_session_t	*dm_sessions = NULL;	/* head of session list */
+u_int		dm_sessions_active = 0; /* # sessions currently active */
+dm_sessid_t	dm_next_sessid = 1;	/* next session ID to use */
+lock_t		dm_session_lock = SPIN_LOCK_UNLOCKED;/* lock for session list */
+
+dm_token_t	dm_next_token = 1;	/* next token ID to use */
+dm_sequence_t	dm_next_sequence = 1;	/* next sequence number to use */
+lock_t		dm_token_lock = SPIN_LOCK_UNLOCKED;/* dm_next_token/dm_next_sequence lock */
+
+int	dm_max_queued_msgs = 2048;	/* max # undelivered msgs/session */
+
+#ifdef __sgi
+int	dm_hash_buckets = 1009;		/* prime -- number of buckets */
+
+/* XXX floating point not allowed in Linux kernel. */
+#define DM_SHASH(sess,inodenum)	 ((sess)->sn_sesshash + \
+				  ((inodenum) % dm_hash_buckets))
+#endif
+
+
+#ifdef CONFIG_PROC_FS
+static int
+sessions_read_pfs(char *buffer, char **start, off_t offset,
+		 int count, int *eof, void *data)
+{
+	int len;
+	dm_session_t	*sessp = (dm_session_t*)data;
+
+#define CHKFULL if(len >= count) break;
+#define ADDBUF(a,b)	len += sprintf(buffer + len, a, b); CHKFULL;
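+
+/* ADDBUF() appends one formatted line and, through CHKFULL, breaks out
+   of the while(1) loop below as soon as the output would no longer fit
+   in the buffer supplied by the /proc layer; the loop exists only to
+   give that 'break' something to jump out of.
+*/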
+
+	len=0;
+	while(1){
+		ADDBUF("sessp=0x%p\n", sessp);
+		ADDBUF("sn_next=0x%p\n", sessp->sn_next);
+		ADDBUF("sn_sessid=%d\n", sessp->sn_sessid);
+		ADDBUF("sn_flags=%x\n", sessp->sn_flags);
+		ADDBUF("sn_qlock=%c\n", '?');
+		ADDBUF("sn_readerq=%c\n", '?');
+		ADDBUF("sn_writerq=%c\n", '?');
+		ADDBUF("sn_readercnt=%u\n", sessp->sn_readercnt);
+		ADDBUF("sn_writercnt=%u\n", sessp->sn_writercnt);
+
+		ADDBUF("sn_newq.eq_head=0x%p\n", sessp->sn_newq.eq_head);
+		ADDBUF("sn_newq.eq_tail=0x%p\n", sessp->sn_newq.eq_tail);
+		ADDBUF("sn_newq.eq_count=%d\n", sessp->sn_newq.eq_count);
+
+		ADDBUF("sn_delq.eq_head=0x%p\n", sessp->sn_delq.eq_head);
+		ADDBUF("sn_delq.eq_tail=0x%p\n", sessp->sn_delq.eq_tail);
+		ADDBUF("sn_delq.eq_count=%d\n", sessp->sn_delq.eq_count);
+
+		ADDBUF("sn_evt_writerq.eq_head=0x%p\n", sessp->sn_evt_writerq.eq_head);
+		ADDBUF("sn_evt_writerq.eq_tail=0x%p\n", sessp->sn_evt_writerq.eq_tail);
+		ADDBUF("sn_evt_writerq.eq_count=%d\n", sessp->sn_evt_writerq.eq_count);
+
+		ADDBUF("sn_info=\"%s\"\n", sessp->sn_info);
+
+		break;
+	}
+
+	if (offset >= len) {
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)
+		return count;
+	*eof = 1;
+
+	return len;
+}
+#endif
+
+
+/* Link a session to the end of the session list.  New sessions are always
+   added at the end of the list so that dm_enqueue_mount_event() doesn't
+   miss a session.  The caller must have obtained dm_session_lock before
+   calling this routine.
+*/
+
+static void
+link_session(
+	dm_session_t	*s)
+{
+	dm_session_t	*tmp;
+
+	if ((tmp = dm_sessions) == NULL) {
+		dm_sessions = s;
+	} else {
+		while (tmp->sn_next != NULL)
+			tmp = tmp->sn_next;
+		tmp->sn_next = s;
+	}
+	s->sn_next = NULL;
+	dm_sessions_active++;
+}
+
+
+/* Remove a session from the session list.  The caller must have obtained
+   dm_session_lock before calling this routine.	 unlink_session() should only
+   be used in situations where the session is known to be on the dm_sessions
+   list; otherwise it panics.
+*/
+
+static void
+unlink_session(
+	dm_session_t	*s)
+{
+	dm_session_t	*tmp;
+
+	if (dm_sessions == s) {
+		dm_sessions = dm_sessions->sn_next;
+	} else {
+		for (tmp = dm_sessions; tmp; tmp = tmp->sn_next) {
+			if (tmp->sn_next == s)
+				break;
+		}
+		if (tmp == NULL) {
+			panic("unlink_session: corrupt DMAPI session list, "
+				"dm_sessions %p, session %p\n",
+				dm_sessions, s);
+		}
+		tmp->sn_next = s->sn_next;
+	}
+	s->sn_next = NULL;
+	dm_sessions_active--;
+}
+
+
+/* Link an event to the end of an event queue.	The caller must have obtained
+   the session's sn_qlock before calling this routine.
+*/
+
+void
+dm_link_event(
+	dm_tokevent_t	*tevp,
+	dm_eventq_t	*queue)
+{
+	if (queue->eq_tail) {
+		queue->eq_tail->te_next = tevp;
+		queue->eq_tail = tevp;
+	} else {
+		queue->eq_head = queue->eq_tail = tevp;
+	}
+	tevp->te_next = NULL;
+	queue->eq_count++;
+}
+
+
+/* Remove an event from an event queue.	 The caller must have obtained the
+   session's sn_qlock before calling this routine.  dm_unlink_event() should
+   only be used in situations where the event is known to be on the queue;
+   otherwise it panics.
+*/
+
+void
+dm_unlink_event(
+	dm_tokevent_t	*tevp,
+	dm_eventq_t	*queue)
+{
+	dm_tokevent_t	*tmp;
+
+	if (queue->eq_head == tevp) {
+		queue->eq_head = tevp->te_next;
+		if (queue->eq_head == NULL)
+			queue->eq_tail = NULL;
+	} else {
+		tmp = queue->eq_head;
+		while (tmp && tmp->te_next != tevp)
+			tmp = tmp->te_next;
+		if (tmp == NULL) {
+			panic("dm_unlink_event: corrupt DMAPI queue %p, "
+				"tevp %p\n", queue, tevp);
+		}
+		tmp->te_next = tevp->te_next;
+		if (tmp->te_next == NULL)
+			queue->eq_tail = tmp;
+	}
+	tevp->te_next = NULL;
+	queue->eq_count--;
+}
+
+/* Link a regular file event to a hash bucket.	The caller must have obtained
+   the session's sn_qlock before calling this routine.
+   The tokevent must be for a regular file object--DM_TDT_REG.
+*/
+
+#ifdef __sgi
+static void
+hash_event(
+	dm_session_t	*s,
+	dm_tokevent_t	*tevp)
+{
+	dm_sesshash_t	*sh;
+	xfs_ino_t	ino;
+
+	if (s->sn_sesshash == NULL)
+		s->sn_sesshash = kmem_zalloc(dm_hash_buckets * sizeof(dm_sesshash_t), KM_SLEEP);
+
+	ino = ((xfs_fid2_t*)&tevp->te_tdp->td_handle.ha_fid)->fid_ino;
+	sh = DM_SHASH(s, ino);
+
+#ifdef DM_SHASH_DEBUG
+	if (sh->h_next == NULL) {
+		s->sn_buckets_in_use++;
+		if (s->sn_buckets_in_use > s->sn_max_buckets_in_use)
+			s->sn_max_buckets_in_use++;
+	}
+	sh->maxlength++;
+	sh->curlength++;
+	sh->num_adds++;
+#endif
+
+	tevp->te_flags |= DM_TEF_HASHED;
+	tevp->te_hashnext = sh->h_next;
+	sh->h_next = tevp;
+}
+#endif
+
+
+/* Remove a regular file event from a hash bucket.  The caller must have
+   obtained the session's sn_qlock before calling this routine.
+   The tokevent must be for a regular file object--DM_TDT_REG.
+*/
+
+#ifdef __sgi
+static void
+unhash_event(
+	dm_session_t	*s,
+	dm_tokevent_t	*tevp)
+{
+	dm_sesshash_t	*sh;
+	dm_tokevent_t	*tmp;
+	xfs_ino_t	ino;
+
+	if (s->sn_sesshash == NULL)
+		return;
+
+	ino = ((xfs_fid2_t*)&tevp->te_tdp->td_handle.ha_fid)->fid_ino;
+	sh = DM_SHASH(s, ino);
+
+	if (sh->h_next == tevp) {
+		sh->h_next = tevp->te_hashnext; /* leap frog */
+	} else {
+		tmp = sh->h_next;
+		while (tmp->te_hashnext != tevp) {
+			tmp = tmp->te_hashnext;
+		}
+		tmp->te_hashnext = tevp->te_hashnext; /* leap frog */
+	}
+	tevp->te_hashnext = NULL;
+	tevp->te_flags &= ~DM_TEF_HASHED;
+
+#ifdef DM_SHASH_DEBUG
+	if (sh->h_next == NULL)
+		s->sn_buckets_in_use--;
+	sh->curlength--;
+	sh->num_dels++;
+#endif
+}
+#endif
+
+
+/* Determine if this is a repeat event.	 The caller MUST be holding
+   the session lock.
+   The tokevent must be for a regular file object--DM_TDT_REG.
+   Returns:
+	0 == match not found
+	1 == match found
+*/
+
+#ifdef __sgi
+static int
+repeated_event(
+	dm_session_t	*s,
+	dm_tokevent_t	*tevp)
+{
+	dm_sesshash_t	*sh;
+	dm_data_event_t *d_event1;
+	dm_data_event_t *d_event2;
+	dm_tokevent_t	*tevph;
+	xfs_ino_t	ino1;
+	xfs_ino_t	ino2;
+
+	if ((!s->sn_newq.eq_tail) && (!s->sn_delq.eq_tail)) {
+		return(0);
+	}
+	if (s->sn_sesshash == NULL) {
+		return(0);
+	}
+
+	ino1 = ((xfs_fid2_t*)&tevp->te_tdp->td_handle.ha_fid)->fid_ino;
+	sh = DM_SHASH(s, ino1);
+
+	if (sh->h_next == NULL) {
+		/* bucket is empty, no match here */
+		return(0);
+	}
+
+	d_event1 = (dm_data_event_t *)((char *)&tevp->te_msg + tevp->te_msg.ev_data.vd_offset);
+	tevph = sh->h_next;
+	while (tevph) {
+		/* find something with the same event type and handle type */
+		if ((tevph->te_msg.ev_type == tevp->te_msg.ev_type) &&
+		    (tevph->te_tdp->td_type == tevp->te_tdp->td_type)) {
+
+			ino2 = ((xfs_fid2_t*)&tevph->te_tdp->td_handle.ha_fid)->fid_ino;
+			d_event2 = (dm_data_event_t *)((char *)&tevph->te_msg + tevph->te_msg.ev_data.vd_offset);
+
+			/* If the two events are operating on the same file,
+			   and the same part of that file, then we have a
+			   match.
+			*/
+			if ((ino1 == ino2) &&
+			    (d_event2->de_offset == d_event1->de_offset) &&
+			    (d_event2->de_length == d_event1->de_length)) {
+				/* found a match */
+#ifdef DM_SHASH_DEBUG
+				sh->dup_hits++;
+#endif
+				return(1);
+			}
+		}
+		tevph = tevph->te_hashnext;
+	}
+
+	/* No match found */
+	return(0);
+}
+#endif
+
+
+/* Return a pointer to a session given its session ID, or EINVAL if no session
+   has the session ID (per the DMAPI spec).  The caller must have obtained
+   dm_session_lock before calling this routine.
+*/
+
+static int
+dm_find_session(
+	dm_sessid_t	sid,
+	dm_session_t	**sessionpp)
+{
+	dm_session_t	*s;
+
+	for (s = dm_sessions; s; s = s->sn_next) {
+		if (s->sn_sessid == sid) {
+			*sessionpp = s;
+			return(0);
+		}
+	}
+	return(EINVAL);
+}
+
+
+/* Return a pointer to a locked session given its session ID.  '*lcp' is
+   used to obtain the session's sn_qlock.  Caller is responsible for eventually
+   unlocking it.
+*/
+
+int
+dm_find_session_and_lock(
+	dm_sessid_t	sid,
+	dm_session_t	**sessionpp,
+	unsigned long	*lcp)		/* addr of returned lock cookie */
+{
+	int		error;
+
+	for (;;) {
+		*lcp = mutex_spinlock(&dm_session_lock);
+
+		if ((error = dm_find_session(sid, sessionpp)) != 0) {
+			mutex_spinunlock(&dm_session_lock, *lcp);
+			return(error);
+		}
+		if (spin_trylock(&(*sessionpp)->sn_qlock)) {
+			nested_spinunlock(&dm_session_lock);
+			return(0);	/* success */
+		}
+
+		/* If the second lock is not available, drop the first and
+		   start over.	This gives the CPU a chance to process any
+		   interrupts, and also allows processes which want a sn_qlock
+		   for a different session to proceed.
+		*/
+
+		mutex_spinunlock(&dm_session_lock, *lcp);
+	}
+}
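+
+/* Usage sketch (illustrative only, not additional patch code): callers
+   pair this routine with mutex_spinunlock() using the returned cookie:
+
+	dm_session_t	*s;
+	unsigned long	lc;
+
+	if ((error = dm_find_session_and_lock(sid, &s, &lc)) != 0)
+		return(error);
+	... operate on the locked session ...
+	mutex_spinunlock(&s->sn_qlock, lc);
+*/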
+
+
+/* Return a pointer to the event on the specified session's sn_delq which
+   contains the given token.  The caller must have obtained the session's
+   sn_qlock before calling this routine.
+*/
+
+static int
+dm_find_msg(
+	dm_session_t	*s,
+	dm_token_t	token,
+	dm_tokevent_t	**tevpp)
+{
+	dm_tokevent_t	*tevp;
+
+	if (token <= DM_INVALID_TOKEN)
+		return(EINVAL);
+
+	for (tevp = s->sn_delq.eq_head; tevp; tevp = tevp->te_next) {
+		if (tevp->te_msg.ev_token == token) {
+			*tevpp = tevp;
+			return(0);
+		}
+	}
+	return(ESRCH);
+}
+
+
+/* Given a session ID and token, find the tevp on the specified session's
+   sn_delq which corresponds to that session ID/token pair.  If a match is
+   found, lock the tevp's te_lock and return a pointer to the tevp.
+   '*lcp' is used to obtain the tevp's te_lock.	 The caller is responsible
+   for eventually unlocking it.
+*/
+
+int
+dm_find_msg_and_lock(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	dm_tokevent_t	**tevpp,
+	unsigned long	*lcp)		/* address of returned lock cookie */
+{
+	dm_session_t	*s;
+	int		error;
+
+	if ((error = dm_find_session_and_lock(sid, &s, lcp)) != 0)
+		return(error);
+
+	if ((error = dm_find_msg(s, token, tevpp)) != 0) {
+		mutex_spinunlock(&s->sn_qlock, *lcp);
+		return(error);
+	}
+	nested_spinlock(&(*tevpp)->te_lock);
+	nested_spinunlock(&s->sn_qlock);
+	return(0);
+}
+
+
+/* Create a new session, or resume an old session if one is given. */
+
+int
+dm_create_session(
+	dm_sessid_t	old,
+	char		*info,
+	dm_sessid_t	*new)
+{
+	dm_session_t	*s;
+	dm_sessid_t	sid;
+	char		sessinfo[DM_SESSION_INFO_LEN];
+	size_t		len;
+	int		error;
+	unsigned long		lc;		/* lock cookie */
+
+	len = strnlen_user(info, DM_SESSION_INFO_LEN-1);
+	if (len == 0)
+		return(EFAULT); /* fault while reading the info string */
+	if (copy_from_user(sessinfo, info, len))
+		return(EFAULT);
+	lc = mutex_spinlock(&dm_session_lock);
+	sid = dm_next_sessid++;
+	mutex_spinunlock(&dm_session_lock, lc);
+	if (copy_to_user(new, &sid, sizeof(sid)))
+		return(EFAULT);
+
+	if (old == DM_NO_SESSION) {
+		s = kmem_cache_alloc(dm_session_cachep, SLAB_KERNEL);
+		if (s == NULL) {
+			printk("%s/%d: kmem_cache_alloc(dm_session_cachep) returned NULL\n", __FUNCTION__, __LINE__);
+			return ENOMEM;
+		}
+		memset(s, 0, sizeof(*s));
+
+		sv_init(&s->sn_readerq, SV_DEFAULT, "dmreadq");
+		sv_init(&s->sn_writerq, SV_DEFAULT, "dmwritq");
+		spinlock_init(&s->sn_qlock, "sn_qlock");
+		lc = mutex_spinlock(&dm_session_lock);
+	} else {
+		lc = mutex_spinlock(&dm_session_lock);
+		if ((error = dm_find_session(old, &s)) != 0) {
+			mutex_spinunlock(&dm_session_lock, lc);
+			return(error);
+		}
+#ifdef CONFIG_PROC_FS
+		{
+		char buf[100];
+		sprintf(buf, DMAPI_DBG_PROCFS "/sessions/0x%p", s);
+		remove_proc_entry(buf, NULL);
+		}
+#endif
+		unlink_session(s);
+	}
+	bcopy(sessinfo, s->sn_info, len);
+	s->sn_info[len-1] = 0;		/* ensure NUL termination */
+	s->sn_sessid = sid;
+	link_session(s);
+#ifdef CONFIG_PROC_FS
+	{
+	char buf[100];
+	struct proc_dir_entry *entry;
+
+	sprintf(buf, DMAPI_DBG_PROCFS "/sessions/0x%p", s);
+	entry = create_proc_read_entry(buf, 0, 0, sessions_read_pfs, s);
+	if (entry)
+		entry->owner = THIS_MODULE;
+	}
+#endif
+	mutex_spinunlock(&dm_session_lock, lc);
+	return(0);
+}
+
+
+int
+dm_destroy_session(
+	dm_sessid_t	sid)
+{
+	dm_session_t	*s;
+	int		error;
+	unsigned long		lc;		/* lock cookie */
+
+	/* The dm_session_lock must be held until the session is unlinked. */
+
+	lc = mutex_spinlock(&dm_session_lock);
+
+	if ((error = dm_find_session(sid, &s)) != 0) {
+		mutex_spinunlock(&dm_session_lock, lc);
+		return(error);
+	}
+	nested_spinlock(&s->sn_qlock);
+
+	/* The session exists.	Check to see if it is still in use.  If any
+	   messages still exist on the sn_newq or sn_delq, or if any processes
+	   are waiting for messages to arrive on the session, then the session
+	   must not be destroyed.
+	*/
+
+	if (s->sn_newq.eq_head || s->sn_readercnt || s->sn_delq.eq_head) {
+		nested_spinunlock(&s->sn_qlock);
+		mutex_spinunlock(&dm_session_lock, lc);
+		return(EBUSY);
+	}
+
+#ifdef CONFIG_PROC_FS
+	{
+	char buf[100];
+	sprintf(buf, DMAPI_DBG_PROCFS "/sessions/0x%p", s);
+	remove_proc_entry(buf, NULL);
+	}
+#endif
+
+	/* The session is not in use.  Dequeue it from the session chain. */
+
+	unlink_session(s);
+	nested_spinunlock(&s->sn_qlock);
+	mutex_spinunlock(&dm_session_lock, lc);
+
+	/* Now clear the sessions's disposition registration, and then destroy
+	   the session structure.
+	*/
+
+	dm_clear_fsreg(s);
+
+	spinlock_destroy(&s->sn_qlock);
+	sv_destroy(&s->sn_readerq);
+	sv_destroy(&s->sn_writerq);
+#ifdef __sgi
+	if (s->sn_sesshash)
+		kmem_free(s->sn_sesshash, dm_hash_buckets * sizeof(dm_sesshash_t));
+#endif
+	kmem_cache_free(dm_session_cachep, s);
+	return(0);
+}
+
+
+/*
+ *  Return a list of all active sessions.
+ */
+
+int
+dm_getall_sessions(
+	u_int		nelem,
+	dm_sessid_t	*sidp,
+	u_int		*nelemp)
+{
+	dm_session_t	*s;
+	u_int		sesscnt;
+	dm_sessid_t	*sesslist;
+	unsigned long		lc;		/* lock cookie */
+	int		error;
+	int		i;
+
+	/* Loop until we can get the right amount of temp space, being careful
+	   not to hold a mutex during the allocation.  Usually only one trip.
+	*/
+
+	for (;;) {
+		if ((sesscnt = dm_sessions_active) == 0) {
+			if (put_user(0, nelemp))
+				return(EFAULT);
+			return(0);
+		}
+		sesslist = kmalloc(sesscnt * sizeof(*sidp), GFP_KERNEL);
+		if (sesslist == NULL) {
+			printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+			return ENOMEM;
+		}
+
+		lc = mutex_spinlock(&dm_session_lock);
+		if (sesscnt == dm_sessions_active)
+			break;
+
+		mutex_spinunlock(&dm_session_lock, lc);
+		kfree(sesslist);
+	}
+
+	/* Make a temp copy of the data, then release the mutex. */
+
+	for (i = 0, s = dm_sessions; i < sesscnt; i++, s = s->sn_next)
+		sesslist[i] = s->sn_sessid;
+
+	mutex_spinunlock(&dm_session_lock, lc);
+
+	/* Now copy the data to the user. */
+
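+	/* Note that *nelemp is filled in before the buffer size is checked,
+	   so even when E2BIG is returned the caller learns how many
+	   sessions exist and can retry with a larger buffer.
+	*/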
+	if(put_user(sesscnt, nelemp)) {
+		error = EFAULT;
+	} else if (sesscnt > nelem) {
+		error = E2BIG;
+	} else if (copy_to_user(sidp, sesslist, sesscnt * sizeof(*sidp))) {
+		error = EFAULT;
+	} else {
+		error = 0;
+	}
+	kfree(sesslist);
+	return(error);
+}
+
+
+/*
+ *  Return the descriptive string associated with a session.
+ */
+
+int
+dm_query_session(
+	dm_sessid_t	sid,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp)
+{
+	dm_session_t	*s;		/* pointer to session given by sid */
+	int		len;		/* length of session info string */
+	int		error;
+	char		sessinfo[DM_SESSION_INFO_LEN];
+	unsigned long	lc;		/* lock cookie */
+
+	if ((error = dm_find_session_and_lock(sid, &s, &lc)) != 0)
+		return(error);
+
+	len = strlen(s->sn_info) + 1;	/* NULL terminated when created */
+	bcopy(s->sn_info, sessinfo, len);
+
+	mutex_spinunlock(&s->sn_qlock, lc);
+
+	/* Now that the mutex is released, copy the sessinfo to the user. */
+
+	if (put_user(len, rlenp)) {
+		error = EFAULT;
+	} else if (len > buflen) {
+		error = E2BIG;
+	} else if (copy_to_user(bufp, sessinfo, len)) {
+		error = EFAULT;
+	} else {
+		error = 0;
+	}
+	return(error);
+}
+
+
+/*
+ *  Return all of the previously delivered tokens (that is, their IDs)
+ *  for the given session.
+ */
+
+int
+dm_getall_tokens(
+	dm_sessid_t	sid,		/* session obtaining tokens from */
+	u_int		nelem,		/* size of tokenbufp */
+	dm_token_t	*tokenbufp,	/* buffer to copy token IDs to */
+	u_int		*nelemp)	/* return number copied to tokenbufp */
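+/* Undo dm_obj_ref_hold(): clear the hold flag and drop the extra inode
+   reference.  EACCES (the DM_FG_MUSTEXIST errno) is returned if the
+   object is not currently held.
+*/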
+{
+	dm_session_t	*s;		/* pointer to session given by sid */
+	dm_tokevent_t	*tevp;		/* event message queue traversal */
+	unsigned long	lc;		/* lock cookie */
+	int		tokcnt;
+	dm_token_t	*toklist;
+	int		error;
+	int		i;
+
+	/* Loop until we can get the right amount of temp space, being careful
+	   not to hold a mutex during the allocation.  Usually only one trip.
+	*/
+
+	for (;;) {
+		if ((error = dm_find_session_and_lock(sid, &s, &lc)) != 0)
+			return(error);
+		tokcnt = s->sn_delq.eq_count;
+		mutex_spinunlock(&s->sn_qlock, lc);
+
+		if (tokcnt == 0) {
+			if (put_user(0, nelemp))
+				return(EFAULT);
+			return(0);
+		}
+		toklist = kmalloc(tokcnt * sizeof(*tokenbufp), GFP_KERNEL);
+		if (toklist == NULL) {
+			printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+			return ENOMEM;
+		}
+
+		if ((error = dm_find_session_and_lock(sid, &s, &lc)) != 0) {
+			kfree(toklist);
+			return(error);
+		}
+
+		if (tokcnt == s->sn_delq.eq_count)
+			break;
+
+		mutex_spinunlock(&s->sn_qlock, lc);
+		kfree(toklist);
+	}
+
+	/* Make a temp copy of the data, then release the mutex. */
+
+	tevp = s->sn_delq.eq_head;
+	for (i = 0; i < tokcnt; i++, tevp = tevp->te_next)
+		toklist[i] = tevp->te_msg.ev_token;
+
+	mutex_spinunlock(&s->sn_qlock, lc);
+
+	/* Now copy the data to the user. */
+
+	if (put_user(tokcnt, nelemp)) {
+		error = EFAULT;
+	} else if (tokcnt > nelem) {
+		error = E2BIG;
+	} else if (copy_to_user(tokenbufp,toklist,tokcnt*sizeof(*tokenbufp))) {
+		error = EFAULT;
+	} else {
+		error = 0;
+	}
+	kfree(toklist);
+	return(error);
+}
+
+
+/*
+ *  Return the message identified by token.
+ */
+
+int
+dm_find_eventmsg(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp)
+{
+	dm_tokevent_t	*tevp;		/* message identified by token */
+	int		msgsize;	/* size of message to copy out */
+	void		*msg;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+
+	/* Because some of the events (dm_data_event_t in particular) contain
+	   __u64 fields, we need to make sure that the buffer provided by the
+	   caller is aligned such that he can read those fields successfully.
+	*/
+
+	if (((__psint_t)bufp & (sizeof(__u64) - 1)) != 0)
+		return(EFAULT);
+
+	/* Allocate the right amount of temp space, being careful not to hold
+	   a mutex during the allocation.
+	*/
+
+	if ((error = dm_find_msg_and_lock(sid, token, &tevp, &lc)) != 0)
+		return(error);
+	msgsize = tevp->te_allocsize - offsetof(dm_tokevent_t, te_msg);
+	mutex_spinunlock(&tevp->te_lock, lc);
+
+	msg = kmalloc(msgsize, GFP_KERNEL);
+	if (msg == NULL) {
+		printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+		return ENOMEM;
+	}
+
+	if ((error = dm_find_msg_and_lock(sid, token, &tevp, &lc)) != 0) {
+		kfree(msg);
+		return(error);
+	}
+
+	/* Make a temp copy of the data, then release the mutex. */
+
+	bcopy(&tevp->te_msg, msg, msgsize);
+	mutex_spinunlock(&tevp->te_lock, lc);
+
+	/* Now copy the data to the user. */
+
+	if (put_user(msgsize,rlenp)) {
+		error = EFAULT;
+	} else if (msgsize > buflen) {		/* user buffer not big enough */
+		error = E2BIG;
+	} else if (copy_to_user( bufp, msg, msgsize )) {
+		error = EFAULT;
+	} else {
+		error = 0;
+	}
+	kfree(msg);
+	return(error);
+}
+
+
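+/* Move a delivered event from the source session's sn_delq to the target
+   session's sn_delq.  The token itself is unchanged; it is simply copied
+   back to the caller through 'rtokenp'.
+*/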
+int
+dm_move_event(
+	dm_sessid_t	srcsid,
+	dm_token_t	token,
+	dm_sessid_t	targetsid,
+	dm_token_t	*rtokenp)
+{
+	dm_session_t	*s1;
+	dm_session_t	*s2;
+	dm_tokevent_t	*tevp;
+	int		error;
+	unsigned long		lc;		/* lock cookie */
+#ifdef __sgi
+	int		hash_it = 0;
+#endif
+
+	lc = mutex_spinlock(&dm_session_lock);
+
+	if ((error = dm_find_session(srcsid, &s1)) != 0 ||
+	    (error = dm_find_session(targetsid, &s2)) != 0 ||
+	    (error = dm_find_msg(s1, token, &tevp)) != 0) {
+		mutex_spinunlock(&dm_session_lock, lc);
+		return(error);
+	}
+	dm_unlink_event(tevp, &s1->sn_delq);
+#ifdef __sgi
+	if (tevp->te_flags & DM_TEF_HASHED) {
+		unhash_event(s1, tevp);
+		hash_it = 1;
+	}
+#endif
+	dm_link_event(tevp, &s2->sn_delq);
+#ifdef __sgi
+	if (hash_it)
+		hash_event(s2, tevp);
+#endif
+	mutex_spinunlock(&dm_session_lock, lc);
+
+	if (copy_to_user(rtokenp, &token, sizeof(token)))
+		return(EFAULT);
+	return(0);
+}
+
+
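+/* Mark the event identified by 'token' as being worked on
+   (DM_TEF_INTERMED) and wake any waiting generator threads, so that a
+   thread which issued the event with DM_FLAGS_NDELAY can return EAGAIN
+   instead of continuing to wait (see dm_enqueue()).
+*/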
+/* ARGSUSED */
+int
+dm_pending(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	dm_timestruct_t *delay)		/* unused */
+{
+	dm_tokevent_t	*tevp;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+
+	if ((error = dm_find_msg_and_lock(sid, token, &tevp, &lc)) != 0)
+		return(error);
+
+	tevp->te_flags |= DM_TEF_INTERMED;
+	if (tevp->te_evt_ref > 0)	/* if event generation threads exist */
+		sv_broadcast(&tevp->te_evt_queue);
+
+	mutex_spinunlock(&tevp->te_lock, lc);
+	return(0);
+}
+
+
+int
+dm_get_events(
+	dm_sessid_t	sid,
+	u_int		maxmsgs,
+	u_int		flags,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp)
+{
+	dm_session_t	*s;		/* pointer to session given by sid */
+	dm_tokevent_t	*tevp;		/* next event message on queue */
+	int		error;
+	unsigned long	lc1;		/* first lock cookie */
+	unsigned long	lc2 = 0;	/* second lock cookie */
+	int		totalsize;
+	int		msgsize;
+	dm_eventmsg_t	*prevmsg;
+	int		prev_msgsize = 0;
+	u_int		msgcnt;
+
+	/* Because some of the events (dm_data_event_t in particular) contain
+	   __u64 fields, we need to make sure that the buffer provided by the
+	   caller is aligned such that he can read those fields successfully.
+	*/
+
+	if (((__psint_t)bufp & (sizeof(__u64) - 1)) != 0)
+		return(EFAULT);
+
+	/* Find the indicated session and lock it. */
+
+	if ((error = dm_find_session_and_lock(sid, &s, &lc1)) != 0)
+		return(error);
+
+	/* Check for messages on sn_newq.  If there aren't any that haven't
+	   already been grabbed by another process, and if we are supposed
+	   to wait until one shows up, then go to sleep interruptibly on the
+	   sn_readerq semaphore.  The session can't disappear out from under
+	   us as long as sn_readercnt is non-zero.
+	*/
+
+	for (;;) {
+		int		rc;
+
+		for (tevp = s->sn_newq.eq_head; tevp; tevp = tevp->te_next) {
+			lc2 = mutex_spinlock(&tevp->te_lock);
+			if (!(tevp->te_flags & DM_TEF_LOCKED))
+				break;
+			mutex_spinunlock(&tevp->te_lock, lc2);
+		}
+		if (tevp)
+			break;		/* got one! */
+
+		if (!(flags & DM_EV_WAIT)) {
+			mutex_spinunlock(&s->sn_qlock, lc1);
+			return(EAGAIN);
+		}
+		s->sn_readercnt++;
+
+		sv_wait_sig(&s->sn_readerq, 1, &s->sn_qlock, lc1);
+		rc = signal_pending(current);
+
+		lc1 = mutex_spinlock(&s->sn_qlock);
+		s->sn_readercnt--;
+		if (rc) {	/* if signal was received */
+			mutex_spinunlock(&s->sn_qlock, lc1);
+			return(EINTR);
+		}
+	}
+
+	/* At least one message is available for delivery, and we have both the
+	   session lock and event lock.	 Mark the event so that it is not
+	   grabbed by other daemons, then drop both locks prior to copying the
+	   data to the caller's buffer.	 Leaving the event on the queue in a
+	   marked state prevents both the session and the event from
+	   disappearing out from under us while we don't have the locks.
+	*/
+
+	tevp->te_flags |= DM_TEF_LOCKED;
+	mutex_spinunlock(&tevp->te_lock, lc2);	/* reverse cookie order */
+	mutex_spinunlock(&s->sn_qlock, lc1);
+
+	/* Continue to deliver messages until there are no more, the
+	   user's buffer becomes full, or we hit his maxmsgs limit.
+	*/
+
+	totalsize = 0;		/* total bytes transferred to the user */
+	prevmsg = NULL;
+	msgcnt = 0;
+
+	while (tevp) {
+		/* Compute the number of bytes to be moved, rounding up to an
+		   8-byte boundary so that any subsequent messages will also be
+		   aligned.
+		*/
+
+		msgsize = tevp->te_allocsize - offsetof(dm_tokevent_t, te_msg);
+		msgsize = (msgsize + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1);
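+		/* e.g. with sizeof(__u64) == 8, a 20-byte message rounds
+		   up to 24 ((20 + 7) & ~7 == 24), while an already-aligned
+		   24 stays 24.
+		*/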
+		totalsize += msgsize;
+
+		/* If it fits, copy the message into the user's buffer and
+		   update his 'rlenp'.	Update the _link pointer for any
+		   previous message.
+		*/
+
+		if (totalsize > buflen) {	/* no more room */
+			error = E2BIG;
+		} else if (put_user(totalsize, rlenp)) {
+			error = EFAULT;
+		} else if (copy_to_user(bufp, &tevp->te_msg, msgsize)) {
+			error = EFAULT;
+		} else if (prevmsg && put_user(prev_msgsize, &prevmsg->_link)) {
+			error = EFAULT;
+		} else {
+			error = 0;
+		}
+
+		/* If an error occurred, just unmark the event and leave it on
+		   the queue for someone else.	Note that other daemons may
+		   have gone to sleep because this event was marked, so wake
+		   them up.  Also, if at least one message has already been
+		   delivered, then an error here is not really an error.
+		*/
+
+		lc1 = mutex_spinlock(&s->sn_qlock);
+		lc2 = mutex_spinlock(&tevp->te_lock);
+		tevp->te_flags &= ~DM_TEF_LOCKED;	/* drop the mark */
+
+		if (error) {
+			if (s->sn_readercnt)
+				sv_signal(&s->sn_readerq);
+
+			mutex_spinunlock(&tevp->te_lock, lc2);	/* rev. order */
+			mutex_spinunlock(&s->sn_qlock, lc1);
+			if (prevmsg)
+				return(0);
+			if (error == E2BIG && put_user(totalsize,rlenp))
+				error = EFAULT;
+			return(error);
+		}
+
+		/* The message was successfully delivered.  Unqueue it. */
+
+		dm_unlink_event(tevp, &s->sn_newq);
+
+		/* Wake up the first of any processes waiting for room on the
+		   sn_newq.
+		*/
+
+		if (s->sn_writercnt)
+			sv_signal(&s->sn_writerq);
+
+		/* If the message is synchronous, add it to the sn_delq while
+		   still holding the lock.  If it is asynchronous, free it.
+		*/
+
+		if (tevp->te_msg.ev_token != DM_INVALID_TOKEN) { /* synch */
+			dm_link_event(tevp, &s->sn_delq);
+			mutex_spinunlock(&tevp->te_lock, lc2);
+		} else {
+			tevp->te_flags |= DM_TEF_FINAL;
+#ifdef __sgi
+			if (tevp->te_flags & DM_TEF_HASHED)
+				unhash_event(s, tevp);
+#endif
+			mutex_spinunlock(&tevp->te_lock, lc2);
+			dm_put_tevp(tevp, NULL);/* can't cause destroy events */
+		}
+
+		/* Update our notion of where we are in the user's buffer.  If
+		   he doesn't want any more messages, then stop.
+		*/
+
+		prevmsg = (dm_eventmsg_t *)bufp;
+		prev_msgsize = msgsize;
+		bufp = (char *)bufp + msgsize;
+
+		msgcnt++;
+		if (maxmsgs && msgcnt >= maxmsgs) {
+			mutex_spinunlock(&s->sn_qlock, lc1);
+			break;
+		}
+
+		/* While still holding the sn_qlock,  see if any additional
+		   messages are available for delivery.
+		*/
+
+		for (tevp = s->sn_newq.eq_head; tevp; tevp = tevp->te_next) {
+			lc2 = mutex_spinlock(&tevp->te_lock);
+			if (!(tevp->te_flags & DM_TEF_LOCKED)) {
+				tevp->te_flags |= DM_TEF_LOCKED;
+				mutex_spinunlock(&tevp->te_lock, lc2);
+				break;
+			}
+			mutex_spinunlock(&tevp->te_lock, lc2);
+		}
+		mutex_spinunlock(&s->sn_qlock, lc1);
+	}
+	return(0);
+}
+
+
+/*
+ *  Remove an event message from the delivered queue, set the returned
+ *  error where the event generator wants it, and wake up the generator.
+ *  Also currently have the user side release any locks it holds...
+ */
+
+/* ARGSUSED */
+int
+dm_respond_event(
+	dm_sessid_t	sid,
+	dm_token_t	token,
+	dm_response_t	response,
+	int		reterror,
+	size_t		buflen,		/* unused */
+	void		*respbufp)	/* unused */
+{
+	dm_session_t	*s;		/* pointer to session given by sid */
+	dm_tokevent_t	*tevp;		/* event message queue traversal */
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+
+	/* Sanity check the input parameters. */
+
+	switch (response) {
+	case DM_RESP_CONTINUE:	/* continue must have reterror == 0 */
+		if (reterror != 0)
+			return(EINVAL);
+		break;
+	case DM_RESP_ABORT:	/* abort must have errno set */
+		if (reterror <= 0)
+			return(EINVAL);
+		break;
+	case DM_RESP_DONTCARE:
+		if (reterror > 0)
+			return(EINVAL);
+		reterror = -1;	/* to distinguish DM_RESP_DONTCARE */
+		break;
+	default:
+		return(EINVAL);
+	}
+
+	/* Hold session lock until the event is unqueued. */
+
+	if ((error = dm_find_session_and_lock(sid, &s, &lc)) != 0)
+		return(error);
+
+	if ((error = dm_find_msg(s, token, &tevp)) != 0) {
+		mutex_spinunlock(&s->sn_qlock, lc);
+		return(error);
+	}
+	nested_spinlock(&tevp->te_lock);
+
+	if (reterror == -1 && tevp->te_msg.ev_type != DM_EVENT_MOUNT) {
+		error = EINVAL;
+		nested_spinunlock(&tevp->te_lock);
+		mutex_spinunlock(&s->sn_qlock, lc);
+	} else {
+		dm_unlink_event(tevp, &s->sn_delq);
+#ifdef __sgi
+		if (tevp->te_flags & DM_TEF_HASHED)
+			unhash_event(s, tevp);
+#endif
+		tevp->te_reply = reterror;
+		tevp->te_flags |= DM_TEF_FINAL;
+		if (tevp->te_evt_ref)
+			sv_broadcast(&tevp->te_evt_queue);
+		nested_spinunlock(&tevp->te_lock);
+		mutex_spinunlock(&s->sn_qlock, lc);
+		error = 0;
+
+		/* Absolutely no locks can be held when calling dm_put_tevp! */
+
+		dm_put_tevp(tevp, NULL);  /* this can generate destroy events */
+	}
+	return(error);
+}
+
+
+/* Queue the filled in event message pointed to by tevp on the session s, and
+   (if a synchronous event) wait for the reply from the DMAPI application.
+   The caller MUST be holding the session lock before calling this routine!
+   The session lock is always released upon exit.
+   Returns:
+	 -1 == don't care
+	  0 == success (or async event)
+	> 0 == errno describing reason for failure
+*/
+
+static int
+dm_enqueue(
+	dm_session_t	*s,
+	unsigned long	lc,		/* input lock cookie */
+	dm_tokevent_t	*tevp,		/* in/out parameter */
+	int		sync,
+	int		flags,
+	int		interruptable)
+{
+	int		is_unmount = 0;
+#ifdef __sgi
+	int		is_hashable = 0;
+#endif
+	int		reply;
+
+#ifdef __sgi
+	/* If the caller isn't planning to stick around for the result
+	   and this request is identical to one that is already on the
+	   queues then just give the caller an EAGAIN.	Release the
+	   session lock before returning.
+
+	   We look only at NDELAY requests with an event type of READ,
+	   WRITE, or TRUNCATE on objects that are regular files.
+	*/
+
+	if ((flags & DM_FLAGS_NDELAY) && DM_EVENT_RDWRTRUNC(tevp) &&
+	    (tevp->te_tdp->td_type == DM_TDT_REG)) {
+		if (repeated_event(s, tevp)) {
+			mutex_spinunlock(&s->sn_qlock, lc);
+			return(EAGAIN);
+		}
+		is_hashable = 1;
+	}
+#endif
+
+	if (tevp->te_msg.ev_type == DM_EVENT_UNMOUNT)
+		is_unmount = 1;
+
+	/* Check for room on sn_newq.  If there is no room for new messages,
+	   then go to sleep on the sn_writerq semaphore.  The
+	   session cannot disappear out from under us as long as sn_writercnt
+	   is non-zero.
+	*/
+
+	while (s->sn_newq.eq_count >= dm_max_queued_msgs) {	/* no room */
+		s->sn_writercnt++;
+		dm_link_event(tevp, &s->sn_evt_writerq);
+		if (interruptable) {
+			sv_wait_sig(&s->sn_writerq, 1, &s->sn_qlock, lc);
+			if (signal_pending(current)) {
+				/* Reacquire the queue lock so that the
+				   writer count and sn_evt_writerq are
+				   updated consistently before bailing out.
+				*/
+				lc = mutex_spinlock(&s->sn_qlock);
+				s->sn_writercnt--;
+				dm_unlink_event(tevp, &s->sn_evt_writerq);
+				mutex_spinunlock(&s->sn_qlock, lc);
+				return(EINTR);
+			}
+		} else {
+			sv_wait(&s->sn_writerq, 1, &s->sn_qlock, lc);
+		}
+		lc = mutex_spinlock(&s->sn_qlock);
+		s->sn_writercnt--;
+		dm_unlink_event(tevp, &s->sn_evt_writerq);
+	}
+
+	/* Assign a sequence number and token to the event and bump the
+	   application reference count by one.	We don't need 'te_lock' here
+	   because this thread is still the only thread that can see the event.
+	*/
+
+	nested_spinlock(&dm_token_lock);
+	tevp->te_msg.ev_sequence = dm_next_sequence++;
+	if (sync) {
+		tevp->te_msg.ev_token = dm_next_token++;
+	} else {
+		tevp->te_msg.ev_token = DM_INVALID_TOKEN;
+	}
+	nested_spinunlock(&dm_token_lock);
+
+	tevp->te_app_ref++;
+
+	/* Room exists on the sn_newq queue, so add this request.  If the
+	   queue was previously empty, wake up the first of any processes
+	   that are waiting for an event.
+	*/
+
+	dm_link_event(tevp, &s->sn_newq);
+#ifdef __sgi
+	if (is_hashable)
+		hash_event(s, tevp);
+#endif
+
+	if (s->sn_readercnt)
+		sv_signal(&s->sn_readerq);
+
+	mutex_spinunlock(&s->sn_qlock, lc);
+
+	/* Now that the message is queued, processes issuing asynchronous
+	   events or DM_EVENT_UNMOUNT events are ready to continue.
+	*/
+
+	if (!sync || is_unmount)
+		return(0);
+
+	/* Synchronous requests wait until a final reply is received.  If the
+	   caller supplied the DM_FLAGS_NDELAY flag, the process will return
+	   EAGAIN if dm_pending() sets DM_TEF_INTERMED.	 We also let users
+	   Cntl-C out of read, write, and truncate requests.
+	*/
+
+	lc = mutex_spinlock(&tevp->te_lock);
+
+	while (!(tevp->te_flags & DM_TEF_FINAL)) {
+		if ((tevp->te_flags & DM_TEF_INTERMED) &&
+		    (flags & DM_FLAGS_NDELAY)) {
+			mutex_spinunlock(&tevp->te_lock, lc);
+			return(EAGAIN);
+		}
+		if (tevp->te_msg.ev_type == DM_EVENT_READ ||
+		    tevp->te_msg.ev_type == DM_EVENT_WRITE ||
+		    tevp->te_msg.ev_type == DM_EVENT_TRUNCATE) {
+			sv_wait_sig(&tevp->te_evt_queue, 1, &tevp->te_lock, lc);
+			if (signal_pending(current)){
+				return(EINTR);
+			}
+		} else {
+			sv_wait(&tevp->te_evt_queue, 1, &tevp->te_lock, lc);
+		}
+		lc = mutex_spinlock(&tevp->te_lock);
+	}
+
+	/* Return both the tevp and the reply which was stored in the tevp by
+	   dm_respond_event.  The tevp structure has already been removed from
+	   the reply queue by this point in dm_respond_event().
+	*/
+
+	reply = tevp->te_reply;
+	mutex_spinunlock(&tevp->te_lock, lc);
+	return(reply);
+}
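+
+/* Illustrative sketch (not additional patch code): a synchronous caller
+   treats the value returned by dm_enqueue() as the DMAPI application's
+   verdict on the event:
+
+	reply = dm_enqueue(s, lc, tevp, 1, flags, 0);
+	if (reply > 0)
+		... abort the operation, returning errno 'reply' ...
+	else
+		... proceed; -1 (DONTCARE) can occur for mount events only ...
+*/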
+
+
+/* The filesystem is guaranteed to stay mounted while this event is
+   outstanding.
+*/
+
+int
+dm_enqueue_normal_event(
+	vfs_t		*vfsp,
+	dm_tokevent_t	*tevp,
+	int		flags)
+{
+	dm_session_t	*s;
+	int		error;
+	int		sync;
+	unsigned long	lc;		/* lock cookie */
+
+	switch (tevp->te_msg.ev_type) {
+	case DM_EVENT_READ:
+	case DM_EVENT_WRITE:
+	case DM_EVENT_TRUNCATE:
+	case DM_EVENT_PREUNMOUNT:
+	case DM_EVENT_UNMOUNT:
+	case DM_EVENT_NOSPACE:
+	case DM_EVENT_CREATE:
+	case DM_EVENT_REMOVE:
+	case DM_EVENT_RENAME:
+	case DM_EVENT_SYMLINK:
+	case DM_EVENT_LINK:
+	case DM_EVENT_DEBUT:		/* not currently supported */
+		sync = 1;
+		break;
+
+	case DM_EVENT_DESTROY:
+	case DM_EVENT_POSTCREATE:
+	case DM_EVENT_POSTREMOVE:
+	case DM_EVENT_POSTRENAME:
+	case DM_EVENT_POSTSYMLINK:
+	case DM_EVENT_POSTLINK:
+	case DM_EVENT_ATTRIBUTE:
+	case DM_EVENT_CLOSE:		/* not currently supported */
+	case DM_EVENT_CANCEL:		/* not currently supported */
+		sync = 0;
+		break;
+
+	default:
+		return(EIO);		/* garbage event number */
+	}
+
+	/* Wait until a session selects disposition for the event.  The session
+	   is locked upon return from dm_waitfor_disp_session().
+	*/
+
+	if ((error = dm_waitfor_disp_session(vfsp, tevp, &s, &lc)) != 0)
+		return(error);
+
+	return(dm_enqueue(s, lc, tevp, sync, flags, 0));
+}
+
+
+/* Traverse the session list checking for sessions with the WANTMOUNT flag
+   set.	 When one is found, send it the message.  Possible responses to the
+   message are one of DONTCARE, CONTINUE, or ABORT.  The action taken in each
+   case is:
+	DONTCARE (-1)  - Send the event to the next session with WANTMOUNT set
+	CONTINUE ( 0) - Proceed with the mount, errno zero.
+	ABORT	 (>0) - Fail the mount, return the returned errno.
+
+   The mount request is sent to sessions in ascending session ID order.
+   Since the session list can change dramatically while this process is
+   sleeping in dm_enqueue(), this routine must use session IDs rather than
+   session pointers when keeping track of where it is in the list.  Since
+   new sessions are always added at the end of the queue, and have increasing
+   session ID values, we don't have to worry about missing any session.
+*/
+
+int
+dm_enqueue_mount_event(
+	vfs_t		*vfsp,
+	dm_tokevent_t	*tevp)
+{
+	dm_session_t	*s;
+	dm_sessid_t	sid;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+
+	/* Make the mounting filesystem visible to other DMAPI calls. */
+
+	if ((error = dm_add_fsys_entry(vfsp, tevp)) != 0){
+		return(error);
+	}
+
+	/* Walk through the session list presenting the mount event to each
+	   session that is interested until a session accepts or rejects it,
+	   or until all sessions ignore it.
+	*/
+
+	for (sid = DM_NO_SESSION, error = -1; error < 0; sid = s->sn_sessid) {
+
+		lc = mutex_spinlock(&dm_session_lock);
+		for (s = dm_sessions; s; s = s->sn_next) {
+			if (s->sn_sessid > sid && s->sn_flags & DM_SN_WANTMOUNT) {
+				nested_spinlock(&s->sn_qlock);
+				nested_spinunlock(&dm_session_lock);
+				break;
+			}
+		}
+		if (s == NULL) {
+			mutex_spinunlock(&dm_session_lock, lc);
+			break;		/* no one wants it; proceed with mount */
+		}
+		error = dm_enqueue(s, lc, tevp, 1, 0, 0);
+	}
+
+	/* If the mount will be allowed to complete, then update the fsrp entry
+	   accordingly.	 If the mount is to be aborted, remove the fsrp entry.
+	*/
+
+	if (error <= 0) {
+		dm_change_fsys_entry(vfsp, DM_STATE_MOUNTED);
+		error = 0;
+	} else {
+		dm_remove_fsys_entry(vfsp);
+	}
+	return(error);
+}
+
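+/* Queue a caller-built message on the target session.  Unlike
+   dm_enqueue_normal_event(), the wait for queue space here is
+   interruptible (the final dm_enqueue() argument is 1) because the
+   originator is a user process rather than a filesystem thread.
+*/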
+int
+dm_enqueue_sendmsg_event(
+	dm_sessid_t	targetsid,
+	dm_tokevent_t	*tevp,
+	int		sync)
+{
+	dm_session_t	*s;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+
+	if ((error = dm_find_session_and_lock(targetsid, &s, &lc)) != 0)
+		return(error);
+
+	return(dm_enqueue(s, lc, tevp, sync, 0, 1));
+}
+
+
+int
+dm_enqueue_user_event(
+	dm_sessid_t	sid,
+	dm_tokevent_t	*tevp,
+	dm_token_t	*tokenp)
+{
+	dm_session_t	*s;
+	int		error;
+	unsigned long	lc;		/* lock cookie */
+
+	/* Atomically find and lock the session whose session id is 'sid'. */
+
+	if ((error = dm_find_session_and_lock(sid, &s, &lc)) != 0)
+		return(error);
+
+	/* Assign a sequence number and token to the event, bump the
+	   application reference count by one, and decrement the event
+	   count because the caller gives up all ownership of the event.
+	   We don't need 'te_lock' here because this thread is still the
+	   only thread that can see the event.
+	*/
+
+	nested_spinlock(&dm_token_lock);
+	tevp->te_msg.ev_sequence = dm_next_sequence++;
+	*tokenp = tevp->te_msg.ev_token = dm_next_token++;
+	nested_spinunlock(&dm_token_lock);
+
+	tevp->te_flags &= ~(DM_TEF_INTERMED|DM_TEF_FINAL);
+	tevp->te_app_ref++;
+	tevp->te_evt_ref--;
+
+	/* Add the request to the tail of the sn_delq.	Now it's visible. */
+
+	dm_link_event(tevp, &s->sn_delq);
+	mutex_spinunlock(&s->sn_qlock, lc);
+
+	return(0);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_sysent.c linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_sysent.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/dmapi/dmapi_sysent.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/dmapi/dmapi_sysent.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,758 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/* Data Migration API (DMAPI)
+ */
+
+
+/* We're using MISC_MAJOR / DMAPI_MINOR. */
+
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/major.h>
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+
+#include "dmapi_private.h"
+
+kmem_cache_t	*dm_fsreg_cachep = NULL;
+kmem_cache_t	*dm_tokdata_cachep = NULL;
+kmem_cache_t	*dm_session_cachep = NULL;
+
+static int
+dmapi_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+	    unsigned long arg)
+{
+	sys_dmapi_args_t kargs;
+	sys_dmapi_args_t *uap = &kargs;
+	int error = 0;
+	int rvp = -ENOSYS;
+	int use_rvp = 0;
+
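+	/* Every DMAPI ioctl is privileged; CAP_MKNOD is the capability
+	   used here to gate access to the whole interface.
+	*/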
+	if (!capable(CAP_MKNOD))
+		return(-EPERM);
+
+	if( copy_from_user( &kargs, (sys_dmapi_args_t*)arg,
+			   sizeof(sys_dmapi_args_t) ) )
+		return -EFAULT;
+
+	switch (cmd) {
+	case DM_CLEAR_INHERIT:
+		error = dm_clear_inherit(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_attrname_t *) DM_Parg(uap,5));/* attrnamep */
+		break;
+	case DM_CREATE_BY_HANDLE:
+		error = dm_create_by_handle(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* dirhanp */
+				(size_t)	DM_Uarg(uap,3), /* dirhlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(void *)	DM_Parg(uap,5), /* hanp */
+				(size_t)	DM_Uarg(uap,6), /* hlen */
+				(char *)	DM_Parg(uap,7));/* cname */
+		break;
+	case DM_CREATE_SESSION:
+		error = dm_create_session(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* oldsid */
+				(char *)	DM_Parg(uap,2), /* sessinfop */
+				(dm_sessid_t *) DM_Parg(uap,3));/* newsidp */
+		break;
+	case DM_CREATE_USEREVENT:
+		error = dm_create_userevent(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(size_t)	DM_Uarg(uap,2), /* msglen */
+				(void *)	DM_Parg(uap,3), /* msgdatap */
+				(dm_token_t *)	DM_Parg(uap,4));/* tokenp */
+		break;
+	case DM_DESTROY_SESSION:
+		error = dm_destroy_session(
+				(dm_sessid_t)	DM_Uarg(uap,1));/* sid */
+		break;
+	case DM_DOWNGRADE_RIGHT:
+		error = dm_downgrade_right(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4));/* token */
+		break;
+	case DM_FD_TO_HANDLE:
+		error = dm_fd_to_hdl(
+				(int)		DM_Uarg(uap,1), /* fd */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t *)	DM_Parg(uap,3));/* hlenp */
+		break;
+	case DM_FIND_EVENTMSG:
+		error = dm_find_eventmsg(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(dm_token_t)	DM_Uarg(uap,2), /* token */
+				(size_t)	DM_Uarg(uap,3), /* buflen */
+				(void *)	DM_Parg(uap,4), /* bufp */
+				(size_t *)	DM_Parg(uap,5));/* rlenp */
+		break;
+	case DM_GET_ALLOCINFO:
+		use_rvp = 1;
+		error = dm_get_allocinfo_rvp(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_off_t *)	DM_Parg(uap,5), /* offp */
+				(u_int)		DM_Uarg(uap,6), /* nelem */
+				(dm_extent_t *) DM_Parg(uap,7), /* extentp */
+				(u_int *)	DM_Parg(uap,8), /* nelemp */
+						&rvp);
+		break;
+	case DM_GET_BULKALL:
+		use_rvp = 1;
+		error = dm_get_bulkall_rvp(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(u_int)		DM_Uarg(uap,5), /* mask */
+				(dm_attrname_t *) DM_Parg(uap,6),/* attrnamep */
+				(dm_attrloc_t *) DM_Parg(uap,7),/* locp */
+				(size_t)	DM_Uarg(uap,8), /* buflen */
+				(void *)	DM_Parg(uap,9), /* bufp */
+				(size_t *)	DM_Parg(uap,10),/* rlenp */
+						&rvp);
+		break;
+	case DM_GET_BULKATTR:
+		use_rvp = 1;
+		error = dm_get_bulkattr_rvp(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(u_int)		DM_Uarg(uap,5), /* mask */
+				(dm_attrloc_t *)DM_Parg(uap,6), /* locp */
+				(size_t)	DM_Uarg(uap,7), /* buflen */
+				(void *)	DM_Parg(uap,8), /* bufp */
+				(size_t *)	DM_Parg(uap,9), /* rlenp */
+						&rvp);
+		break;
+	case DM_GET_CONFIG:
+		error = dm_get_config(
+				(void *)	DM_Parg(uap,1), /* hanp */
+				(size_t)	DM_Uarg(uap,2), /* hlen */
+				(dm_config_t)	DM_Uarg(uap,3), /* flagname */
+				(dm_size_t *)	DM_Parg(uap,4));/* retvalp */
+		break;
+	case DM_GET_CONFIG_EVENTS:
+		error = dm_get_config_events(
+				(void *)	DM_Parg(uap,1), /* hanp */
+				(size_t)	DM_Uarg(uap,2), /* hlen */
+				(u_int)		DM_Uarg(uap,3), /* nelem */
+				(dm_eventset_t *) DM_Parg(uap,4),/* eventsetp */
+				(u_int *)	DM_Parg(uap,5));/* nelemp */
+		break;
+	case DM_GET_DIOINFO:
+		error = dm_get_dioinfo(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_dioinfo_t *)DM_Parg(uap,5));/* diop */
+		break;
+	case DM_GET_DIRATTRS:
+		use_rvp = 1;
+		error = dm_get_dirattrs_rvp(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(u_int)		DM_Uarg(uap,5), /* mask */
+				(dm_attrloc_t *)DM_Parg(uap,6), /* locp */
+				(size_t)	DM_Uarg(uap,7), /* buflen */
+				(void *)	DM_Parg(uap,8), /* bufp */
+				(size_t *)	DM_Parg(uap,9), /* rlenp */
+						&rvp);
+		break;
+	case DM_GET_DMATTR:
+		error = dm_get_dmattr(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_attrname_t *) DM_Parg(uap,5),/* attrnamep */
+				(size_t)	DM_Uarg(uap,6), /* buflen */
+				(void *)	DM_Parg(uap,7), /* bufp */
+				(size_t *)	DM_Parg(uap,8));/* rlenp */
+		break;
+	case DM_GET_EVENTLIST:
+		error = dm_get_eventlist(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(u_int)		DM_Uarg(uap,5), /* nelem */
+				(dm_eventset_t *) DM_Parg(uap,6),/* eventsetp */
+				(u_int *)	DM_Parg(uap,7));/* nelemp */
+		break;
+	case DM_GET_EVENTS:
+		error = dm_get_events(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(u_int)		DM_Uarg(uap,2), /* maxmsgs */
+				(u_int)		DM_Uarg(uap,3), /* flags */
+				(size_t)	DM_Uarg(uap,4), /* buflen */
+				(void *)	DM_Parg(uap,5), /* bufp */
+				(size_t *)	DM_Parg(uap,6));/* rlenp */
+		break;
+	case DM_GET_FILEATTR:
+		error = dm_get_fileattr(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(u_int)		DM_Uarg(uap,5), /* mask */
+				(dm_stat_t *)	DM_Parg(uap,6));/* statp */
+		break;
+	case DM_GET_MOUNTINFO:
+		error = dm_get_mountinfo(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(size_t)	DM_Uarg(uap,5), /* buflen */
+				(void *)	DM_Parg(uap,6), /* bufp */
+				(size_t *)	DM_Parg(uap,7));/* rlenp */
+		break;
+	case DM_GET_REGION:
+		error = dm_get_region(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(u_int)		DM_Uarg(uap,5), /* nelem */
+				(dm_region_t *) DM_Parg(uap,6), /* regbufp */
+				(u_int *)	DM_Parg(uap,7));/* nelemp */
+		break;
+	case DM_GETALL_DISP:
+		error = dm_getall_disp(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(size_t)	DM_Uarg(uap,2), /* buflen */
+				(void *)	DM_Parg(uap,3), /* bufp */
+				(size_t *)	DM_Parg(uap,4));/* rlenp */
+		break;
+	case DM_GETALL_DMATTR:
+		error = dm_getall_dmattr(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(size_t)	DM_Uarg(uap,5), /* buflen */
+				(void *)	DM_Parg(uap,6), /* bufp */
+				(size_t *)	DM_Parg(uap,7));/* rlenp */
+		break;
+	case DM_GETALL_INHERIT:
+		error = dm_getall_inherit(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(u_int)		DM_Uarg(uap,5), /* nelem */
+				(dm_inherit_t *)DM_Parg(uap,6), /* inheritbufp*/
+				(u_int *)	DM_Parg(uap,7));/* nelemp */
+		break;
+	case DM_GETALL_SESSIONS:
+		error = dm_getall_sessions(
+				(u_int)		DM_Uarg(uap,1), /* nelem */
+				(dm_sessid_t *) DM_Parg(uap,2), /* sidbufp */
+				(u_int *)	DM_Parg(uap,3));/* nelemp */
+		break;
+	case DM_GETALL_TOKENS:
+		error = dm_getall_tokens(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(u_int)		DM_Uarg(uap,2), /* nelem */
+				(dm_token_t *)	DM_Parg(uap,3), /* tokenbufp */
+				(u_int *)	DM_Parg(uap,4));/* nelemp */
+		break;
+	case DM_INIT_ATTRLOC:
+		error = dm_init_attrloc(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_attrloc_t *) DM_Parg(uap,5));/* locp */
+		break;
+	case DM_MKDIR_BY_HANDLE:
+		error = dm_mkdir_by_handle(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* dirhanp */
+				(size_t)	DM_Uarg(uap,3), /* dirhlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(void *)	DM_Parg(uap,5), /* hanp */
+				(size_t)	DM_Uarg(uap,6), /* hlen */
+				(char *)	DM_Parg(uap,7));/* cname */
+		break;
+	case DM_MOVE_EVENT:
+		error = dm_move_event(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* srcsid */
+				(dm_token_t)	DM_Uarg(uap,2), /* token */
+				(dm_sessid_t)	DM_Uarg(uap,3), /* targetsid */
+				(dm_token_t *)	DM_Parg(uap,4));/* rtokenp */
+		break;
+	case DM_OBJ_REF_HOLD:
+		error = dm_obj_ref_hold(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(dm_token_t)	DM_Uarg(uap,2), /* token */
+				(void *)	DM_Parg(uap,3), /* hanp */
+				(size_t)	DM_Uarg(uap,4));/* hlen */
+		break;
+	case DM_OBJ_REF_QUERY:
+		use_rvp = 1;
+		error = dm_obj_ref_query_rvp(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(dm_token_t)	DM_Uarg(uap,2), /* token */
+				(void *)	DM_Parg(uap,3), /* hanp */
+				(size_t)	DM_Uarg(uap,4), /* hlen */
+						&rvp);
+		break;
+	case DM_OBJ_REF_RELE:
+		error = dm_obj_ref_rele(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(dm_token_t)	DM_Uarg(uap,2), /* token */
+				(void *)	DM_Parg(uap,3), /* hanp */
+				(size_t)	DM_Uarg(uap,4));/* hlen */
+		break;
+	case DM_PATH_TO_FSHANDLE:
+		error = dm_path_to_fshdl(
+				(char *)	DM_Parg(uap,1), /* path */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t *)	DM_Parg(uap,3));/* hlenp */
+		break;
+	case DM_PATH_TO_HANDLE:
+		error = dm_path_to_hdl(
+				(char *)	DM_Parg(uap,1), /* path */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t *)	DM_Parg(uap,3));/* hlenp */
+		break;
+	case DM_PENDING:
+		error = dm_pending(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(dm_token_t)	DM_Uarg(uap,2), /* token */
+				(dm_timestruct_t *) DM_Parg(uap,3));/* delay */
+		break;
+	case DM_PROBE_HOLE:
+		error = dm_probe_hole(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_off_t)	DM_Uarg(uap,5), /* off */
+				(dm_size_t)	DM_Uarg(uap,6), /* len */
+				(dm_off_t *)	DM_Parg(uap,7), /* roffp */
+				(dm_size_t *)	DM_Parg(uap,8));/* rlenp */
+		break;
+	case DM_PUNCH_HOLE:
+		error = dm_punch_hole(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_off_t)	DM_Uarg(uap,5), /* off */
+				(dm_size_t)	DM_Uarg(uap,6));/* len */
+		break;
+	case DM_QUERY_RIGHT:
+		error = dm_query_right(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_right_t *)	DM_Parg(uap,5));/* rightp */
+		break;
+	case DM_QUERY_SESSION:
+		error = dm_query_session(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(size_t)	DM_Uarg(uap,2), /* buflen */
+				(void *)	DM_Parg(uap,3), /* bufp */
+				(size_t *)	DM_Parg(uap,4));/* rlenp */
+		break;
+	case DM_READ_INVIS:
+		use_rvp = 1;
+		error = dm_read_invis_rvp(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_off_t)	DM_Uarg(uap,5), /* off */
+				(dm_size_t)	DM_Uarg(uap,6), /* len */
+				(void *)	DM_Parg(uap,7), /* bufp */
+						&rvp);
+		break;
+	case DM_RELEASE_RIGHT:
+		error = dm_release_right(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4));/* token */
+		break;
+	case DM_REMOVE_DMATTR:
+		error = dm_remove_dmattr(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(int)		DM_Uarg(uap,5), /* setdtime */
+				(dm_attrname_t *) DM_Parg(uap,6));/* attrnamep */
+		break;
+	case DM_REQUEST_RIGHT:
+		error = dm_request_right(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(u_int)		DM_Uarg(uap,5), /* flags */
+				(dm_right_t)	DM_Uarg(uap,6));/* right */
+		break;
+	case DM_RESPOND_EVENT:
+		error = dm_respond_event(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(dm_token_t)	DM_Uarg(uap,2), /* token */
+				(dm_response_t) DM_Uarg(uap,3), /* response */
+				(int)		DM_Uarg(uap,4), /* reterror */
+				(size_t)	DM_Uarg(uap,5), /* buflen */
+				(void *)	DM_Parg(uap,6));/* respbufp */
+		break;
+	case DM_SEND_MSG:
+		error = dm_send_msg(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* targetsid */
+				(dm_msgtype_t)	DM_Uarg(uap,2), /* msgtype */
+				(size_t)	DM_Uarg(uap,3), /* buflen */
+				(void *)	DM_Parg(uap,4));/* bufp */
+		break;
+	case DM_SET_DISP:
+		error = dm_set_disp(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_eventset_t *) DM_Parg(uap,5),/* eventsetp */
+				(u_int)		DM_Uarg(uap,6));/* maxevent */
+		break;
+	case DM_SET_DMATTR:
+		error = dm_set_dmattr(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_attrname_t *) DM_Parg(uap,5),/* attrnamep */
+				(int)		DM_Uarg(uap,6), /* setdtime */
+				(size_t)	DM_Uarg(uap,7), /* buflen */
+				(void *)	DM_Parg(uap,8));/* bufp */
+		break;
+	case DM_SET_EVENTLIST:
+		error = dm_set_eventlist(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_eventset_t *) DM_Parg(uap,5),/* eventsetp */
+				(u_int)		DM_Uarg(uap,6));/* maxevent */
+		break;
+	case DM_SET_FILEATTR:
+		error = dm_set_fileattr(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(u_int)		DM_Uarg(uap,5), /* mask */
+				(dm_fileattr_t *)DM_Parg(uap,6));/* attrp */
+		break;
+	case DM_SET_INHERIT:
+		error = dm_set_inherit(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_attrname_t *)DM_Parg(uap,5),/* attrnamep */
+				(mode_t)	DM_Uarg(uap,6));/* mode */
+		break;
+	case DM_SET_REGION:
+		error = dm_set_region(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(u_int)		DM_Uarg(uap,5), /* nelem */
+				(dm_region_t *) DM_Parg(uap,6), /* regbufp */
+				(dm_boolean_t *) DM_Parg(uap,7));/* exactflagp */
+		break;
+	case DM_SET_RETURN_ON_DESTROY:
+		error = dm_set_return_on_destroy(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(dm_attrname_t *) DM_Parg(uap,5),/* attrnamep */
+				(dm_boolean_t)	DM_Uarg(uap,6));/* enable */
+		break;
+	case DM_SYMLINK_BY_HANDLE:
+		error = dm_symlink_by_handle(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* dirhanp */
+				(size_t)	DM_Uarg(uap,3), /* dirhlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(void *)	DM_Parg(uap,5), /* hanp */
+				(size_t)	DM_Uarg(uap,6), /* hlen */
+				(char *)	DM_Parg(uap,7), /* cname */
+				(char *)	DM_Parg(uap,8));/* path */
+		break;
+	case DM_SYNC_BY_HANDLE:
+		error = dm_sync_by_handle(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4));/* token */
+		break;
+	case DM_UPGRADE_RIGHT:
+		error = dm_upgrade_right(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4));/* token */
+		break;
+	case DM_WRITE_INVIS:
+		use_rvp = 1;
+		error = dm_write_invis_rvp(
+				(dm_sessid_t)	DM_Uarg(uap,1), /* sid */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(dm_token_t)	DM_Uarg(uap,4), /* token */
+				(int)		DM_Uarg(uap,5), /* flags */
+				(dm_off_t)	DM_Uarg(uap,6), /* off */
+				(dm_size_t)	DM_Uarg(uap,7), /* len */
+				(void *)	DM_Parg(uap,8), /* bufp */
+						&rvp);
+		break;
+	case DM_OPEN_BY_HANDLE:
+		use_rvp = 1;
+		error = dm_open_by_handle_rvp(
+				(unsigned int)	DM_Uarg(uap,1), /* fd */
+				(void *)	DM_Parg(uap,2), /* hanp */
+				(size_t)	DM_Uarg(uap,3), /* hlen */
+				(int)		DM_Uarg(uap,4), /* flags */
+						&rvp);
+		break;
+	default:
+		error = ENOSYS;
+		break;
+	}
+	/*
+	 * For the *_rvp() functions a successful call returns the
+	 * positive rvp value; all other cases return a negated errno.
+	 */
+	if( use_rvp && (error == 0) )
+		return rvp;
+	else
+		return -error;
+}
+
+
+
+static int
+dmapi_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+
+static int
+dmapi_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+
+/* say hello, and let me know the device is hooked up */
+static ssize_t
+dmapi_dump(struct file *file, char *buf, size_t count, loff_t *ppos)
+{
+	char tmp[50];
+	int len;
+	if( *ppos == 0 ){
+		len = sprintf( tmp, "# " DM_VER_STR_CONTENTS "\n" );
+		if( copy_to_user(buf, tmp, len) )
+			return -EFAULT;
+		*ppos += 1;
+		return len;
+	}
+	return 0;
+}
+
+static struct file_operations dmapi_fops = {
+	open:		dmapi_open,
+	ioctl:		dmapi_ioctl,
+	read:		dmapi_dump,
+	release:	dmapi_release
+};
+
+static struct miscdevice dmapi_dev = {
+	minor:	DMAPI_MINOR,
+	name:	"dmapi",
+	fops:	&dmapi_fops
+};
+
+
+
+#ifdef CONFIG_PROC_FS
+static int
+dmapi_summary(char *buffer, char **start, off_t offset,
+		 int count, int *eof, void *data)
+{
+	int len;
+
+	extern u_int dm_sessions_active;
+	extern dm_sessid_t dm_next_sessid;
+	extern dm_token_t dm_next_token;
+	extern dm_sequence_t dm_next_sequence;
+	extern int dm_fsys_cnt;
+
+#define CHKFULL if(len >= count) break;
+#define ADDBUF(a,b)	len += sprintf(buffer + len, a, b); CHKFULL;
+
+	len=0;
+	while(1){
+		ADDBUF("dm_sessions_active=%u\n", dm_sessions_active);
+		ADDBUF("dm_next_sessid=%d\n", (int)dm_next_sessid);
+		ADDBUF("dm_next_token=%d\n", (int)dm_next_token);
+		ADDBUF("dm_next_sequence=%u\n", (u_int)dm_next_sequence);
+		ADDBUF("dm_fsys_cnt=%d\n", dm_fsys_cnt);
+
+		break;
+	}
+
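+	/*
+	 * Standard 2.4 read_proc bookkeeping: return the slice of the
+	 * formatted text selected by offset/count via *start, and set
+	 * *eof once the caller has consumed everything up to len.
+	 */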
+	if (offset >= len) {
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)
+		return count;
+	*eof = 1;
+
+	return len;
+}
+#endif
+
+
+void
+dmapi_init_procfs(void)
+{
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *entry;
+
+	if ((entry = proc_mkdir( DMAPI_DBG_PROCFS, 0)) == NULL )
+		return;
+	entry->owner = THIS_MODULE;
+	entry->mode = S_IFDIR | S_IRUSR | S_IXUSR;
+
+	if ((entry = proc_mkdir( DMAPI_DBG_PROCFS "/fsreg", 0)) == NULL )
+		return;
+	entry->owner = THIS_MODULE;
+
+	if ((entry = proc_mkdir( DMAPI_DBG_PROCFS "/sessions", 0)) == NULL )
+		return;
+	entry->owner = THIS_MODULE;
+
+	entry = create_proc_read_entry( DMAPI_DBG_PROCFS "/summary", 0, 0, dmapi_summary, NULL);
+	entry->owner = THIS_MODULE;
+
+	entry = proc_mknod( DMAPI_PROCFS, S_IFCHR | S_IRUSR | S_IWUSR,
+			   NULL, MKDEV(MISC_MAJOR,DMAPI_MINOR));
+	if( entry == NULL )
+		return;
+	entry->owner = THIS_MODULE;
+#endif
+}
+
+void
+dmapi_cleanup_procfs(void)
+{
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry( DMAPI_PROCFS, NULL);
+	remove_proc_entry( DMAPI_DBG_PROCFS "/summary", NULL);
+	remove_proc_entry( DMAPI_DBG_PROCFS "/fsreg", NULL);
+	remove_proc_entry( DMAPI_DBG_PROCFS "/sessions", NULL);
+	remove_proc_entry( DMAPI_DBG_PROCFS, NULL);
+#endif
+}
+
+
+int __init dmapi_init(void)
+{
+	int ret;
+
+	dm_tokdata_cachep = kmem_cache_create("dm_tokdata",
+				sizeof(struct dm_tokdata), 0, 0, NULL, NULL);
+	if (dm_tokdata_cachep == NULL)
+		return -ENOMEM;
+
+	dm_fsreg_cachep = kmem_cache_create("dm_fsreg",
+				sizeof(struct dm_fsreg), 0, 0, NULL, NULL);
+	if (dm_fsreg_cachep == NULL) {
+		kmem_cache_destroy(dm_tokdata_cachep);
+		return -ENOMEM;
+	}
+
+	dm_session_cachep = kmem_cache_create("dm_session",
+				sizeof(struct dm_session), 0, 0, NULL, NULL);
+	if (dm_session_cachep == NULL) {
+		kmem_cache_destroy(dm_tokdata_cachep);
+		kmem_cache_destroy(dm_fsreg_cachep);
+		return -ENOMEM;
+	}
+
+	ret = misc_register(&dmapi_dev);
+	if( ret != 0 )
+		printk(KERN_ERR "dmapi_init: misc_register returned %d\n", ret);
+	dmapi_init_procfs();
+	return(0);
+}
+
+void __exit dmapi_uninit(void)
+{
+	misc_deregister(&dmapi_dev);
+	dmapi_cleanup_procfs();
+	kmem_cache_destroy(dm_tokdata_cachep);
+	kmem_cache_destroy(dm_fsreg_cachep);
+	kmem_cache_destroy(dm_session_cachep);
+	dm_fsys_vector_free();
+}
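
For orientation, the user-visible contract of the dispatcher above: each DMAPI
call arrives as a single ioctl on the "dmapi" misc device registered earlier,
with its arguments packed into one structure that the DM_Uarg()/DM_Parg()
macros decode.  A non-negative return carries an *_rvp() result; failures
surface to userspace as -1 with errno set.  A minimal caller sketch under that
convention (the argument structure, command number, and /dev/dmapi path are
assumptions for illustration, not the real libdm interface):

	#include <sys/ioctl.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <errno.h>

	struct dm_args {			/* hypothetical stand-in for uap */
		unsigned long	arg[10];
	};

	int dm_call(int cmd, struct dm_args *ap)
	{
		int	fd, ret;

		fd = open("/dev/dmapi", O_RDONLY);	/* assumed device path */
		if (fd < 0)
			return -errno;
		ret = ioctl(fd, cmd, ap);	/* lands in dmapi_ioctl() */
		if (ret < 0)
			ret = -errno;		/* save before close() clobbers it */
		close(fd);
		return ret;	/* >= 0 is rvp for the *_rvp() cases */
	}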
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/Makefile linux-2.4.20-pre10-mjc2/fs/xfs/linux/Makefile
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/Makefile	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/Makefile	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,65 @@
+#
+#
+# Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+# Makefile for XFS on Linux.
+
+# This needs -I.. because everything does #include <xfs.h> instead of "xfs.h".
+# The code is wrong: local files should be included using "xfs.h", not <xfs.h>,
+# but I am not going to change every file at the moment.
+EXTRA_CFLAGS += -I.. -funsigned-char
+
+ifeq ($(CONFIG_XFS_DEBUG),y)
+	EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG -DXFSDEBUG
+endif
+
+O_TARGET			:= linux_xfs.o
+ifneq ($(MAKECMDGOALS),modules_install)
+  obj-m				:= $(O_TARGET)
+endif
+
+export-objs			:= xfs_globals.o
+
+obj-$(CONFIG_PROC_FS)		+= xfs_stats.o
+obj-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
+
+obj-y				+= xfs_aops.o \
+				   xfs_behavior.o \
+				   xfs_file.o \
+				   xfs_fs_subr.o \
+				   xfs_globals.o \
+				   xfs_ioctl.o \
+				   xfs_iops.o \
+				   xfs_lrw.o \
+				   xfs_super.o \
+				   xfs_vnode.o
+
+include $(TOPDIR)/Rules.make
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/Makefile.in linux-2.4.20-pre10-mjc2/fs/xfs/linux/Makefile.in
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/Makefile.in	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/Makefile.in	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,44 @@
+#
+#
+# Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+# Makefile for XFS on Linux.
+
+expsyms(xfs_globals.o)
+
+objlink(linux_xfs.o xfs_behavior.o xfs_cred.o xfs_file.o xfs_fs_subr.o
+	xfs_globals.o xfs_ioctl.o xfs_iops.o xfs_lrw.o xfs_stats.o xfs_sysctl.o
+	xfs_super.o xfs_vfs.o xfs_vnode.o xfs_aops.o)
+
+# No select() for linux_xfs in this directory.	It is a sub-component of XFS,
+# see fs/xfs/Makefile.in for the objlink.
+
+extra_cflags_all($(XFS_EXTRA_CFLAGS))
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_aops.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_aops.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_aops.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_aops.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,912 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/mm.h>
+#include <linux/iobuf.h>
+#include <linux/locks.h>
+#include <linux/pagemap.h>
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,9)
+#define page_buffers(page)	((page)->buffers)
+#define page_has_buffers(page)	((page)->buffers)
+#endif
+
+STATIC int linvfs_pb_bmap(struct inode *, loff_t, ssize_t,
+			  struct page_buf_bmap_s *, int);
+STATIC int delalloc_convert(struct inode *, struct page *, int, int);
+
+/*
+ * match_offset_to_mapping
+ * Finds the mapping in the @map array that corresponds to the given
+ * @offset within a @page.
+ */
+STATIC page_buf_bmap_t *
+match_offset_to_mapping(
+	struct page		*page,
+	page_buf_bmap_t		*map,
+	unsigned long		offset)
+{
+	loff_t			full_offset;	/* offset from start of file */
+
+	ASSERT(offset < PAGE_CACHE_SIZE);
+
+	full_offset = page->index;		/* NB: using 64bit number */
+	full_offset <<= PAGE_CACHE_SHIFT;	/* offset from file start */
+	full_offset += offset;			/* offset from page start */
+
+	if (full_offset < map->pbm_offset)
+		return NULL;
+	if (map->pbm_offset + map->pbm_bsize > full_offset)
+		return map;
+	return NULL;
+}
+
+STATIC void
+map_buffer_at_offset(
+	struct page		*page,
+	struct buffer_head	*bh,
+	unsigned long		offset,
+	int			block_bits,
+	page_buf_bmap_t		*mp)
+{
+	page_buf_daddr_t	bn;
+	loff_t			delta;
+	int			sector_shift;
+
+	ASSERT(!(mp->pbm_flags & PBMF_HOLE));
+	ASSERT(!(mp->pbm_flags & PBMF_DELAY));
+	ASSERT(!(mp->pbm_flags & PBMF_UNWRITTEN));
+	ASSERT(mp->pbm_bn != PAGE_BUF_DADDR_NULL);
+
+	delta = page->index;
+	delta <<= PAGE_CACHE_SHIFT;
+	delta += offset;
+	delta -= mp->pbm_offset;
+	delta >>= block_bits;
+
+	sector_shift = block_bits - 9;
+	bn = mp->pbm_bn >> sector_shift;
+	bn += delta;
+	ASSERT((bn << sector_shift) >= mp->pbm_bn);
+
+	lock_buffer(bh);
+	bh->b_blocknr = bn;
+	bh->b_dev = mp->pbm_target->pbr_kdev;
+	set_bit(BH_Mapped, &bh->b_state);
+	clear_bit(BH_Delay, &bh->b_state);
+}
+
+/*
+ * Convert delalloc space to real space, do not flush the
+ * data out to disk, that will be done by the caller.
+ */
+STATIC int
+release_page(
+	struct page		*page)
+{
+	struct inode		*inode = (struct inode*)page->mapping->host;
+	unsigned long		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int			ret;
+
+	/* Are we off the end of the file ? */
+	if (page->index >= end_index) {
+		unsigned offset = inode->i_size & (PAGE_CACHE_SIZE-1);
+		if ((page->index >= end_index+1) || !offset) {
+			ret = -EIO;
+			goto out;
+		}
+	}
+
+	ret = delalloc_convert(inode, page, 0, 0);
+
+out:
+	if (ret < 0) {
+		block_flushpage(page, 0);
+		ClearPageUptodate(page);
+
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Convert delalloc or unmapped space to real space and flush out
+ * to disk.
+ */
+STATIC int
+write_full_page(
+	struct page		*page,
+	int			delalloc)
+{
+	struct inode		*inode = (struct inode*)page->mapping->host;
+	unsigned long		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int			ret;
+
+	/* Are we off the end of the file ? */
+	if (page->index >= end_index) {
+		unsigned offset = inode->i_size & (PAGE_CACHE_SIZE-1);
+		if ((page->index >= end_index+1) || !offset) {
+			ret = -EIO;
+			goto out;
+		}
+	}
+
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
+	}
+
+	ret = delalloc_convert(inode, page, 1, delalloc == 0);
+
+out:
+	if (ret < 0) {
+		/*
+		 * If it's delalloc and we have nowhere to put it,
+		 * throw it away.
+		 */
+		if (delalloc)
+			block_flushpage(page, 0);
+		ClearPageUptodate(page);
+		unlock_page(page);
+	}
+
+	return ret;
+}
+
+/*
+ * Look for a page at index which is unlocked and not mapped
+ * yet - clustering for mmap write case.
+ */
+STATIC unsigned int
+probe_unmapped_page(
+	struct address_space	*mapping,
+	unsigned long		index,
+	unsigned int		pg_offset)
+{
+	struct page		*page;
+	int			ret = 0;
+
+	page = find_get_page(mapping, index);
+	if (!page)
+		return 0;
+	if (TryLockPage(page)) {
+		page_cache_release(page);
+		return 0;
+	}
+	if (page->mapping && PageDirty(page)) {
+		if (!page_has_buffers(page)) {
+			ret = PAGE_CACHE_SIZE;
+		} else {
+			struct buffer_head	*bh, *head;
+			bh = head = page_buffers(page);
+			do {
+				if (buffer_mapped(bh) || !buffer_uptodate(bh)) {
+					break;
+				}
+				ret += bh->b_size;
+				if (ret >= pg_offset)
+					break;
+			} while ((bh = bh->b_this_page) != head);
+		}
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+
+STATIC unsigned int
+probe_unmapped_cluster(
+	struct inode		*inode,
+	struct page		*startpage,
+	struct buffer_head	*bh,
+	struct buffer_head	*head)
+{
+	unsigned long		tindex, tlast;
+	unsigned int		len, total = 0;
+	struct address_space	*mapping = inode->i_mapping;
+
+	/* First sum forwards in this page */
+	do {
+		if (buffer_mapped(bh))
+			break;
+		total += bh->b_size;
+	} while ((bh = bh->b_this_page) != head);
+
+	/* if we reached the end of the page, sum forwards in
+	 * following pages.
+	 */
+	if (bh == head) {
+		tlast = inode->i_size >> PAGE_CACHE_SHIFT;
+		for (tindex = startpage->index + 1; tindex < tlast; tindex++) {
+			len = probe_unmapped_page(mapping, tindex,
+							PAGE_CACHE_SIZE);
+			if (!len)
+				break;
+			total += len;
+		}
+		if ((tindex == tlast) && (inode->i_size & ~PAGE_CACHE_MASK)) {
+			len = probe_unmapped_page(mapping, tindex,
+					inode->i_size & ~PAGE_CACHE_MASK);
+			total += len;
+		}
+	}
+	return total;
+}
+
+/*
+ * Probe for a given page (index) in the inode & test if it is delayed.
+ * Returns page locked and with an extra reference count.
+ */
+STATIC struct page *
+probe_page(
+	struct inode		*inode,
+	unsigned long		index)
+{
+	struct page		*page;
+
+	page = find_get_page(inode->i_mapping, index);
+	if (!page)
+		return NULL;
+	if (TryLockPage(page)) {
+		page_cache_release(page);
+		return NULL;
+	}
+	if (page->mapping && page_has_buffers(page)) {
+		struct buffer_head	*bh, *head;
+
+		bh = head = page_buffers(page);
+		do {
+			if (buffer_delay(bh))
+				return page;
+		} while ((bh = bh->b_this_page) != head);
+	}
+	unlock_page(page);
+	page_cache_release(page);
+	return NULL;
+}
+
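+/*
+ * Mark each selected buffer clean, uptodate and async, then start
+ * write I/O on it; if no buffers were selected just unlock the page.
+ */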
+STATIC void
+submit_page(
+	struct page		*page,
+	struct buffer_head	*bh_arr[],
+	int			cnt)
+{
+	struct buffer_head	*bh;
+	int			i;
+
+	if (cnt) {
+		for (i = 0; i < cnt; i++) {
+			bh = bh_arr[i];
+			set_buffer_async_io(bh);
+			set_bit(BH_Uptodate, &bh->b_state);
+			clear_bit(BH_Dirty, &bh->b_state);
+		}
+
+		for (i = 0; i < cnt; i++)
+			submit_bh(WRITE, bh_arr[i]);
+	} else
+		unlock_page(page);
+}
+
+STATIC int
+map_page(
+	struct inode		*inode,
+	struct page		*page,
+	page_buf_bmap_t		*maps,
+	struct buffer_head	*bh_arr[],
+	int			startio,
+	int			all_bh)
+{
+	struct buffer_head	*bh, *head;
+	page_buf_bmap_t		*mp = maps, *tmp;
+	unsigned long		end, offset, end_index;
+	int			i = 0, index = 0;
+	int			bbits = inode->i_blkbits;
+
+	end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	if (page->index < end_index) {
+		end = PAGE_CACHE_SIZE;
+	} else {
+		end = inode->i_size & (PAGE_CACHE_SIZE-1);
+	}
+	bh = head = page_buffers(page);
+	do {
+		offset = i << bbits;
+		if (!buffer_uptodate(bh))
+			continue;
+		if (buffer_mapped(bh) && !buffer_delay(bh) && all_bh) {
+			if (startio && (offset < end)) {
+				lock_buffer(bh);
+				bh_arr[index++] = bh;
+			}
+			continue;
+		}
+		tmp = match_offset_to_mapping(page, mp, offset);
+		if (!tmp)
+			continue;
+		ASSERT(!(tmp->pbm_flags & PBMF_HOLE));
+		ASSERT(!(tmp->pbm_flags & PBMF_DELAY));
+		map_buffer_at_offset(page, bh, offset, bbits, tmp);
+		if (startio && (offset < end)) {
+			bh_arr[index++] = bh;
+		} else {
+			unlock_buffer(bh);
+		}
+	} while (i++, (bh = bh->b_this_page) != head);
+
+	return index;
+}
+
+/*
+ * Allocate & map buffers for page given the extent map, then write it
+ * out.  Except for the original page of a writepage, this is called on
+ * delalloc pages only; for the original page it is possible that the
+ * page has no mapping at all.
+ */
+STATIC void
+convert_page(
+	struct inode		*inode,
+	struct page		*page,
+	page_buf_bmap_t		*maps,
+	int			startio,
+	int			all_bh)
+{
+	struct buffer_head	*bh_arr[MAX_BUF_PER_PAGE];
+	int			cnt;
+
+	cnt = map_page(inode, page, maps, bh_arr, startio, all_bh);
+	submit_page(page, bh_arr, cnt);
+	page_cache_release(page);
+}
+
+/*
+ * Convert & write out a cluster of pages in the same extent as defined
+ * by mp and following the start page.
+ */
+STATIC void
+cluster_write(
+	struct inode		*inode,
+	unsigned long		tindex,
+	page_buf_bmap_t		*mp,
+	int			startio,
+	int			all_bh)
+{
+	unsigned long		tlast;
+	struct page		*page;
+
+	tlast = (mp->pbm_offset + mp->pbm_bsize) >> PAGE_CACHE_SHIFT;
+	for (; tindex < tlast; tindex++) {
+		if (!(page = probe_page(inode, tindex)))
+			break;
+		convert_page(inode, page, mp, startio, all_bh);
+	}
+}
+
+/*
+ * Calling this without allocate_space set means we are being asked to
+ * flush a dirty buffer head.  When called with startio set we are
+ * coming from writepage.  A writepage call with allocate_space set
+ * means we are being asked to write out all of the page which is
+ * before EOF and therefore need to allocate space for unmapped
+ * portions of the page.
+ */
+STATIC int
+delalloc_convert(
+	struct inode		*inode,		/* inode containing page */
+	struct page		*page,		/* page to convert - locked */
+	int			startio,	/* start io on the page */
+	int			allocate_space)
+{
+	struct buffer_head	*bh, *head;
+	struct buffer_head	*bh_arr[MAX_BUF_PER_PAGE];
+	page_buf_bmap_t		*mp, map;
+	int			i, cnt = 0;
+	int			len, err;
+	unsigned long		p_offset = 0;
+	loff_t			offset;
+	loff_t			end_offset;
+
+	offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	end_offset = offset + PAGE_CACHE_SIZE;
+	if (end_offset > inode->i_size)
+		end_offset = inode->i_size;
+
+	bh = head = page_buffers(page);
+	mp = NULL;
+
+	len = bh->b_size;
+	do {
+		if (!buffer_uptodate(bh) && !startio) {
+			goto next_bh;
+		}
+
+		if (mp) {
+			mp = match_offset_to_mapping(page, &map, p_offset);
+		}
+
+		if (buffer_delay(bh)) {
+			if (!mp) {
+				err = linvfs_pb_bmap(inode, offset, len, &map,
+						PBF_WRITE|PBF_FILE_ALLOCATE);
+				if (err)
+					goto error;
+				mp = match_offset_to_mapping(page, &map,
+								p_offset);
+			}
+			if (mp) {
+				map_buffer_at_offset(page, bh, p_offset,
+					inode->i_blkbits, mp);
+				if (startio) {
+					bh_arr[cnt++] = bh;
+				} else {
+					unlock_buffer(bh);
+				}
+			}
+		} else if (!buffer_mapped(bh) && allocate_space) {
+			int	size;
+
+			/* Getting here implies an unmapped buffer was found,
+			 * and we are in a path where we need to write the
+			 * whole page out.
+			 */
+			if (!mp) {
+				size = probe_unmapped_cluster(inode, page,
+								bh, head);
+				err = linvfs_pb_bmap(inode, offset, size, &map,
+						PBF_WRITE|PBF_DIRECT);
+				if (err)
+					goto error;
+				mp = match_offset_to_mapping(page, &map,
+								p_offset);
+			}
+			if (mp) {
+				map_buffer_at_offset(page, bh, p_offset,
+					inode->i_blkbits, mp);
+				if (startio) {
+					bh_arr[cnt++] = bh;
+				} else {
+					unlock_buffer(bh);
+				}
+			}
+		} else if (startio && buffer_mapped(bh)) {
+			if (buffer_dirty(bh) || allocate_space) {
+				lock_buffer(bh);
+				bh_arr[cnt++] = bh;
+			}
+		}
+
+next_bh:
+		offset += len;
+		p_offset += len;
+		bh = bh->b_this_page;
+	} while (offset < end_offset);
+
+	if (startio)
+		submit_page(page, bh_arr, cnt);
+
+	if (mp)
+		cluster_write(inode, page->index + 1, mp,
+				startio, allocate_space);
+
+	return 0;
+
+error:
+	for (i = 0; i < cnt; i++) {
+		unlock_buffer(bh_arr[i]);
+	}
+
+	return err;
+}
+
+STATIC int
+linvfs_get_block_core(
+	struct inode		*inode,
+	long			iblock,
+	struct buffer_head	*bh_result,
+	int			create,
+	int			direct,
+	page_buf_flags_t	flags)
+{
+	vnode_t			*vp = LINVFS_GET_VP(inode);
+	page_buf_bmap_t		pbmap;
+	int			retpbbm = 1;
+	int			error;
+	ssize_t			size;
+	loff_t			offset = (loff_t)iblock << inode->i_blkbits;
+
+	/* If we are doing writes at the end of the file,
+	 * allocate in chunks
+	 */
+	if (create && (offset >= inode->i_size) && !(flags & PBF_SYNC))
+		size = 1 << XFS_WRITE_IO_LOG;
+	else
+		size = 1 << inode->i_blkbits;
+
+	VOP_BMAP(vp, offset, size,
+		create ? flags : PBF_READ, NULL,
+		(struct page_buf_bmap_s *)&pbmap, &retpbbm, error);
+	if (error)
+		return -error;
+
+	if (retpbbm == 0)
+		return 0;
+
+	if (pbmap.pbm_bn != PAGE_BUF_DADDR_NULL) {
+		page_buf_daddr_t	bn;
+		loff_t			delta;
+
+		delta = offset - pbmap.pbm_offset;
+		delta >>= inode->i_blkbits;
+
+		bn = pbmap.pbm_bn >> (inode->i_blkbits - 9);
+		bn += delta;
+
+		bh_result->b_blocknr = bn;
+		set_bit(BH_Mapped, &bh_result->b_state);
+	}
+
+	/* If we previously allocated a block out beyond eof and
+	 * we are now coming back to use it then we will need to
+	 * flag it as new even if it has a disk address.
+	 */
+	if (create &&
+	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
+	     (offset >= inode->i_size))) {
+		set_bit(BH_New, &bh_result->b_state);
+	}
+
+	if (pbmap.pbm_flags & PBMF_DELAY) {
+		if (unlikely(direct))
+			BUG();
+
+		if (create) {
+			set_bit(BH_Mapped, &bh_result->b_state);
+			set_bit(BH_Uptodate, &bh_result->b_state);
+		}
+		set_bit(BH_Delay, &bh_result->b_state);
+	}
+
+	return 0;
+}
+
+int
+linvfs_get_block(
+	struct inode		*inode,
+	long			iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return linvfs_get_block_core(inode, iblock, bh_result,
+					create, 0, PBF_WRITE);
+}
+
+STATIC int
+linvfs_get_block_sync(
+	struct inode		*inode,
+	long			iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return linvfs_get_block_core(inode, iblock, bh_result,
+					create, 0, PBF_SYNC|PBF_WRITE);
+}
+
+
+STATIC int
+linvfs_get_block_direct(
+	struct inode		*inode,
+	long			iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return linvfs_get_block_core(inode, iblock, bh_result,
+					create, 1, PBF_WRITE|PBF_DIRECT);
+}
+
+STATIC int
+linvfs_pb_bmap(
+	struct inode		*inode,
+	loff_t			offset,
+	ssize_t			count,
+	page_buf_bmap_t		*pbmapp,
+	int			flags)
+{
+	vnode_t			*vp = LINVFS_GET_VP(inode);
+	int			error, nmaps = 1;
+
+retry:
+	if (flags & PBF_FILE_ALLOCATE) {
+		VOP_STRATEGY(vp, offset, count, flags, NULL,
+				pbmapp, &nmaps, error);
+	} else {
+		VOP_BMAP(vp, offset, count, flags, NULL,
+				pbmapp, &nmaps, error);
+	}
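+	/*
+	 * A direct write that maps into delayed-allocate space cannot
+	 * proceed until real blocks exist, so redo the mapping with
+	 * PBF_FILE_ALLOCATE to convert the delalloc extent first.
+	 */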
+	if (flags & PBF_WRITE) {
+		if (unlikely((flags & PBF_DIRECT) && nmaps &&
+		    (pbmapp->pbm_flags & PBMF_DELAY))) {
+			flags = PBF_WRITE | PBF_FILE_ALLOCATE;
+			goto retry;
+		}
+		VMODIFY(vp);
+	}
+	return -error;
+}
+
+STATIC int
+linvfs_bmap(
+	struct address_space	*mapping,
+	long			block)
+{
+	struct inode		*inode = (struct inode *)mapping->host;
+	vnode_t			*vp = LINVFS_GET_VP(inode);
+	int			error;
+
+	/* block	     - Linux disk blocks    512b */
+	/* bmap input offset - bytes		      1b */
+	/* bmap output bn    - XFS BBs		    512b */
+	/* bmap output delta - bytes		      1b */
+
+	vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address);
+
+	VOP_RWLOCK(vp, VRWLOCK_READ);
+	VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
+	VOP_RWUNLOCK(vp, VRWLOCK_READ);
+	return generic_block_bmap(mapping, block, linvfs_get_block_direct);
+}
+
+STATIC int
+linvfs_read_full_page(
+	struct file		*unused,
+	struct page		*page)
+{
+	return block_read_full_page(page, linvfs_get_block);
+}
+
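+/*
+ * Count the delayed-allocation buffers and the uptodate-but-unmapped
+ * buffers attached to a page; returns 0 if the page has no buffers.
+ */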
+STATIC int
+count_page_state(
+	struct page		*page,
+	int			*nr_delalloc,
+	int			*nr_unmapped)
+{
+	*nr_delalloc = *nr_unmapped = 0;
+
+	if (page_has_buffers(page)) {
+		struct buffer_head	*bh, *head;
+
+		bh = head = page_buffers(page);
+		do {
+			if (buffer_uptodate(bh) && !buffer_mapped(bh)) {
+				(*nr_unmapped)++;
+				continue;
+			}
+			if (!buffer_delay(bh))
+				continue;
+			(*nr_delalloc)++;
+		} while ((bh = bh->b_this_page) != head);
+		return 1;
+	}
+
+	return 0;
+}
+
+STATIC int
+linvfs_write_full_page(
+	struct page		*page)
+{
+	int			flagset = 0;
+	int			error;
+	int			need_trans;
+	int			nr_delalloc, nr_unmapped;
+
+	if (count_page_state(page, &nr_delalloc, &nr_unmapped)) {
+		need_trans = nr_delalloc + nr_unmapped;
+	} else {
+		need_trans = 1;
+	}
+
+	if ((current->flags & (PF_FSTRANS|PF_NOIO)) && need_trans)
+		goto out_fail;
+
+	if (need_trans) {
+		current->flags |= PF_NOIO;
+		flagset = 1;
+	}
+
+	error = write_full_page(page, nr_delalloc);
+
+	if (flagset)
+		current->flags &= ~PF_NOIO;
+	return error;
+
+out_fail:
+	SetPageDirty(page);
+	unlock_page(page);
+	return 0;
+}
+
+STATIC int
+linvfs_prepare_write(
+	struct file		*file,
+	struct page		*page,
+	unsigned int		from,
+	unsigned int		to)
+{
+	if (file && (file->f_flags & O_SYNC)) {
+		return block_prepare_write(page, from, to,
+						linvfs_get_block_sync);
+	} else {
+		return block_prepare_write(page, from, to,
+						linvfs_get_block);
+	}
+}
+
+/*
+ * Initiate I/O on a kiobuf of user memory
+ */
+STATIC int
+linvfs_direct_IO(
+	int			rw,
+	struct inode		*inode,
+	struct kiobuf		*iobuf,
+	unsigned long		blocknr,
+	int			blocksize)
+{
+	struct page		**maplist;
+	size_t			page_offset;
+	page_buf_t		*pb;
+	page_buf_bmap_t		map;
+	int			error = 0;
+	int			pb_flags, map_flags, pg_index = 0;
+	size_t			length, total;
+	loff_t			offset;
+	size_t			map_size, size;
+
+	total = length = iobuf->length;
+	offset = blocknr;
+	offset <<= inode->i_blkbits;
+
+	maplist = iobuf->maplist;
+	page_offset = iobuf->offset;
+
+	map_flags = (rw ? PBF_WRITE : PBF_READ) | PBF_DIRECT;
+	pb_flags = (rw ? PBF_WRITE : PBF_READ) | PBF_FORCEIO | _PBF_LOCKABLE;
+	while (length) {
+		error = linvfs_pb_bmap(inode, offset, length, &map, map_flags);
+		if (error)
+			break;
+
+		map_size = map.pbm_bsize - map.pbm_delta;
+		size = min(map_size, length);
+		if (map.pbm_flags & PBMF_HOLE) {
+			size_t	zero_len = size;
+
+			if (rw == WRITE)
+				break;
+
+			/* Need to zero it all */
+			while (zero_len) {
+				struct page	*page;
+				size_t		pg_len;
+
+				pg_len = min((size_t)
+						(PAGE_CACHE_SIZE - page_offset),
+						zero_len);
+
+				page = maplist[pg_index];
+
+				memset(kmap(page) + page_offset, 0, pg_len);
+				flush_dcache_page(page);
+				kunmap(page);
+
+				zero_len -= pg_len;
+				if ((pg_len + page_offset) == PAGE_CACHE_SIZE) {
+					pg_index++;
+					page_offset = 0;
+				} else {
+					page_offset = (page_offset + pg_len) &
+							~PAGE_CACHE_MASK;
+				}
+			}
+		} else {
+			int	pg_count;
+
+			pg_count = (size + page_offset + PAGE_CACHE_SIZE - 1)
+					>> PAGE_CACHE_SHIFT;
+			pb = pagebuf_lookup(map.pbm_target, inode, offset,
+							size, pb_flags);
+			/* Need to hook up pagebuf to kiobuf pages */
+			pb->pb_pages = &maplist[pg_index];
+			pb->pb_offset = page_offset;
+			pb->pb_page_count = pg_count;
+
+			pb->pb_bn = map.pbm_bn + (map.pbm_delta >> 9);
+			pagebuf_iostart(pb, pb_flags);
+			pb->pb_flags &= ~_PBF_LOCKABLE;
+			pagebuf_rele(pb);
+
+			page_offset = (page_offset + size) & ~PAGE_CACHE_MASK;
+			if (page_offset)
+				pg_count--;
+			pg_index += pg_count;
+		}
+
+		offset += size;
+		length -= size;
+	}
+
+	return error ? error : total - length;
+}
+
+/*
+ * This gets a page into cleanable state - page locked on entry,
+ * kept locked on exit.  If the page is marked dirty we should
+ * not come this way.
+ */
+STATIC int
+linvfs_release_page(
+	struct page		*page,
+	int			gfp_mask)
+{
+	int			need_trans;
+	int			nr_delalloc, nr_unmapped;
+
+	if (count_page_state(page, &nr_delalloc, &nr_unmapped)) {
+		need_trans = nr_delalloc;
+	} else {
+		need_trans = 0;
+	}
+
+	if (need_trans == 0) {
+		return 1;
+	}
+
+	if (gfp_mask & __GFP_FS) {
+		return release_page(page);
+	}
+	return 0;
+}
+
+
+struct address_space_operations linvfs_aops = {
+	.readpage		= linvfs_read_full_page,
+	.writepage		= linvfs_write_full_page,
+	.sync_page		= block_sync_page,
+	.releasepage		= linvfs_release_page,
+	.prepare_write		= linvfs_prepare_write,
+	.commit_write		= generic_commit_write,
+	.bmap			= linvfs_bmap,
+	.direct_IO		= linvfs_direct_IO,
+};
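
The mmap-write clustering implemented by probe_unmapped_cluster() and
cluster_write() above boils down to: walk forward from the page being written,
summing contiguous dirty-but-unmapped bytes, issue one space allocation
covering the whole run, then map and submit each page against the extent that
comes back.  A minimal user-space model of the forward probe, with the page
cache reduced to an array (all names here are illustrative):

	#include <stddef.h>

	/*
	 * Illustrative model only: pages[i] holds the count of dirty,
	 * unmapped bytes on cached page i, and 0 means the cluster must
	 * stop there (page absent, locked, clean, or already mapped).
	 */
	static size_t probe_cluster(const size_t *pages, size_t npages,
				    size_t start)
	{
		size_t	i, total = 0;

		for (i = start; i < npages; i++) {
			if (pages[i] == 0)
				break;			/* cluster ends */
			total += pages[i];
		}
		return total;	/* one allocation request covers this run */
	}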
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_behavior.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_behavior.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_behavior.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_behavior.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ *
+ */
+
+/*
+ * Source file used to associate/disassociate behaviors with virtualized
+ * objects.  See behavior.h for more information about behaviors, etc.
+ *
+ * The implementation is split between functions in this file and macros
+ * in behavior.h.
+ */
+#include <xfs.h>
+
+kmem_zone_t	*bhv_global_zone;
+
+/*
+ * Global initialization function called out of main.
+ */
+void
+bhv_global_init(void)
+{
+	/*
+	 * Initialize a behavior zone used by subsystems using behaviors
+	 * but without any private data.  In the UNIKERNEL case, this zone
+	 * is used only for behaviors that are not yet isolated to a single
+	 * cell.  The only such user is in pshm.c in which a dummy vnode is
+	 * obtained in support of vce avoidance logic.
+	 */
+	bhv_global_zone = kmem_zone_init(sizeof(bhv_desc_t), "bhv_global_zone");
+}
+
+/*
+ * Remove a behavior descriptor from a position in a behavior chain;
+ * the position is guaranteed not to be the first position.
+ * Should only be called by the bhv_remove() macro.
+ *
+ * The act of modifying the chain is done atomically w.r.t. ops-in-progress
+ * (see comment at top of behavior.h for more info on synchronization).
+ */
+void
+bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp)
+{
+	bhv_desc_t	*curdesc, *prev;
+
+	ASSERT(bhp->bh_first != NULL);
+	ASSERT(bhp->bh_first->bd_next != NULL);
+
+	prev = bhp->bh_first;
+	for (curdesc = bhp->bh_first->bd_next;
+	     curdesc != NULL;
+	     curdesc = curdesc->bd_next) {
+
+		if (curdesc == bdp)
+			break;		/* found it */
+		prev = curdesc;
+	}
+
+	ASSERT(curdesc == bdp);
+	prev->bd_next = bdp->bd_next;	/* remove from after prev */
+					/* atomic wrt oip's */
+}
+
+/*
+ * Look for a specific ops vector on the specified behavior chain.
+ * Return the associated behavior descriptor.  Or NULL, if not found.
+ */
+bhv_desc_t *
+bhv_lookup(bhv_head_t *bhp, void *ops)
+{
+	bhv_desc_t	*curdesc;
+
+	for (curdesc = bhp->bh_first;
+	     curdesc != NULL;
+	     curdesc = curdesc->bd_next) {
+
+		if (curdesc->bd_ops == ops)
+			return curdesc;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look for a specific ops vector on the specified behavior chain.
+ * Return the associated behavior descriptor.  Or NULL, if not found.
+ *
+ * The caller has not read locked the behavior chain, so acquire the
+ * lock before traversing the chain.
+ */
+bhv_desc_t *
+bhv_lookup_unlocked(bhv_head_t *bhp, void *ops)
+{
+	bhv_desc_t	*bdp;
+
+	BHV_READ_LOCK(bhp);
+	bdp = bhv_lookup(bhp, ops);
+	BHV_READ_UNLOCK(bhp);
+
+	return bdp;
+}
+
+/*
+ * Return the base behavior in the chain, or NULL if the chain
+ * is empty.
+ *
+ * The caller has not read locked the behavior chain, so acquire the
+ * lock before traversing the chain.
+ */
+bhv_desc_t *
+bhv_base_unlocked(bhv_head_t *bhp)
+{
+	bhv_desc_t	*curdesc;
+
+	BHV_READ_LOCK(bhp);
+	for (curdesc = bhp->bh_first;
+	     curdesc != NULL;
+	     curdesc = curdesc->bd_next) {
+
+		if (curdesc->bd_next == NULL) {
+			BHV_READ_UNLOCK(bhp);
+			return curdesc;
+		}
+	}
+
+	BHV_READ_UNLOCK(bhp);
+	return NULL;
+}
+
+#define BHVMAGIC (void *)0xf00d
+
+/* ARGSUSED */
+void
+bhv_head_init(
+	bhv_head_t *bhp,
+	char *name)
+{
+	bhp->bh_first = NULL;
+	bhp->bh_lockp = BHVMAGIC;
+}
+
+
+/* ARGSUSED */
+void
+bhv_head_reinit(
+	bhv_head_t *bhp)
+{
+	ASSERT(bhp->bh_first == NULL);
+	ASSERT(bhp->bh_lockp == BHVMAGIC);
+}
+
+
+void
+bhv_insert_initial(
+	bhv_head_t *bhp,
+	bhv_desc_t *bdp)
+{
+	ASSERT(bhp->bh_first == NULL);
+	ASSERT(bhp->bh_lockp == BHVMAGIC);
+	(bhp)->bh_first = bdp;
+}
+
+void
+bhv_head_destroy(
+	bhv_head_t *bhp)
+{
+	ASSERT(bhp->bh_first == NULL);
+	ASSERT(bhp->bh_lockp == BHVMAGIC);
+	bhp->bh_lockp = NULL;
+}
+
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_behavior.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_behavior.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_behavior.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_behavior.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BEHAVIOR_H__
+#define __XFS_BEHAVIOR_H__
+
+/*
+ * Header file used to associate behaviors with virtualized objects.
+ *
+ * A virtualized object is an internal, virtualized representation of
+ * OS entities such as persistent files, processes, or sockets.	 Examples
+ * of virtualized objects include vnodes, vprocs, and vsockets.	 Often
+ * a virtualized object is referred to simply as an "object."
+ *
+ * A behavior is essentially an implementation layer associated with
+ * an object.  Multiple behaviors for an object are chained together,
+ * the order of chaining determining the order of invocation.  Each
+ * behavior of a given object implements the same set of interfaces
+ * (e.g., the VOP interfaces).
+ *
+ * Behaviors may be dynamically inserted into an object's behavior chain,
+ * such that the addition is transparent to consumers that already have
+ * references to the object.  Typically, a given behavior will be inserted
+ * at a particular location in the behavior chain.  Insertion of new
+ * behaviors is synchronized with operations-in-progress (oip's) so that
+ * the oip's always see a consistent view of the chain.
+ *
+ * The term "interpostion" is used to refer to the act of inserting
+ * a behavior such that it interposes on (i.e., is inserted in front
+ * of) a particular other behavior.  A key example of this is when a
+ * system implementing distributed single system image wishes to
+ * interpose a distribution layer (providing distributed coherency)
+ * in front of an object that is otherwise only accessed locally.
+ *
+ * Note that the traditional vnode/inode combination is simply a virtualized
+ * object that has exactly one associated behavior.
+ *
+ * Behavior synchronization is the logic needed, in certain
+ * circumstances, to ensure that there is no conflict between ongoing
+ * operations traversing the behavior chain and those dynamically
+ * modifying the behavior chain.  Because behavior synchronization adds
+ * extra overhead to virtual operation invocation, we want to restrict
+ * the requirement for this extra code, as much as we can, to those
+ * situations in which it is truly necessary.
+ *
+ * Behavior synchronization is needed whenever there's at least one class
+ * of object in the system for which:
+ * 1) multiple behaviors for a given object are supported,
+ * -- AND --
+ * 2a) insertion of a new behavior can happen dynamically at any time during
+ *     the life of an active object,
+ *	-- AND --
+ *	3a) insertion of a new behavior needs to synchronize with existing
+ *	    ops-in-progress.
+ *	-- OR --
+ *	3b) multiple different behaviors can be dynamically inserted at
+ *	    any time during the life of an active object
+ *	-- OR --
+ *	3c) removal of a behavior can occur at any time during the life of
+ *	    an active object.
+ * -- OR --
+ * 2b) removal of a behavior can occur at any time during the life of an
+ *     active object
+ *
+ */
+
+typedef void	bhv_head_lock_t;
+
+/*
+ * Behavior head.  Head of the chain of behaviors.
+ * Contained within each virtualized object data structure.
+ */
+typedef struct bhv_head {
+	struct bhv_desc *bh_first;	/* first behavior in chain */
+	bhv_head_lock_t *bh_lockp;	/* pointer to lock info struct */
+} bhv_head_t;
+
+/*
+ * Behavior descriptor.	 Descriptor associated with each behavior.
+ * Contained within the behavior's private data structure.
+ */
+typedef struct bhv_desc {
+	void		*bd_pdata;	/* private data for this behavior */
+	void		*bd_vobj;	/* virtual object associated with */
+	void		*bd_ops;	/* ops for this behavior */
+	struct bhv_desc *bd_next;	/* next behavior in chain */
+} bhv_desc_t;
+
+/*
+ * Behavior identity field.  A behavior's identity determines the position
+ * where it lives within a behavior chain, and it's always the first field
+ * of the behavior's ops vector. The optional id field further identifies the
+ * subsystem responsible for the behavior.
+ */
+typedef struct bhv_identity {
+	__u16	bi_id;		/* owning subsystem id */
+	__u16	bi_position;	/* position in chain */
+} bhv_identity_t;
+
+typedef bhv_identity_t bhv_position_t;
+
+#define BHV_IDENTITY_INIT(id,pos)	{id, pos}
+
+#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos)
+
+
+/*
+ * Define boundaries of position values.
+ */
+#define BHV_POSITION_INVALID	0	/* invalid position number */
+#define BHV_POSITION_BASE	1	/* base (last) implementation layer */
+#define BHV_POSITION_TOP	63	/* top (first) implementation layer */
+
+/*
+ * Plumbing macros.
+ */
+#define BHV_HEAD_FIRST(bhp)	(ASSERT((bhp)->bh_first), (bhp)->bh_first)
+#define BHV_NEXT(bdp)		(ASSERT((bdp)->bd_next), (bdp)->bd_next)
+#define BHV_NEXTNULL(bdp)	((bdp)->bd_next)
+#define BHV_VOBJ(bdp)		(ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj)
+#define BHV_VOBJNULL(bdp)	((bdp)->bd_vobj)
+#define BHV_PDATA(bdp)		(bdp)->bd_pdata
+#define BHV_OPS(bdp)		(bdp)->bd_ops
+#define BHV_IDENTITY(bdp)	((bhv_identity_t *)(bdp)->bd_ops)
+#define BHV_POSITION(bdp)	(BHV_IDENTITY(bdp)->bi_position)
+
+
+#define BHV_READ_LOCK(bhp)
+#define BHV_READ_UNLOCK(bhp)
+#define BHV_NOT_READ_LOCKED(bhp)	1
+#define BHV_IS_WRITE_LOCKED(bhp)	1
+#define BHV_NOT_WRITE_LOCKED(bhp)	1
+
+extern void bhv_head_init(bhv_head_t *, char *);
+extern void bhv_head_destroy(bhv_head_t *);
+extern void bhv_head_reinit(bhv_head_t *);
+extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *);
+
+/*
+ * Initialize a new behavior descriptor.
+ * Arguments:
+ *   bdp - pointer to behavior descriptor
+ *   pdata - pointer to behavior's private data
+ *   vobj - pointer to associated virtual object
+ *   ops - pointer to ops for this behavior
+ */
+#define bhv_desc_init(bdp, pdata, vobj, ops)		\
+ {							\
+	(bdp)->bd_pdata = pdata;			\
+	(bdp)->bd_vobj = vobj;				\
+	(bdp)->bd_ops = ops;				\
+	(bdp)->bd_next = NULL;				\
+ }
+
+#define BHV_DESC_INIT(so,A,B)	bhv_desc_init(&(so->so_bhv),so,A,B)
+
+/*
+ * Remove a behavior descriptor from a behavior chain.
+ */
+#define bhv_remove(bhp, bdp)				\
+ {							\
+	if ((bhp)->bh_first == (bdp)) {			\
+		/*					\
+		 * Remove from front of chain.		\
+		 * Atomic wrt oip's.			\
+		 */					\
+	       (bhp)->bh_first = (bdp)->bd_next;	\
+	 } else {					\
+	       /* remove from non-front of chain */	\
+	       bhv_remove_not_first(bhp, bdp);		\
+	}						\
+	(bdp)->bd_vobj = NULL;				\
+ }
+
+/*
+ * Behavior module prototypes.
+ */
+extern void		bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp);
+extern bhv_desc_t *	bhv_lookup(bhv_head_t *bhp, void *ops);
+extern bhv_desc_t *	bhv_lookup_unlocked(bhv_head_t *bhp, void *ops);
+extern bhv_desc_t *	bhv_base_unlocked(bhv_head_t *bhp);
+
+#endif /* __XFS_BEHAVIOR_H__ */
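
To make the chaining concrete, here is a minimal sketch of a virtualized
object adopting a single behavior through the interfaces declared above.  This
is schematic kernel-side usage, not a standalone program; the object type and
ops vector are invented for illustration, and only the bhv_* calls and macros
come from this header:

	/* Illustrative only: obj_t and my_ops_t are hypothetical. */
	typedef struct obj {
		bhv_head_t	o_bhv_head;	/* chain head lives in the object */
	} obj_t;

	typedef struct my_ops {
		bhv_position_t	position;	/* identity must be first */
		int		(*frob)(bhv_desc_t *bdp);
	} my_ops_t;

	static my_ops_t my_ops = {
		BHV_IDENTITY_INIT_POSITION(BHV_POSITION_BASE),
		NULL,				/* .frob hooked up elsewhere */
	};

	static void obj_attach(obj_t *op, bhv_desc_t *bdp, void *pdata)
	{
		bhv_head_init(&op->o_bhv_head, "obj");
		bhv_desc_init(bdp, pdata, op, &my_ops);
		bhv_insert_initial(&op->o_bhv_head, bdp);

		/* the layer can later be found again by its ops vector */
		ASSERT(bhv_lookup(&op->o_bhv_head, &my_ops) == bdp);
	}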
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_cred.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_cred.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_cred.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_cred.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_CRED_H__
+#define __XFS_CRED_H__
+
+/*
+ * Credentials
+ */
+typedef struct cred {
+	/* EMPTY */
+} cred_t;
+
+extern struct cred *sys_cred;
+
+/* This is a hack (it assumes sys_cred is the only cred_t in the system). */
+static __inline int capable_cred(cred_t *cr, int cid)
+{
+	return (cr == sys_cred) ? 1 : capable(cid);
+}
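+
+/*
+ * Illustrative use (hypothetical call site): kernel-internal callers
+ * pass sys_cred to bypass the capability check entirely:
+ *
+ *	if (!capable_cred(credp, CAP_FOWNER))
+ *		return XFS_ERROR(EPERM);
+ */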
+
+#endif	/* __XFS_CRED_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_file.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_file.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_file.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_file.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/dcache.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/mman.h> /* for PROT_WRITE */
+
+static struct vm_operations_struct linvfs_file_vm_ops;
+
+
+STATIC ssize_t
+linvfs_read(
+	struct file	*filp,
+	char		*buf,
+	size_t		size,
+	loff_t		*offset)
+{
+	vnode_t		*vp;
+	int		error;
+
+	vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
+	ASSERT(vp);
+
+	VOP_READ(vp, filp, buf, size, offset, NULL, error);
+
+	return(error);
+}
+
+
+STATIC ssize_t
+linvfs_write(
+	struct file	*file,
+	const char	*buf,
+	size_t		count,
+	loff_t		*ppos)
+{
+	struct inode	*inode = file->f_dentry->d_inode;
+	loff_t		pos;
+	vnode_t		*vp;
+	int		err;	/* Use negative errors in this f'n */
+
+	if ((ssize_t) count < 0)
+		return -EINVAL;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	pos = *ppos;
+	err = -EINVAL;
+	if (pos < 0)
+		goto out;
+
+	err = file->f_error;
+	if (err) {
+		file->f_error = 0;
+		goto out;
+	}
+
+	vp = LINVFS_GET_VP(inode);
+	ASSERT(vp);
+
+	/* We allow multiple direct writers in; there is no
+	 * potential call to vmtruncate in that path.
+	 */
+	if (!(file->f_flags & O_DIRECT))
+		down(&inode->i_sem);
+
+	VOP_WRITE(vp, file, buf, count, &pos, NULL, err);
+	*ppos = pos;
+
+	if (!(file->f_flags & O_DIRECT))
+		up(&inode->i_sem);
+out:
+
+	return(err);
+}
+
+
+STATIC int
+linvfs_open(
+	struct inode	*inode,
+	struct file	*filp)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	int		error;
+
+	if (!(filp->f_flags & O_LARGEFILE) && inode->i_size > MAX_NON_LFS)
+		return -EFBIG;
+
+	ASSERT(vp);
+	VOP_OPEN(vp, NULL, error);
+	return -error;
+}
+
+
+STATIC int
+linvfs_release(
+	struct inode	*inode,
+	struct file	*filp)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	int		error = 0;
+
+	if (vp)
+		VOP_RELEASE(vp, error);
+	return -error;
+}
+
+
+STATIC int
+linvfs_fsync(
+	struct file	*filp,
+	struct dentry	*dentry,
+	int		datasync)
+{
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	int		error;
+	int		flags = FSYNC_WAIT;
+
+	if (datasync)
+		flags |= FSYNC_DATA;
+
+	ASSERT(vp);
+
+	VOP_FSYNC(vp, flags, NULL, (off_t)0, (off_t)-1, error);
+
+	return -error;
+}
+
+/*
+ * linvfs_readdir maps to VOP_READDIR().
+ * We need to build a uio, cred, ...
+ */
+
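+/* Advance to the next dirent in the buffer using its record length. */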
+#define nextdp(dp)	((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen))
+
+STATIC int
+linvfs_readdir(
+	struct file	*filp,
+	void		*dirent,
+	filldir_t	filldir)
+{
+	int		error = 0;
+	vnode_t		*vp;
+	uio_t		uio;
+	iovec_t		iov;
+	int		eof = 0;
+	caddr_t		read_buf;
+	int		namelen, size = 0;
+	size_t		rlen = PAGE_CACHE_SIZE << 2;
+	off_t		start_offset;
+	xfs_dirent_t	*dbp = NULL;
+
+	vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
+	ASSERT(vp);
+
+	/* Try fairly hard to get memory */
+	do {
+		if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL)))
+			break;
+		rlen >>= 1;
+	} while (rlen >= 1024);
+
+	if (read_buf == NULL)
+		return -ENOMEM;
+
+	uio.uio_iov = &iov;
+	uio.uio_fmode = filp->f_mode;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_offset = filp->f_pos;
+
+	while (!eof) {
+		uio.uio_resid = iov.iov_len = rlen;
+		iov.iov_base = read_buf;
+		uio.uio_iovcnt = 1;
+
+		start_offset = uio.uio_offset;
+
+		VOP_READDIR(vp, &uio, NULL, &eof, error);
+		if ((uio.uio_offset == start_offset) || error) {
+			size = 0;
+			break;
+		}
+
+		size = rlen - uio.uio_resid;
+		dbp = (xfs_dirent_t *)read_buf;
+		while (size > 0) {
+			namelen = strlen(dbp->d_name);
+
+			if (filldir(dirent, dbp->d_name, namelen,
+					(loff_t) dbp->d_off,
+					(ino_t) dbp->d_ino,
+					DT_UNKNOWN)) {
+				goto done;
+			}
+			size -= dbp->d_reclen;
+			dbp = nextdp(dbp);
+		}
+	}
+done:
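+	/*
+	 * Update f_pos: if the last VOP_READDIR buffer was consumed
+	 * completely, continue from the uio offset; otherwise continue
+	 * from the offset recorded in the entry we stopped at.
+	 */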
+	if (!error) {
+		if (size == 0)
+			filp->f_pos = uio.uio_offset;
+		else if (dbp)
+			filp->f_pos = dbp->d_off;
+	}
+
+	kfree(read_buf);
+	return -error;
+}
+
+STATIC int
+linvfs_file_mmap(
+	struct file	*filp,
+	struct vm_area_struct *vma)
+{
+	struct inode	*ip = filp->f_dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(ip);
+	vattr_t		va = { .va_mask = AT_UPDATIME };
+	int		error;
+
+	if ((vp->v_type == VREG) && (vp->v_vfsp->vfs_flag & VFS_DMI)) {
+		error = -xfs_dm_send_mmap_event(vma, 0);
+		if (error)
+			return error;
+	}
+
+	vma->vm_ops = &linvfs_file_vm_ops;
+
+	VOP_SETATTR(vp, &va, AT_UPDATIME, NULL, error);
+	UPDATE_ATIME(ip);
+	return 0;
+}
+
+
+STATIC int
+linvfs_ioctl(
+	struct inode	*inode,
+	struct file	*filp,
+	unsigned int	cmd,
+	unsigned long	arg)
+{
+	int		error;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	ASSERT(vp);
+	VOP_IOCTL(vp, inode, filp, cmd, arg, error);
+	VMODIFY(vp);
+
+	/* NOTE:  some of the ioctls return positive #'s as a
+	 *	  byte count indicating success, such as
+	 *	  readlink_by_handle.  So we don't "sign flip"
+	 *	  like most other routines.  This means true
+	 *	  errors need to be returned as a negative value.
+	 */
+	return error;
+}
+
+#ifdef HAVE_VMOP_MPROTECT
+STATIC int
+linvfs_mprotect(
+	struct vm_area_struct *vma,
+	unsigned int	newflags)
+{
+	vnode_t		*vp = LINVFS_GET_VP(vma->vm_file->f_dentry->d_inode);
+	int		error = 0;
+
+	if ((vp->v_type == VREG) && (vp->v_vfsp->vfs_flag & VFS_DMI)) {
+		if ((vma->vm_flags & VM_MAYSHARE) &&
+		    (newflags & PROT_WRITE) && !(vma->vm_flags & VM_WRITE)) {
+			error = xfs_dm_send_mmap_event(vma, VM_WRITE);
+		}
+	}
+	return error;
+}
+#endif /* HAVE_VMOP_MPROTECT */
+
+
+struct file_operations linvfs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= linvfs_read,
+	.write		= linvfs_write,
+	.ioctl		= linvfs_ioctl,
+	.mmap		= linvfs_file_mmap,
+	.open		= linvfs_open,
+	.release	= linvfs_release,
+	.fsync		= linvfs_fsync,
+};
+
+struct file_operations linvfs_dir_operations = {
+	.read		= generic_read_dir,
+	.readdir	= linvfs_readdir,
+	.ioctl		= linvfs_ioctl,
+	.fsync		= linvfs_fsync,
+};
+
+static struct vm_operations_struct linvfs_file_vm_ops = {
+	.nopage		= filemap_nopage,
+#ifdef HAVE_VMOP_MPROTECT
+	.mprotect	= linvfs_mprotect,
+#endif
+};
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_fs_subr.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_fs_subr.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_fs_subr.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_fs_subr.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+/*
+ * Implementation for VFS_DOUNMOUNT.
+ */
+int
+fs_dounmount(
+	bhv_desc_t	*bdp,
+	int		flags,
+	vnode_t		*rootvp,
+	cred_t		*cr)
+{
+	struct vfs	*vfsp = bhvtovfs(bdp);
+	bhv_desc_t	*fbdp = vfsp->vfs_fbhv;
+	int		error;
+
+	/*
+	 * Wait for sync to finish and lock vfsp.  This also sets the
+	 * VFS_OFFLINE flag.  Once we do this we can give up the
+	 * reference on the root vnode, which we hold to keep another
+	 * unmount from ripping the vfs out from under us before we
+	 * get to lock it.  The VFS_DOUNMOUNT calling convention is
+	 * that the reference on the root vnode is released whether
+	 * the call succeeds or fails.
+	 */
+	if (rootvp)
+		VN_RELE(rootvp);
+
+	/*
+	 * Now invoke SYNC and UNMOUNT ops, using the PVFS versions is
+	 * OK since we already have a behavior lock as a result of
+	 * being in VFS_DOUNMOUNT.  It is necessary to do things this
+	 * way since using the VFS versions would cause us to get the
+	 * behavior lock twice which can cause deadlock as well as
+	 * making the coding of vfs relocation unnecessarily difficult
+	 * by making relocations invoked by unmount occur in a different
+	 * environment than those invoked by mount-update.
+	 */
+	PVFS_SYNC(fbdp, SYNC_ATTR|SYNC_DELWRI, cr, error);
+	if (error == 0)
+		PVFS_UNMOUNT(fbdp, flags, cr, error);
+	return error;
+}
+
+/*
+ * Stub for no-op vnode operations that return an error status (always 0).
+ */
+int
+fs_noerr()
+{
+	return 0;
+}
+
+/*
+ * Operation unsupported under this file system.
+ */
+int
+fs_nosys()
+{
+	return ENOSYS;
+}
+
+/*
+ * Stub for inactive, strategy, and read/write lock/unlock.  Does nothing.
+ */
+/* ARGSUSED */
+void
+fs_noval()
+{
+}
+
+/*
+ * vnode pcache layer for vnode_tosspages.
+ * 'last' parameter unused but left in for IRIX compatibility
+ */
+void
+fs_tosspages(
+	bhv_desc_t	*bdp,
+	xfs_off_t	first,
+	xfs_off_t	last,
+	int		fiopt)
+{
+	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	struct inode	*ip = LINVFS_GET_IP(vp);
+
+	if (VN_CACHED(vp))
+		truncate_inode_pages(ip->i_mapping, first);
+}
+
+
+/*
+ * vnode pcache layer for vnode_flushinval_pages.
+ * 'last' parameter unused but left in for IRIX compatibility
+ */
+void
+fs_flushinval_pages(
+	bhv_desc_t	*bdp,
+	xfs_off_t	first,
+	xfs_off_t	last,
+	int		fiopt)
+{
+	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	struct inode	*ip = LINVFS_GET_IP(vp);
+
+	if (VN_CACHED(vp)) {
+		filemap_fdatasync(ip->i_mapping);
+		fsync_inode_data_buffers(ip);
+		filemap_fdatawait(ip->i_mapping);
+
+		truncate_inode_pages(ip->i_mapping, first);
+	}
+}
+
+/*
+ * vnode pcache layer for vnode_flush_pages.
+ * 'last' parameter unused but left in for IRIX compatibility
+ */
+int
+fs_flush_pages(
+	bhv_desc_t	*bdp,
+	xfs_off_t	first,
+	xfs_off_t	last,
+	uint64_t	flags,
+	int		fiopt)
+{
+	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	struct inode	*ip = LINVFS_GET_IP(vp);
+
+	if (VN_CACHED(vp)) {
+		filemap_fdatasync(ip->i_mapping);
+		fsync_inode_data_buffers(ip);
+		filemap_fdatawait(ip->i_mapping);
+	}
+
+	return 0;
+}
+
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_fs_subr.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_fs_subr.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_fs_subr.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_fs_subr.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUBR_H__
+#define __XFS_SUBR_H__
+
+/*
+ * Utilities shared among file system implementations.
+ */
+
+struct cred;
+
+extern int	fs_noerr(void);
+extern int	fs_nosys(void);
+extern int	fs_nodev(void);
+extern void	fs_noval(void);
+extern int	fs_dounmount(bhv_desc_t *, int, vnode_t *, struct cred *);
+extern void	fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+extern void	fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+extern int	fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int);
+
+#endif	/* __XFS_SUBR_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_globals.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_globals.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_globals.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_globals.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains globals needed by XFS that were normally defined
+ * somewhere else in IRIX.
+ */
+
+#include <xfs.h>
+
+uint64_t	xfs_panic_mask;		/* set to cause more panics */
+unsigned long	xfs_physmem;
+
+/*
+ * restricted_chown = 1	 bsd style chown(2), only super-user can give away files
+ * restricted_chown = 0	 sysV style chown(2), non super-user can give away files
+ */
+int		restricted_chown = 1;
+
+/*
+ * Used to serialize atomicIncWithWrap.
+ */
+spinlock_t Atomic_spin = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Global system credential structure.
+ */
+cred_t sys_cred_val, *sys_cred = &sys_cred_val;
+
+/*
+ * The global quota manager. There is only one of these for the entire
+ * system, _not_ one per file system. XQM keeps track of the overall
+ * quota functionality, including maintaining the freelist and hash
+ * tables of dquots.
+ */
+struct xfs_qm	*xfs_Gqm;
+mutex_t		xfs_Gqm_lock;
+
+/* Export XFS symbols used by xfsidbg */
+EXPORT_SYMBOL(xfs_Gqm);
+EXPORT_SYMBOL(xfs_next_bit);
+EXPORT_SYMBOL(xfs_contig_bits);
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_globals.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_globals.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_globals.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_globals.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_GLOBALS_H__
+#define __XFS_GLOBALS_H__
+
+/*
+ * This file declares globals needed by XFS that were normally defined
+ * somewhere else in IRIX.
+ */
+
+extern uint64_t xfs_panic_mask;		/* set to cause more panics */
+
+extern int	restricted_chown;
+extern unsigned long	xfs_physmem;
+
+extern struct cred *sys_cred;
+
+extern struct xfs_qm	*xfs_Gqm;
+extern mutex_t		xfs_Gqm_lock;
+
+#endif	/* __XFS_GLOBALS_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_ioctl.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_ioctl.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_ioctl.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_ioctl.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,1094 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_fsops.h>
+#include <xfs_dfrag.h>
+#include <linux/dcache.h>
+#include <linux/iobuf.h>
+
+
+extern int xfs_change_file_space(bhv_desc_t *, int,
+			xfs_flock64_t *, xfs_off_t, cred_t *, int);
+extern int xfs_set_dmattrs(bhv_desc_t *, u_int, u_int16_t, cred_t *);
+
+
+/*
+ * xfs_find_handle maps a userspace xfs_fsop_handlereq structure to
+ * a file or fs handle.
+ *
+ * XFS_IOC_PATH_TO_FSHANDLE
+ *    returns fs handle for a mount point or path within that mount point
+ * XFS_IOC_FD_TO_HANDLE
+ *    returns full handle for a FD opened in user space
+ * XFS_IOC_PATH_TO_HANDLE
+ *    returns full handle for a path
+ */
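+/*
+ * An illustrative (hypothetical) userspace invocation, using the
+ * request fields consumed below:
+ *
+ *	xfs_fsop_handlereq_t hreq;
+ *
+ *	hreq.path     = "/mnt/xfs/somefile";
+ *	hreq.ohandle  = hbuf;
+ *	hreq.ohandlen = &hlen;
+ *	ioctl(fd, XFS_IOC_PATH_TO_HANDLE, &hreq);
+ */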
+STATIC int
+xfs_find_handle(
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	int			hsize;
+	xfs_handle_t		handle;
+	xfs_fsop_handlereq_t	hreq;
+	struct inode		*inode;
+	struct vnode		*vp;
+
+	if (copy_from_user(&hreq, (xfs_fsop_handlereq_t *)arg, sizeof(hreq)))
+		return -XFS_ERROR(EFAULT);
+
+	bzero((char *)&handle, sizeof(handle));
+
+	switch (cmd) {
+	case XFS_IOC_PATH_TO_FSHANDLE:
+	case XFS_IOC_PATH_TO_HANDLE: {
+		struct nameidata	nd;
+		char			*path;
+		int			error;
+
+		/* we need the path */
+		path = getname(hreq.path);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+
+		/* traverse the path */
+		error = 0;
+		if (path_init(path, LOOKUP_POSITIVE, &nd))
+			error = path_walk(path, &nd);
+		putname(path);
+		if (error)
+			return error;
+
+		ASSERT(nd.dentry);
+		ASSERT(nd.dentry->d_inode);
+		inode = igrab(nd.dentry->d_inode);
+		path_release(&nd);
+		break;
+	}
+
+	case XFS_IOC_FD_TO_HANDLE: {
+		struct file	*file;
+
+		file = fget(hreq.fd);
+		if (!file)
+			return -EBADF;
+
+		ASSERT(file->f_dentry);
+		ASSERT(file->f_dentry->d_inode);
+		inode = igrab(file->f_dentry->d_inode);
+		fput(file);
+
+		break;
+	}
+
+	default:
+		ASSERT(0);
+		return -XFS_ERROR(EINVAL);
+	}
+
+	if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
+		/* we're not in XFS anymore, Toto */
+		iput(inode);
+		return -XFS_ERROR(EINVAL);
+	}
+
+	/* we need the vnode */
+	vp = LINVFS_GET_VP(inode);
+	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
+		iput(inode);
+		return -XFS_ERROR(EBADF);
+	}
+
+	/* now we can grab the fsid */
+	memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t));
+	hsize = sizeof(xfs_fsid_t);
+
+	if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
+		xfs_inode_t	*ip;
+		bhv_desc_t	*bhv;
+		int		lock_mode;
+
+		/* need to get access to the xfs_inode to read the generation */
+		VN_BHV_READ_LOCK(&(vp)->v_bh);
+		bhv = VNODE_TO_FIRST_BHV(vp);
+		ASSERT(bhv);
+		ip = XFS_BHVTOI(bhv);
+		ASSERT(ip);
+		lock_mode = xfs_ilock_map_shared(ip);
+
+		/* fill in fid section of handle from inode */
+		handle.ha_fid.xfs_fid_len = sizeof(xfs_fid_t) -
+					    sizeof(handle.ha_fid.xfs_fid_len);
+		handle.ha_fid.xfs_fid_pad = 0;
+		handle.ha_fid.xfs_fid_gen = ip->i_d.di_gen;
+		handle.ha_fid.xfs_fid_ino = ip->i_ino;
+
+		xfs_iunlock_map_shared(ip, lock_mode);
+		VN_BHV_READ_UNLOCK(&(vp)->v_bh);
+
+		hsize = XFS_HSIZE(handle);
+	}
+
+	/* now copy our handle into the user buffer & write out the size */
+	if (copy_to_user((xfs_handle_t *)hreq.ohandle, &handle, hsize) ||
+	    copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) {
+		iput(inode);
+		return -XFS_ERROR(EFAULT);
+	}
+
+	iput(inode);
+	return 0;
+}
+
+
+/*
+ * Convert userspace handle data into vnode (and inode).
+ * We [ab]use the fact that all the fsop_handlereq ioctl calls
+ * have a data structure argument whose first component is always
+ * a xfs_fsop_handlereq_t, so we can cast to and from this type.
+ * This allows us to optimise the copy_from_user calls and gives
+ * a handy, shared routine.
+ *
+ * If no error, caller must always VN_RELE the returned vp.
+ */
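+/*
+ * For example, xfs_fsop_setdm_handlereq_t begins with an embedded
+ * xfs_fsop_handlereq_t, so callers below simply pass
+ * (xfs_fsop_handlereq_t *)&dmhreq and only the common leading fields
+ * are interpreted here.
+ */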
+STATIC int
+xfs_vget_fsop_handlereq(
+	xfs_mount_t		*mp,
+	struct inode		*parinode,	/* parent inode pointer	   */
+	int			cap,		/* capability level for op */
+	unsigned long		arg,		/* userspace data pointer  */
+	unsigned long		size,		/* size of expected struct */
+	/* output arguments */
+	xfs_fsop_handlereq_t	*hreq,
+	vnode_t			**vp,
+	struct inode		**inode)
+{
+	void			*hanp;
+	size_t			hlen;
+	xfs_fid_t		*xfid;
+	xfs_handle_t		*handlep;
+	xfs_handle_t		handle;
+	xfs_inode_t		*ip;
+	struct inode		*inodep;
+	vnode_t			*vpp;
+	__u32			igen;
+	ino_t			ino;
+	int			error;
+
+	if (!capable(cap))
+		return XFS_ERROR(EPERM);
+
+	/*
+	 * Only allow handle opens under a directory.
+	 */
+	if (!S_ISDIR(parinode->i_mode))
+		return XFS_ERROR(ENOTDIR);
+
+	/*
+	 * Copy the handle down from the user and validate
+	 * that it looks to be in the correct format.
+	 */
+	if (copy_from_user(hreq, (struct xfs_fsop_handlereq *)arg, size))
+		return XFS_ERROR(EFAULT);
+
+	hanp = hreq->ihandle;
+	hlen = hreq->ihandlen;
+	handlep = &handle;
+
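+	/*
+	 * A valid handle is at least an fsid (a pure fshandle) and at
+	 * most fsid plus fid (a full handle); anything in between must
+	 * have a fid_len that accounts exactly for the bytes supplied.
+	 */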
+	if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
+		return XFS_ERROR(EINVAL);
+	if (copy_from_user(handlep, hanp, hlen))
+		return XFS_ERROR(EFAULT);
+	if (hlen < sizeof(*handlep))
+		bzero(((char *)handlep) + hlen, sizeof(*handlep) - hlen);
+	if (hlen > sizeof(handlep->ha_fsid)) {
+		if (handlep->ha_fid.xfs_fid_len !=
+				(hlen - sizeof(handlep->ha_fsid)
+					- sizeof(handlep->ha_fid.xfs_fid_len))
+		    || handlep->ha_fid.xfs_fid_pad)
+			return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * Crack the handle, obtain the inode # & generation #
+	 */
+	xfid = (struct xfs_fid *)&handlep->ha_fid;
+	if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) {
+		ino  = xfid->xfs_fid_ino;
+		igen = xfid->xfs_fid_gen;
+	} else {
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * Get the XFS inode, building a vnode to go with it.
+	 */
+	error = xfs_iget(mp, NULL, ino, XFS_ILOCK_SHARED, &ip, 0);
+	if (error)
+		return error;
+	if (ip == NULL)
+		return XFS_ERROR(EIO);
+	if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) {
+		xfs_iput_new(ip, XFS_ILOCK_SHARED);
+		return XFS_ERROR(ENOENT);
+	}
+
+	vpp = XFS_ITOV(ip);
+	inodep = LINVFS_GET_IP(vpp);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	error = linvfs_revalidate_core(inodep, ATTR_COMM);
+	if (error) {
+		iput(inodep);
+		/* this error is (-) but our callers expect + */
+		return XFS_ERROR(-error);
+	}
+
+	*vp = vpp;
+	*inode = inodep;
+	return 0;
+}
+
+STATIC int
+xfs_open_by_handle(
+	xfs_mount_t		*mp,
+	unsigned long		arg,
+	struct file		*parfilp,
+	struct inode		*parinode)
+{
+	int			error;
+	int			new_fd;
+	int			permflag;
+	struct file		*filp;
+	struct inode		*inode;
+	struct dentry		*dentry;
+	vnode_t			*vp;
+	xfs_fsop_handlereq_t	hreq;
+	struct list_head	*lp;
+
+	error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg,
+					sizeof(xfs_fsop_handlereq_t),
+					&hreq, &vp, &inode);
+	if (error)
+		return -error;
+
+	/* Restrict xfs_open_by_handle to directories & regular files. */
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
+		iput(inode);
+		return -XFS_ERROR(EINVAL);
+	}
+
+#if BITS_PER_LONG != 32
+	hreq.oflags |= O_LARGEFILE;
+#endif
+	/* Put open permission in namei format. */
+	permflag = hreq.oflags;
+	if ((permflag+1) & O_ACCMODE)
+		permflag++;
+	if (permflag & O_TRUNC)
+		permflag |= 2;
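+	/*
+	 * The increment above maps O_RDONLY/O_WRONLY/O_RDWR (0/1/2)
+	 * onto FMODE_READ/FMODE_WRITE bits (1/2/3); O_TRUNC implies
+	 * write permission.
+	 */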
+
+	/* Can't write directories. */
+	if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
+		iput(inode);
+		return -XFS_ERROR(EISDIR);
+	}
+
+	if ((new_fd = get_unused_fd()) < 0) {
+		iput(inode);
+		return new_fd;
+	}
+
+	/* Now to find a dentry.  If possible, get a well-connected one. */
+	spin_lock(&dcache_lock);
+	for (lp = inode->i_dentry.next; lp != &inode->i_dentry; lp = lp->next) {
+		dentry = list_entry(lp, struct dentry, d_alias);
+		if (! (dentry->d_flags & DCACHE_NFSD_DISCONNECTED)) {
+			dget_locked(dentry);
+			dentry->d_vfs_flags |= DCACHE_REFERENCED;
+			spin_unlock(&dcache_lock);
+			iput(inode);
+			goto found;
+		}
+	}
+	spin_unlock(&dcache_lock);
+
+	/* ELSE didn't find dentry.  Create anonymous dcache entry. */
+	dentry = d_alloc_root(inode);
+	if (dentry == NULL) {
+		iput(inode);
+		put_unused_fd(new_fd);
+		return -XFS_ERROR(ENOMEM);
+	}
+
+	/* Keep nfsd happy. */
+	dentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
+
+ found:
+	/* Ensure umount returns EBUSY on umounts while this file is open. */
+	mntget(parfilp->f_vfsmnt);
+
+	/* Create file pointer. */
+	filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags);
+	if (IS_ERR(filp)) {
+		put_unused_fd(new_fd);
+		return -XFS_ERROR(-PTR_ERR(filp));
+	}
+	filp->f_mode |= FINVIS;
+
+	fd_install(new_fd, filp);
+	return new_fd;
+}
+
+STATIC int
+xfs_readlink_by_handle(
+	xfs_mount_t		*mp,
+	unsigned long		arg,
+	struct file		*parfilp,
+	struct inode		*parinode)
+{
+	int			error;
+	struct iovec		aiov;
+	struct uio		auio;
+	struct inode		*inode;
+	xfs_fsop_handlereq_t	hreq;
+	vnode_t			*vp;
+	__u32			olen;
+
+	error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg,
+					sizeof(xfs_fsop_handlereq_t),
+					&hreq, &vp, &inode);
+	if (error)
+		return -error;
+
+	/* Restrict this handle operation to symlinks only. */
+	if (vp->v_type != VLNK) {
+		VN_RELE(vp);
+		return -XFS_ERROR(EINVAL);
+	}
+
+	if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) {
+		VN_RELE(vp);
+		return -XFS_ERROR(EFAULT);
+	}
+	aiov.iov_len	= olen;
+	aiov.iov_base	= hreq.ohandle;
+
+	auio.uio_iov	= &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_fmode	= FINVIS;
+	auio.uio_offset = 0;
+	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_resid	= olen;
+
+	VOP_READLINK(vp, &auio, NULL, error);
+
+	VN_RELE(vp);
+	return (olen - auio.uio_resid);
+}
+
+STATIC int
+xfs_fssetdm_by_handle(
+	xfs_mount_t		*mp,
+	unsigned long		arg,
+	struct file		*parfilp,
+	struct inode		*parinode)
+{
+	int			error;
+	struct fsdmidata	fsd;
+	xfs_fsop_setdm_handlereq_t dmhreq;
+	struct inode		*inode;
+	bhv_desc_t		*bdp;
+	vnode_t			*vp;
+
+	error = xfs_vget_fsop_handlereq(mp, parinode, CAP_MKNOD, arg,
+					sizeof(xfs_fsop_setdm_handlereq_t),
+					(xfs_fsop_handlereq_t *)&dmhreq,
+					&vp, &inode);
+	if (error)
+		return -error;
+
+	if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
+		VN_RELE(vp);
+		return -XFS_ERROR(EFAULT);
+	}
+
+	bdp = bhv_base_unlocked(VN_BHV_HEAD(vp));
+	error = xfs_set_dmattrs(bdp, fsd.fsd_dmevmask, fsd.fsd_dmstate, NULL);
+
+	VN_RELE(vp);
+	if (error)
+		return -error;
+	return 0;
+}
+
+STATIC int
+xfs_attrlist_by_handle(
+	xfs_mount_t		*mp,
+	unsigned long		arg,
+	struct file		*parfilp,
+	struct inode		*parinode)
+{
+	int			error;
+	attrlist_cursor_kern_t	*cursor;
+	xfs_fsop_attrlist_handlereq_t al_hreq;
+	struct inode		*inode;
+	vnode_t			*vp;
+
+	error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg,
+					sizeof(xfs_fsop_attrlist_handlereq_t),
+					(xfs_fsop_handlereq_t *)&al_hreq,
+					&vp, &inode);
+	if (error)
+		return -error;
+
+	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
+	VOP_ATTR_LIST(vp, al_hreq.buffer, al_hreq.buflen, al_hreq.flags,
+			cursor, NULL, error);
+	VN_RELE(vp);
+	if (error)
+		return -error;
+	return 0;
+}
+
+STATIC int
+xfs_attrmulti_by_handle(
+	xfs_mount_t		*mp,
+	unsigned long		arg,
+	struct file		*parfilp,
+	struct inode		*parinode)
+{
+	int			error;
+	xfs_attr_multiop_t	*ops;
+	xfs_fsop_attrmulti_handlereq_t am_hreq;
+	struct inode		*inode;
+	vnode_t			*vp;
+	int			i, size;
+
+	error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg,
+					sizeof(xfs_fsop_attrmulti_handlereq_t),
+					(xfs_fsop_handlereq_t *)&am_hreq,
+					&vp, &inode);
+	if (error)
+		return -error;
+
+	size = am_hreq.opcount * sizeof(attr_multiop_t);
+	ops = (xfs_attr_multiop_t *)kmalloc(size, GFP_KERNEL);
+	if (!ops) {
+		VN_RELE(vp);
+		return -XFS_ERROR(ENOMEM);
+	}
+
+	if (copy_from_user(ops, am_hreq.ops, size)) {
+		kfree(ops);
+		VN_RELE(vp);
+		return -XFS_ERROR(EFAULT);
+	}
+
+	for (i = 0; i < am_hreq.opcount; i++) {
+		switch(ops[i].am_opcode) {
+		case ATTR_OP_GET:
+			VOP_ATTR_GET(vp,ops[i].am_attrname, ops[i].am_attrvalue,
+					&ops[i].am_length, ops[i].am_flags,
+					NULL, ops[i].am_error);
+			break;
+		case ATTR_OP_SET:
+			VOP_ATTR_SET(vp,ops[i].am_attrname, ops[i].am_attrvalue,
+					ops[i].am_length, ops[i].am_flags,
+					NULL, ops[i].am_error);
+			break;
+		case ATTR_OP_REMOVE:
+			VOP_ATTR_REMOVE(vp, ops[i].am_attrname, ops[i].am_flags,
+					NULL, ops[i].am_error);
+			break;
+		default:
+			ops[i].am_error = EINVAL;
+		}
+	}
+
+	if (copy_to_user(am_hreq.ops, ops, size))
+		error = -XFS_ERROR(EFAULT);
+
+	kfree(ops);
+	VN_RELE(vp);
+	return error;
+}
+
+/*
+ * Prototypes for a few of the stack-hungry cases that have
+ * their own functions.  Functions are defined after their use
+ * so gcc doesn't get fancy and inline them with -O3.
+ */
+
+int xfs_ioc_space(
+	bhv_desc_t		*bdp,
+	vnode_t			*vp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg);
+
+int xfs_ioc_bulkstat(
+	xfs_mount_t		*mp,
+	unsigned int		cmd,
+	unsigned long		arg);
+
+int xfs_ioc_fsgeometry_v1(
+	xfs_mount_t		*mp,
+	unsigned long		arg);
+
+int xfs_ioc_fsgeometry(
+	xfs_mount_t		*mp,
+	unsigned long		arg);
+
+int xfs_ioc_xattr(
+	vnode_t			*vp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg);
+
+int xfs_ioc_getbmap(
+	bhv_desc_t		*bdp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg);
+
+int xfs_ioc_getbmapx(
+	bhv_desc_t		*bdp,
+	unsigned long		arg);
+
+int
+xfs_ioctl(
+	bhv_desc_t		*bdp,
+	struct inode		*inode,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	int			error;
+	vnode_t			*vp;
+	xfs_inode_t		*ip;
+	xfs_mount_t		*mp;
+
+	vp = LINVFS_GET_VP(inode);
+
+	vn_trace_entry(vp, "xfs_ioctl", (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	switch (cmd) {
+
+	case XFS_IOC_ALLOCSP:
+	case XFS_IOC_FREESP:
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_ALLOCSP64:
+	case XFS_IOC_FREESP64:
+	case XFS_IOC_RESVSP64:
+	case XFS_IOC_UNRESVSP64:
+		return xfs_ioc_space(bdp, vp, filp, cmd, arg);
+
+	case XFS_IOC_DIOINFO: {
+		struct dioattr	da;
+
+		da.d_miniosz = mp->m_sb.sb_blocksize;
+		da.d_mem = mp->m_sb.sb_blocksize;
+
+		/*
+		 * this only really needs to be BBSIZE.
+		 * it is set to the file system block size to
+		 * avoid having to do block zeroing on short writes.
+		 */
+		da.d_maxiosz = XFS_FSB_TO_B(mp,
+				XFS_B_TO_FSBT(mp, KIO_MAX_ATOMIC_IO << 10));
+
+		if (copy_to_user((struct dioattr *)arg, &da, sizeof(da)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_FSBULKSTAT_SINGLE:
+	case XFS_IOC_FSBULKSTAT:
+	case XFS_IOC_FSINUMBERS:
+		return xfs_ioc_bulkstat(mp, cmd, arg);
+
+	case XFS_IOC_FSGEOMETRY_V1:
+		return xfs_ioc_fsgeometry_v1(mp, arg);
+
+	case XFS_IOC_FSGEOMETRY:
+		return xfs_ioc_fsgeometry(mp, arg);
+
+	case XFS_IOC_FSGETXATTR:
+	case XFS_IOC_FSSETXATTR:
+	case XFS_IOC_FSGETXATTRA:
+		return xfs_ioc_xattr(vp, filp, cmd, arg);
+
+	case XFS_IOC_FSSETDM: {
+		struct fsdmidata	dmi;
+
+		if (copy_from_user(&dmi, (struct fsdmidata *)arg, sizeof(dmi)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate,
+							NULL);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_GETBMAP:
+	case XFS_IOC_GETBMAPA:
+		return xfs_ioc_getbmap(bdp, filp, cmd, arg);
+
+	case XFS_IOC_GETBMAPX:
+		return xfs_ioc_getbmapx(bdp, arg);
+
+	case XFS_IOC_FD_TO_HANDLE:
+	case XFS_IOC_PATH_TO_HANDLE:
+	case XFS_IOC_PATH_TO_FSHANDLE:
+		return xfs_find_handle(cmd, arg);
+
+	case XFS_IOC_OPEN_BY_HANDLE:
+		return xfs_open_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_FSSETDM_BY_HANDLE:
+		return xfs_fssetdm_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_READLINK_BY_HANDLE:
+		return xfs_readlink_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_ATTRLIST_BY_HANDLE:
+		return xfs_attrlist_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_ATTRMULTI_BY_HANDLE:
+		return xfs_attrmulti_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_SWAPEXT: {
+		error = xfs_swapext((struct xfs_swapext *)arg);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_FSCOUNTS: {
+		xfs_fsop_counts_t out;
+
+		error = xfs_fs_counts(mp, &out);
+		if (error)
+			return -error;
+
+		if (copy_to_user((char *)arg, &out, sizeof(out)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_SET_RESBLKS: {
+		xfs_fsop_resblks_t inout;
+		__uint64_t	   in;
+
+		/* Only allow the sys admin to reserve space unless
+		 * unwritten extents are enabled.
+		 */
+		if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) &&
+		    !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&inout, (char *)arg, sizeof(inout)))
+			return -XFS_ERROR(EFAULT);
+
+		/* input parameter is passed in resblks field of structure */
+		in = inout.resblks;
+		error = xfs_reserve_blocks(mp, &in, &inout);
+
+		if (copy_to_user((char *)arg, &inout, sizeof(inout)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_GET_RESBLKS: {
+		xfs_fsop_resblks_t out;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		error = xfs_reserve_blocks(mp, NULL, &out);
+		if (error)
+			return -error;
+
+		if (copy_to_user((char *)arg, &out, sizeof(out)))
+			return -XFS_ERROR(EFAULT);
+
+		return 0;
+	}
+
+	case XFS_IOC_FSGROWFSDATA: {
+		xfs_growfs_data_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, (char *)arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_data(mp, &in);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_FSGROWFSLOG: {
+		xfs_growfs_log_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, (char *)arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_log(mp, &in);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_FSGROWFSRT: {
+		xfs_growfs_rt_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, (char *)arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_rt(mp, &in);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_FREEZE:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		xfs_fs_freeze(mp);
+		return 0;
+
+	case XFS_IOC_THAW:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		xfs_fs_thaw(mp);
+		return 0;
+
+	case XFS_IOC_ERROR_INJECTION: {
+		xfs_error_injection_t in;
+
+		if (copy_from_user(&in, (char *)arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_errortag_add(in.errtag, mp);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_ERROR_CLEARALL:
+		error = xfs_errortag_clearall(mp);
+		return -error;
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+int xfs_ioc_space(
+	bhv_desc_t		*bdp,
+	vnode_t			*vp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	xfs_flock64_t	bf;
+	int		attr_flags = 0;
+	int		error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -XFS_ERROR(EPERM);
+
+	if (filp->f_flags & O_RDONLY)
+		return -XFS_ERROR(EBADF);
+
+	if (vp->v_type != VREG)
+		return -XFS_ERROR(EINVAL);
+
+	if (copy_from_user(&bf, (xfs_flock64_t *)arg, sizeof(bf)))
+		return -XFS_ERROR(EFAULT);
+
+	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+		attr_flags |= ATTR_NONBLOCK;
+	if (filp->f_mode & FINVIS)
+		attr_flags |= ATTR_DMI;
+
+	error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos,
+					      NULL, attr_flags);
+	return -error;
+}
+
+int xfs_ioc_bulkstat(
+	xfs_mount_t		*mp,
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	xfs_fsop_bulkreq_t bulkreq;
+	int		count;		/* # of records returned */
+	xfs_ino_t	inlast;		/* last inode number */
+	int		done;
+	int		error;
+	/* done is set to 1 by bulkstat when there are no more stats
+	 * to get and it need not be called again (unused here, but
+	 * used in dmapi) */
+
+	/* Do not allow space reservation if this is not the admin and
+	 * unwritten extents are turned off.
+	 */
+	if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -XFS_ERROR(EIO);
+
+	if (copy_from_user(&bulkreq, (xfs_fsop_bulkreq_t *)arg,
+					sizeof(xfs_fsop_bulkreq_t)))
+		return -XFS_ERROR(EFAULT);
+
+	if (copy_from_user(&inlast, (__s64 *)bulkreq.lastip,
+						sizeof(__s64)))
+		return -XFS_ERROR(EFAULT);
+
+	if ((count = bulkreq.icount) <= 0)
+		return -XFS_ERROR(EINVAL);
+
+	if (cmd == XFS_IOC_FSINUMBERS)
+		error = xfs_inumbers(mp, NULL, &inlast, &count,
+						bulkreq.ubuffer);
+	else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
+		error = xfs_bulkstat_single(mp, &inlast,
+						bulkreq.ubuffer, &done);
+	else {	/* XFS_IOC_FSBULKSTAT */
+		if (count == 1 && inlast != 0) {
+			inlast++;
+			error = xfs_bulkstat_single(mp, &inlast,
+					bulkreq.ubuffer, &done);
+		} else {
+			error = xfs_bulkstat(mp, NULL, &inlast, &count,
+				(bulkstat_one_pf)xfs_bulkstat_one,
+				sizeof(xfs_bstat_t), bulkreq.ubuffer,
+				BULKSTAT_FG_QUICK, &done);
+		}
+	}
+
+	if (error)
+		return -error;
+
+	if (bulkreq.ocount != NULL) {
+		if (copy_to_user((xfs_ino_t *)bulkreq.lastip, &inlast,
+						sizeof(xfs_ino_t)))
+			return -XFS_ERROR(EFAULT);
+
+		if (copy_to_user((__s32 *)bulkreq.ocount, &count,
+						sizeof(count)))
+			return -XFS_ERROR(EFAULT);
+	}
+
+	return 0;
+}
+
+int xfs_ioc_fsgeometry_v1(
+	xfs_mount_t		*mp,
+	unsigned long		arg)
+{
+	xfs_fsop_geom_v1_t	fsgeo;
+	int			error;
+
+	error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3);
+	if (error)
+		return -error;
+
+	if (copy_to_user((xfs_fsop_geom_t *)arg, &fsgeo, sizeof(fsgeo)))
+		return -XFS_ERROR(EFAULT);
+	return 0;
+}
+
+int xfs_ioc_fsgeometry(
+	xfs_mount_t		*mp,
+	unsigned long		arg)
+{
+	xfs_fsop_geom_t fsgeo;
+	int		error;
+
+	error = xfs_fs_geometry(mp, &fsgeo, 4);
+	if (error)
+		return -error;
+
+	if (copy_to_user((xfs_fsop_geom_t *)arg, &fsgeo, sizeof(fsgeo)))
+		return -XFS_ERROR(EFAULT);
+	return 0;
+}
+
+int xfs_ioc_xattr(
+	vnode_t			*vp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	struct fsxattr	fa;
+	vattr_t		va;
+	int		error;
+
+	switch (cmd) {
+	case XFS_IOC_FSGETXATTR: {
+		va.va_mask = AT_XFLAGS|AT_EXTSIZE|AT_NEXTENTS;
+		VOP_GETATTR(vp, &va, 0, NULL, error);
+		if (error)
+			return -error;
+
+		fa.fsx_xflags	= va.va_xflags;
+		fa.fsx_extsize	= va.va_extsize;
+		fa.fsx_nextents = va.va_nextents;
+
+		if (copy_to_user((struct fsxattr *)arg, &fa, sizeof(fa)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_FSSETXATTR: {
+		int		attr_flags = 0;
+
+		if (copy_from_user(&fa, (struct fsxattr *)arg, sizeof(fa)))
+			return -XFS_ERROR(EFAULT);
+
+		va.va_mask = AT_XFLAGS | AT_EXTSIZE;
+		va.va_xflags  = fa.fsx_xflags;
+		va.va_extsize = fa.fsx_extsize;
+
+		if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+			attr_flags |= ATTR_NONBLOCK;
+
+		VOP_SETATTR(vp, &va, attr_flags, NULL, error);
+		return -error;
+	}
+
+	case XFS_IOC_FSGETXATTRA: {
+
+		va.va_mask = AT_XFLAGS|AT_EXTSIZE|AT_ANEXTENTS;
+		VOP_GETATTR(vp, &va, 0, NULL, error);
+		if (error)
+			return -error;
+
+		fa.fsx_xflags	= va.va_xflags;
+		fa.fsx_extsize	= va.va_extsize;
+		fa.fsx_nextents = va.va_anextents;
+
+		if (copy_to_user((struct fsxattr *)arg, &fa, sizeof(fa)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	default:
+		return -ENOTTY;
+
+	}
+}
+
+int xfs_ioc_getbmap(
+	bhv_desc_t		*bdp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	struct getbmap	bm;
+	int		iflags;
+	int		error;
+
+	if (copy_from_user(&bm, (struct getbmap *)arg, sizeof(bm)))
+		return -XFS_ERROR(EFAULT);
+
+	if (bm.bmv_count < 2)
+		return -XFS_ERROR(EINVAL);
+
+	iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
+	if (filp->f_mode & FINVIS)
+		iflags |= BMV_IF_NO_DMAPI_READ;
+
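+	/* The output extent records follow the header record in the
+	 * user's buffer, hence the "arg+1" below.
+	 */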
+	error = xfs_getbmap(bdp, &bm, (struct getbmap *)arg+1, iflags);
+	if (error)
+		return -error;
+
+	if (copy_to_user((struct getbmap *)arg, &bm, sizeof(bm)))
+		return -XFS_ERROR(EFAULT);
+	return 0;
+}
+
+int xfs_ioc_getbmapx(
+	bhv_desc_t		*bdp,
+	unsigned long		arg)
+{
+	struct getbmapx bmx;
+	struct getbmap	bm;
+	int		iflags;
+	int		error;
+
+	if (copy_from_user(&bmx, (struct getbmapx *)arg, sizeof(bmx)))
+		return -XFS_ERROR(EFAULT);
+
+	if (bmx.bmv_count < 2)
+		return -XFS_ERROR(EINVAL);
+
+	/*
+	 * Map input getbmapx structure to a getbmap
+	 * structure for xfs_getbmap.
+	 */
+	GETBMAP_CONVERT(bmx, bm);
+
+	iflags = bmx.bmv_iflags;
+
+	if (iflags & (~BMV_IF_VALID))
+		return -XFS_ERROR(EINVAL);
+
+	iflags |= BMV_IF_EXTENDED;
+
+	error = xfs_getbmap(bdp, &bm, (struct getbmapx *)arg+1, iflags);
+	if (error)
+		return -error;
+
+	GETBMAP_CONVERT(bm, bmx);
+
+	if (copy_to_user((struct getbmapx *)arg, &bmx, sizeof(bmx)))
+		return -XFS_ERROR(EFAULT);
+
+	return 0;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_iops.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_iops.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_iops.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_iops.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,871 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/locks.h>
+#include <linux/xattr.h>
+
+
+/*
+ * Pull the link count and size up from the xfs inode to the linux inode
+ */
+STATIC void
+validate_fields(
+	struct inode	*ip)
+{
+	vnode_t		*vp = LINVFS_GET_VP(ip);
+	vattr_t		va;
+	int		error;
+
+	va.va_mask = AT_NLINK|AT_SIZE;
+	VOP_GETATTR(vp, &va, ATTR_LAZY, NULL, error);
+	ip->i_nlink = va.va_nlink;
+	ip->i_size = va.va_size;
+	ip->i_blocks = va.va_nblocks;
+}
+
+#ifdef CONFIG_FS_POSIX_ACL
+/*
+ * Determine whether a process has a valid fs_struct (kernel daemons
+ * like knfsd don't have an fs_struct).
+ */
+STATIC inline int
+has_fs_struct(struct task_struct *task)
+{
+	return (task->fs != init_task.fs);
+}
+#endif
+
+STATIC int
+linvfs_mknod(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	int		mode,
+	int		rdev)
+{
+	struct inode	*ip;
+	vattr_t		va;
+	vnode_t		*vp = NULL, *dvp = LINVFS_GET_VP(dir);
+	xattr_exists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
+	int		have_default_acl = 0;
+	int		error = EINVAL;
+
+	if (test_default_acl)
+		have_default_acl = test_default_acl(dvp);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	/*
+	 * Conditionally compiled so that the ACL base kernel changes can be
+	 * split out into separate patches - remove this once MS_POSIXACL is
+	 * accepted, or some other way to implement this exists.
+	 */
+	if (IS_POSIXACL(dir) && !have_default_acl && has_fs_struct(current))
+		mode &= ~current->fs->umask;
+#endif
+
+	bzero(&va, sizeof(va));
+	va.va_mask = AT_TYPE|AT_MODE;
+	va.va_type = IFTOVT(mode);
+	va.va_mode = mode;
+
+	switch (mode & S_IFMT) {
+	case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
+		va.va_rdev = rdev;
+		va.va_mask |= AT_RDEV;
+		/*FALLTHROUGH*/
+	case S_IFREG:
+		VOP_CREATE(dvp, dentry, &va, &vp, NULL, error);
+		break;
+	case S_IFDIR:
+		VOP_MKDIR(dvp, dentry, &va, &vp, NULL, error);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	if (!error) {
+		ASSERT(vp);
+		ip = LINVFS_GET_IP(vp);
+		if (!ip) {
+			VN_RELE(vp);
+			return -ENOMEM;
+		}
+
+		if (S_ISCHR(mode) || S_ISBLK(mode))
+			ip->i_rdev = to_kdev_t(rdev);
+		/* linvfs_revalidate_core returns (-) errors */
+		error = -linvfs_revalidate_core(ip, ATTR_COMM);
+		validate_fields(dir);
+		d_instantiate(dentry, ip);
+		mark_inode_dirty_sync(ip);
+		mark_inode_dirty_sync(dir);
+	}
+
+	if (!error && have_default_acl) {
+		_ACL_DECL	(pdacl);
+
+		if (!_ACL_ALLOC(pdacl)) {
+			error = -ENOMEM;
+		} else {
+			if (_ACL_GET_DEFAULT(dvp, pdacl))
+				error = _ACL_INHERIT(vp, &va, pdacl);
+			VMODIFY(vp);
+			_ACL_FREE(pdacl);
+		}
+	}
+	return -error;
+}
+
+STATIC int
+linvfs_create(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	int		mode)
+{
+	return linvfs_mknod(dir, dentry, mode, 0);
+}
+
+STATIC int
+linvfs_mkdir(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	int		mode)
+{
+	return linvfs_mknod(dir, dentry, mode|S_IFDIR, 0);
+}
+
+
+STATIC struct dentry *
+linvfs_lookup(
+	struct inode	*dir,
+	struct dentry	*dentry)
+{
+	int		error;
+	vnode_t		*vp, *cvp;
+	struct inode	*ip = NULL;
+
+	if (dentry->d_name.len >= MAXNAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	cvp = NULL;
+	vp = LINVFS_GET_VP(dir);
+	VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error);
+	if (!error) {
+		ASSERT(cvp);
+		ip = LINVFS_GET_IP(cvp);
+		if (!ip) {
+			VN_RELE(cvp);
+			return ERR_PTR(-EACCES);
+		}
+		error = -linvfs_revalidate_core(ip, ATTR_COMM);
+	}
+	if (error && (error != ENOENT))
+		return ERR_PTR(-error);
+	d_add(dentry, ip);	/* Negative entry goes in if ip is NULL */
+	return NULL;
+}
+
+STATIC int
+linvfs_link(
+	struct dentry	*old_dentry,
+	struct inode	*dir,
+	struct dentry	*dentry)
+{
+	int		error;
+	vnode_t		*tdvp;	/* Target directory for new name/link */
+	vnode_t		*vp;	/* vp of name being linked */
+	struct inode	*ip;	/* inode of guy being linked to */
+
+	ip = old_dentry->d_inode;	/* inode being linked to */
+	if (S_ISDIR(ip->i_mode))
+		return -EPERM;
+
+	tdvp = LINVFS_GET_VP(dir);
+	vp = LINVFS_GET_VP(ip);
+
+	error = 0;
+	VOP_LINK(tdvp, vp, dentry, NULL, error);
+	if (!error) {
+		VMODIFY(tdvp);
+		VN_HOLD(vp);
+		validate_fields(ip);
+		d_instantiate(dentry, ip);
+		mark_inode_dirty_sync(ip);
+	}
+	return -error;
+}
+
+STATIC int
+linvfs_unlink(
+	struct inode	*dir,
+	struct dentry	*dentry)
+{
+	int		error = 0;
+	struct inode	*inode;
+	vnode_t		*dvp;	/* directory containing name to remove */
+
+	inode = dentry->d_inode;
+
+	dvp = LINVFS_GET_VP(dir);
+
+	VOP_REMOVE(dvp, dentry, NULL, error);
+
+	if (!error) {
+		validate_fields(dir);	/* For size only */
+		validate_fields(inode);
+		mark_inode_dirty_sync(inode);
+		mark_inode_dirty_sync(dir);
+	}
+
+	return -error;
+}
+
+STATIC int
+linvfs_symlink(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	const char	*symname)
+{
+	int		error;
+	vnode_t		*dvp;	/* directory containing name to remove */
+	vnode_t		*cvp;	/* used to lookup symlink to put in dentry */
+	vattr_t		va;
+	struct inode	*ip = NULL;
+
+	dvp = LINVFS_GET_VP(dir);
+
+	bzero(&va, sizeof(va));
+	va.va_type = VLNK;
+	va.va_mode = 0777 & ~current->fs->umask;
+	va.va_mask = AT_TYPE|AT_MODE; /* AT_PROJID? */
+
+	error = 0;
+	VOP_SYMLINK(dvp, dentry, &va, (char *)symname,
+							&cvp, NULL, error);
+	if (!error) {
+		ASSERT(cvp);
+		ASSERT(cvp->v_type == VLNK);
+		ip = LINVFS_GET_IP(cvp);
+		if (!ip) {
+			error = ENOMEM;
+			VN_RELE(cvp);
+		} else {
+			/* linvfs_revalidate_core returns (-) errors */
+			error = -linvfs_revalidate_core(ip, ATTR_COMM);
+			d_instantiate(dentry, ip);
+			validate_fields(dir);
+			mark_inode_dirty_sync(ip);
+			mark_inode_dirty_sync(dir);
+		}
+	}
+	return -error;
+}
+
+STATIC int
+linvfs_rmdir(
+	struct inode	*dir,
+	struct dentry	*dentry)
+{
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*dvp = LINVFS_GET_VP(dir);
+	int		error;
+
+	VOP_RMDIR(dvp, dentry, NULL, error);
+	if (!error) {
+		validate_fields(inode);
+		validate_fields(dir);
+		mark_inode_dirty_sync(inode);
+		mark_inode_dirty_sync(dir);
+	}
+	return -error;
+}
+
+STATIC int
+linvfs_rename(
+	struct inode	*odir,
+	struct dentry	*odentry,
+	struct inode	*ndir,
+	struct dentry	*ndentry)
+{
+	int		error;
+	vnode_t		*fvp;	/* from directory */
+	vnode_t		*tvp;	/* target directory */
+	struct inode	*new_inode = NULL;
+
+	fvp = LINVFS_GET_VP(odir);
+	tvp = LINVFS_GET_VP(ndir);
+
+	new_inode = ndentry->d_inode;
+
+	VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error);
+	if (error)
+		return -error;
+
+	if (new_inode) {
+		validate_fields(new_inode);
+	}
+
+	validate_fields(odir);
+	if (ndir != odir)
+		validate_fields(ndir);
+	mark_inode_dirty(ndir);
+	return 0;
+}
+
+STATIC int
+linvfs_readlink(
+	struct dentry	*dentry,
+	char		*buf,
+	int		size)
+{
+	vnode_t		*vp;
+	uio_t		uio;
+	iovec_t		iov;
+	int		error;
+
+	vp = LINVFS_GET_VP(dentry->d_inode);
+
+	iov.iov_base = buf;
+	iov.iov_len = size;
+
+	uio.uio_iov = &iov;
+	uio.uio_offset = 0;
+	uio.uio_segflg = UIO_USERSPACE;
+	uio.uio_resid = size;
+
+	VOP_READLINK(vp, &uio, NULL, error);
+	if (error)
+		return -error;
+
+	return (size - uio.uio_resid);
+}
+
+/*
+ * Careful here - this function can get called recursively, so
+ * we need to be very careful about how much stack we use.
+ * uio is kmalloced for this reason...
+ */
+STATIC int
+linvfs_follow_link(
+	struct dentry		*dentry,
+	struct nameidata	*nd)
+{
+	vnode_t			*vp;
+	uio_t			*uio;
+	iovec_t			iov;
+	int			error;
+	char			*link;
+
+	ASSERT(dentry);
+	ASSERT(nd);
+
+	link = (char *)kmalloc(MAXNAMELEN+1, GFP_KERNEL);
+	if (!link)
+		return -ENOMEM;
+
+	uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL);
+	if (!uio) {
+		kfree(link);
+		return -ENOMEM;
+	}
+
+	vp = LINVFS_GET_VP(dentry->d_inode);
+
+	iov.iov_base = link;
+	iov.iov_len = MAXNAMELEN;
+
+	uio->uio_iov = &iov;
+	uio->uio_offset = 0;
+	uio->uio_segflg = UIO_SYSSPACE;
+	uio->uio_resid = MAXNAMELEN;
+	uio->uio_fmode = 0;
+
+	VOP_READLINK(vp, uio, NULL, error);
+	if (error) {
+		kfree(uio);
+		kfree(link);
+		return -error;
+	}
+
+	link[MAXNAMELEN - uio->uio_resid] = '\0';
+	kfree(uio);
+
+	/* vfs_follow_link returns (-) errors */
+	error = vfs_follow_link(nd, link);
+	kfree(link);
+	return error;
+}
+
+STATIC int
+linvfs_permission(
+	struct inode	*inode,
+	int		mode)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	int		error;
+
+	mode <<= 6;		/* convert from linux to vnode access bits */
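+	/*
+	 * e.g. MAY_READ (4) becomes 0400, the owner-read bit that the
+	 * IRIX-style vnode access check expects (illustrative).
+	 */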
+	VOP_ACCESS(vp, mode, NULL, error);
+	return -error;
+}
+
+/* Brute force approach for now - copy data into linux inode
+ * from the results of a getattr. This gets called out of things
+ * like stat.
+ */
+int
+linvfs_revalidate_core(
+	struct inode	*inode,
+	int		flags)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	/* vn_revalidate returns (-) error so this is ok */
+	return vn_revalidate(vp, flags);
+}
+
+STATIC int
+linvfs_revalidate(
+	struct dentry	*dentry)
+{
+	vnode_t		*vp = LINVFS_GET_VP(dentry->d_inode);
+
+	if (unlikely(vp->v_flag & VMODIFIED)) {
+		return linvfs_revalidate_core(dentry->d_inode, 0);
+	}
+	return 0;
+}
+
+STATIC int
+linvfs_setattr(
+	struct dentry	*dentry,
+	struct iattr	*attr)
+{
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	vattr_t		vattr;
+	unsigned int	ia_valid = attr->ia_valid;
+	int		error;
+	int		flags = 0;
+
+	memset(&vattr, 0, sizeof(vattr_t));
+	if (ia_valid & ATTR_UID) {
+		vattr.va_mask |= AT_UID;
+		vattr.va_uid = attr->ia_uid;
+	}
+	if (ia_valid & ATTR_GID) {
+		vattr.va_mask |= AT_GID;
+		vattr.va_gid = attr->ia_gid;
+	}
+	if (ia_valid & ATTR_SIZE) {
+		vattr.va_mask |= AT_SIZE;
+		vattr.va_size = attr->ia_size;
+	}
+	if (ia_valid & ATTR_ATIME) {
+		vattr.va_mask |= AT_ATIME;
+		vattr.va_atime.tv_sec = attr->ia_atime;
+		vattr.va_atime.tv_nsec = 0;
+	}
+	if (ia_valid & ATTR_MTIME) {
+		vattr.va_mask |= AT_MTIME;
+		vattr.va_mtime.tv_sec = attr->ia_mtime;
+		vattr.va_mtime.tv_nsec = 0;
+	}
+	if (ia_valid & ATTR_CTIME) {
+		vattr.va_mask |= AT_CTIME;
+		vattr.va_ctime.tv_sec = attr->ia_ctime;
+		vattr.va_ctime.tv_nsec = 0;
+	}
+	if (ia_valid & ATTR_MODE) {
+		vattr.va_mask |= AT_MODE;
+		vattr.va_mode = attr->ia_mode;
+		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+			inode->i_mode &= ~S_ISGID;
+	}
+
+	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET))
+		flags = ATTR_UTIME;
+
+	VOP_SETATTR(vp, &vattr, flags, NULL, error);
+	if (error)
+		return(-error); /* Positive error up from XFS */
+	if (ia_valid & ATTR_SIZE) {
+		error = vmtruncate(inode, attr->ia_size);
+	}
+
+	if (!error) {
+		vn_revalidate(vp, 0);
+		mark_inode_dirty_sync(inode);
+	}
+	return error;
+}
+
+STATIC void
+linvfs_truncate(
+	struct inode		*inode)
+{
+	block_truncate_page(inode->i_mapping, inode->i_size, linvfs_get_block);
+}
+
+
+
+/*
+ * Extended attributes interfaces
+ */
+
+#define SYSTEM_NAME	"system."	/* VFS shared names/values */
+#define ROOT_NAME	"xfsroot."	/* XFS ondisk names/values */
+#define USER_NAME	"user."		/* user's own names/values */
+STATIC xattr_namespace_t xfs_namespace_array[] = {
+	{ .name= SYSTEM_NAME,	.namelen= sizeof(SYSTEM_NAME)-1,.exists= NULL },
+	{ .name= ROOT_NAME,	.namelen= sizeof(ROOT_NAME)-1,	.exists= NULL },
+	{ .name= USER_NAME,	.namelen= sizeof(USER_NAME)-1,	.exists= NULL },
+	{ .name= NULL }
+};
+xattr_namespace_t *xfs_namespaces = &xfs_namespace_array[0];
+
+#define POSIXACL_ACCESS		"posix_acl_access"
+#define POSIXACL_ACCESS_SIZE	(sizeof(POSIXACL_ACCESS)-1)
+#define POSIXACL_DEFAULT	"posix_acl_default"
+#define POSIXACL_DEFAULT_SIZE	(sizeof(POSIXACL_DEFAULT)-1)
+#define POSIXCAP		"posix_capabilities"
+#define POSIXCAP_SIZE		(sizeof(POSIXCAP)-1)
+#define POSIXMAC		"posix_mac"
+#define POSIXMAC_SIZE		(sizeof(POSIXMAC)-1)
+STATIC xattr_namespace_t sys_namespace_array[] = {
+	{ .name= POSIXACL_ACCESS,
+		.namelen= POSIXACL_ACCESS_SIZE,	 .exists= _ACL_ACCESS_EXISTS },
+	{ .name= POSIXACL_DEFAULT,
+		.namelen= POSIXACL_DEFAULT_SIZE, .exists= _ACL_DEFAULT_EXISTS },
+	{ .name= POSIXCAP,
+		.namelen= POSIXCAP_SIZE,	 .exists= _CAP_EXISTS },
+	{ .name= POSIXMAC,
+		.namelen= POSIXMAC_SIZE,	 .exists= _MAC_EXISTS },
+	{ .name= NULL }
+};
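+
+/*
+ * For example, an attribute name arriving from userspace is matched
+ * against the prefixes above and only the remainder is passed down:
+ * "user.foo" selects the USER namespace and VOP_ATTR_SET() sees
+ * "foo", while "system.posix_acl_access" is routed to the ACL
+ * handlers below ("foo" here is purely illustrative).
+ */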
+
+/*
+ * Some checks to prevent people abusing EAs to get over quota:
+ * - Don't allow modifying user EAs on devices/symlinks;
+ * - Don't allow modifying user EAs if sticky bit set;
+ */
+STATIC int
+capable_user_xattr(
+	struct inode	*inode)
+{
+	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
+	    !capable(CAP_SYS_ADMIN))
+		return 0;
+	if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
+	    (current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		return 0;
+	return 1;
+}
+
+STATIC int
+linvfs_setxattr(
+	struct dentry	*dentry,
+	const char	*name,
+	void		*data,
+	size_t		size,
+	int		flags)
+{
+	int		error;
+	int		xflags = 0;
+	char		*p = (char *)name;
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	if (strncmp(name, xfs_namespaces[SYSTEM_NAMES].name,
+			xfs_namespaces[SYSTEM_NAMES].namelen) == 0) {
+		error = -EINVAL;
+		if (flags & XATTR_CREATE)
+			 return error;
+		error = -ENOATTR;
+		p += xfs_namespaces[SYSTEM_NAMES].namelen;
+		if (strcmp(p, POSIXACL_ACCESS) == 0) {
+			if (vp->v_flag & VMODIFIED) {
+				error = linvfs_revalidate_core(inode, 0);
+				if (error)
+					return error;
+			}
+			error = xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS);
+			if (!error) {
+				VMODIFY(vp);
+				error = linvfs_revalidate_core(inode, 0);
+			}
+		}
+		else if (strcmp(p, POSIXACL_DEFAULT) == 0) {
+			error = linvfs_revalidate_core(inode, 0);
+			if (error)
+				return error;
+			error = xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT);
+			if (!error) {
+				VMODIFY(vp);
+				error = linvfs_revalidate_core(inode, 0);
+			}
+		}
+		else if (strcmp(p, POSIXCAP) == 0) {
+			error = xfs_cap_vset(vp, data, size);
+		}
+		return error;
+	}
+
+	/* Convert Linux syscall to XFS internal ATTR flags */
+	if (flags & XATTR_CREATE)
+		xflags |= ATTR_CREATE;
+	if (flags & XATTR_REPLACE)
+		xflags |= ATTR_REPLACE;
+
+	if (strncmp(name, xfs_namespaces[ROOT_NAMES].name,
+			xfs_namespaces[ROOT_NAMES].namelen) == 0) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		xflags |= ATTR_ROOT;
+		p += xfs_namespaces[ROOT_NAMES].namelen;
+		VOP_ATTR_SET(vp, p, data, size, xflags, NULL, error);
+		return -error;
+	}
+	if (strncmp(name, xfs_namespaces[USER_NAMES].name,
+			xfs_namespaces[USER_NAMES].namelen) == 0) {
+		if (!capable_user_xattr(inode))
+			return -EPERM;
+		p += xfs_namespaces[USER_NAMES].namelen;
+		VOP_ATTR_SET(vp, p, data, size, xflags, NULL, error);
+		return -error;
+	}
+	return -ENOATTR;
+}
+
+STATIC ssize_t
+linvfs_getxattr(
+	struct dentry	*dentry,
+	const char	*name,
+	void		*data,
+	size_t		size)
+{
+	ssize_t		error;
+	int		xflags = 0;
+	char		*p = (char *)name;
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	if (strncmp(name, xfs_namespaces[SYSTEM_NAMES].name,
+			xfs_namespaces[SYSTEM_NAMES].namelen) == 0) {
+		error = -ENOATTR;
+		p += xfs_namespaces[SYSTEM_NAMES].namelen;
+		if (strcmp(p, POSIXACL_ACCESS) == 0) {
+			if (vp->v_flag & VMODIFIED) {
+				error = linvfs_revalidate_core(inode, 0);
+				if (error)
+					return error;
+			}
+			error = xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS);
+		}
+		else if (strcmp(p, POSIXACL_DEFAULT) == 0) {
+			if (vp->v_flag & VMODIFIED) {
+				error = linvfs_revalidate_core(inode, 0);
+				if (error)
+					return error;
+			}
+			error = xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT);
+		}
+		else if (strcmp(p, POSIXCAP) == 0) {
+			error = xfs_cap_vget(vp, data, size);
+		}
+		return error;
+	}
+
+	/* Convert Linux syscall to XFS internal ATTR flags */
+	if (!size)
+		xflags |= ATTR_KERNOVAL;
+
+	if (strncmp(name, xfs_namespaces[ROOT_NAMES].name,
+			xfs_namespaces[ROOT_NAMES].namelen) == 0) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		xflags |= ATTR_ROOT;
+		p += xfs_namespaces[ROOT_NAMES].namelen;
+		VOP_ATTR_GET(vp, p, data, (int *)&size, xflags, NULL, error);
+		if (!error)
+			error = -size;
+		return -error;
+	}
+	if (strncmp(name, xfs_namespaces[USER_NAMES].name,
+			xfs_namespaces[USER_NAMES].namelen) == 0) {
+		p += xfs_namespaces[USER_NAMES].namelen;
+		if (!capable_user_xattr(inode))
+			return -EPERM;
+		VOP_ATTR_GET(vp, p, data, (int *)&size, xflags, NULL, error);
+		if (!error)
+			error = -size;
+		return -error;
+	}
+	return -ENOATTR;
+}
+
+
+STATIC ssize_t
+linvfs_listxattr(
+	struct dentry		*dentry,
+	char			*data,
+	size_t			size)
+{
+	ssize_t			error;
+	int			result = 0;
+	int			xflags = ATTR_KERNAMELS;
+	char			*k = data;
+	attrlist_cursor_kern_t	cursor;
+	xattr_namespace_t	*sys;
+	vnode_t			*vp;
+
+	vp = LINVFS_GET_VP(dentry->d_inode);
+
+	if (!size)
+		xflags |= ATTR_KERNOVAL;
+	if (capable(CAP_SYS_ADMIN))
+		xflags |= ATTR_KERNFULLS;
+
+	memset(&cursor, 0, sizeof(cursor));
+	VOP_ATTR_LIST(vp, data, size, xflags, &cursor, NULL, error);
+	if (error > 0)
+		return -error;
+	result += -error;
+
+	k += result;		/* advance start of our buffer */
+	for (sys = &sys_namespace_array[0]; sys->name != NULL; sys++) {
+		if (sys->exists == NULL || !sys->exists(vp))
+			continue;
+		result += xfs_namespaces[SYSTEM_NAMES].namelen;
+		result += sys->namelen + 1;
+		if (size) {
+			if (result > size)
+				return -ERANGE;
+			strcpy(k, xfs_namespaces[SYSTEM_NAMES].name);
+			k += xfs_namespaces[SYSTEM_NAMES].namelen;
+			strcpy(k, sys->name);
+			k += sys->namelen + 1;
+		}
+	}
+	return result;
+}
+
+STATIC int
+linvfs_removexattr(
+	struct dentry	*dentry,
+	const char	*name)
+{
+	int		error;
+	int		xflags = 0;
+	char		*p = (char *)name;
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	if (strncmp(name, xfs_namespaces[SYSTEM_NAMES].name,
+			xfs_namespaces[SYSTEM_NAMES].namelen) == 0) {
+		error = -ENOATTR;
+		p += xfs_namespaces[SYSTEM_NAMES].namelen;
+		if (strcmp(p, POSIXACL_ACCESS) == 0)
+			error = xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
+		else if (strcmp(p, POSIXACL_DEFAULT) == 0)
+			error = xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT);
+		else if (strcmp(p, POSIXCAP) == 0)
+			error = xfs_cap_vremove(vp);
+		return error;
+	}
+
+	if (strncmp(name, xfs_namespaces[ROOT_NAMES].name,
+			xfs_namespaces[ROOT_NAMES].namelen) == 0) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		xflags |= ATTR_ROOT;
+		p += xfs_namespaces[ROOT_NAMES].namelen;
+		VOP_ATTR_REMOVE(vp, p, xflags, NULL, error);
+		return -error;
+	}
+	if (strncmp(name, xfs_namespaces[USER_NAMES].name,
+			xfs_namespaces[USER_NAMES].namelen) == 0) {
+		p += xfs_namespaces[USER_NAMES].namelen;
+		if (!capable_user_xattr(inode))
+			return -EPERM;
+		VOP_ATTR_REMOVE(vp, p, xflags, NULL, error);
+		return -error;
+	}
+	return -ENOATTR;
+}
+
+
+struct inode_operations linvfs_file_inode_operations =
+{
+	.permission		= linvfs_permission,
+	.truncate		= linvfs_truncate,
+	.revalidate		= linvfs_revalidate,
+	.setattr		= linvfs_setattr,
+	.setxattr		= linvfs_setxattr,
+	.getxattr		= linvfs_getxattr,
+	.listxattr		= linvfs_listxattr,
+	.removexattr		= linvfs_removexattr,
+};
+
+struct inode_operations linvfs_dir_inode_operations =
+{
+	.create			= linvfs_create,
+	.lookup			= linvfs_lookup,
+	.link			= linvfs_link,
+	.unlink			= linvfs_unlink,
+	.symlink		= linvfs_symlink,
+	.mkdir			= linvfs_mkdir,
+	.rmdir			= linvfs_rmdir,
+	.mknod			= linvfs_mknod,
+	.rename			= linvfs_rename,
+	.permission		= linvfs_permission,
+	.revalidate		= linvfs_revalidate,
+	.setattr		= linvfs_setattr,
+	.setxattr		= linvfs_setxattr,
+	.getxattr		= linvfs_getxattr,
+	.listxattr		= linvfs_listxattr,
+	.removexattr		= linvfs_removexattr,
+};
+
+struct inode_operations linvfs_symlink_inode_operations =
+{
+	.readlink		= linvfs_readlink,
+	.follow_link		= linvfs_follow_link,
+	.permission		= linvfs_permission,
+	.revalidate		= linvfs_revalidate,
+	.setattr		= linvfs_setattr,
+	.setxattr		= linvfs_setxattr,
+	.getxattr		= linvfs_getxattr,
+	.listxattr		= linvfs_listxattr,
+	.removexattr		= linvfs_removexattr,
+};
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_iops.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_iops.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_iops.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_iops.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_IOPS_H__
+#define __XFS_IOPS_H__
+
+/*
+ * Extended system attributes.
+ * So far only POSIX ACLs are supported, but this will need to
+ * grow in time (capabilities, mandatory access control, etc).
+ */
+#define XFS_SYSTEM_NAMESPACE	SYSTEM_POSIXACL
+
+/*
+ * Define a table of the namespaces XFS supports
+ */
+typedef int (*xattr_exists_t)(vnode_t *);
+
+typedef struct xattr_namespace {
+	char		*name;
+	unsigned int	namelen;
+	xattr_exists_t	exists;
+} xattr_namespace_t;
+
+#define SYSTEM_NAMES	0
+#define ROOT_NAMES	1
+#define USER_NAMES	2
+extern struct xattr_namespace *xfs_namespaces;
+
+
+extern struct inode_operations linvfs_file_inode_operations;
+extern struct inode_operations linvfs_dir_inode_operations;
+extern struct inode_operations linvfs_symlink_inode_operations;
+
+extern struct file_operations linvfs_file_operations;
+extern struct file_operations linvfs_dir_operations;
+
+extern struct address_space_operations linvfs_aops;
+
+extern int linvfs_revalidate_core(struct inode *, int);
+extern int linvfs_get_block(struct inode *, long, struct buffer_head *, int);
+
+#endif /* __XFS_IOPS_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_linux.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_linux.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_linux.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_linux.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_LINUX__
+#define __XFS_LINUX__
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/swap.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+
+#include <asm/page.h>
+#include <asm/div64.h>
+#include <asm/param.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+
+#include <linux/xfs_behavior.h>
+#include <linux/xfs_vfs.h>
+#include <linux/xfs_cred.h>
+#include <linux/xfs_vnode.h>
+#include <linux/xfs_stats.h>
+#include <linux/xfs_sysctl.h>
+#include <linux/xfs_iops.h>
+#include <linux/xfs_super.h>
+#include <linux/xfs_globals.h>
+#include <linux/xfs_fs_subr.h>
+
+#include <pagebuf/page_buf.h>
+
+#ifndef STATIC
+#define STATIC static
+#endif
+
+typedef struct xfs_dirent {		/* data from readdir() */
+	xfs_ino_t	d_ino;		/* inode number of entry */
+	xfs_off_t	d_off;		/* offset of disk directory entry */
+	unsigned short	d_reclen;	/* length of this record */
+	char		d_name[1];	/* name of file */
+} xfs_dirent_t;
+
+#define DIRENTBASESIZE		(((xfs_dirent_t *)0)->d_name - (char *)0)
+#define DIRENTSIZE(namelen)	\
+	((DIRENTBASESIZE + (namelen) + \
+		sizeof(xfs_off_t)) & ~(sizeof(xfs_off_t) - 1))
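+
+/*
+ * Worked example (assuming 8-byte d_ino and d_off, so DIRENTBASESIZE
+ * is 18): DIRENTSIZE(4) = (18 + 4 + 8) & ~7 = 24, i.e. the record is
+ * padded past the name and its NUL to the next 8-byte boundary.
+ */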
+
+#define NBPP		PAGE_SIZE
+#define DPPSHFT		(PAGE_SHIFT - 9)
+#define NDPP		(1 << (PAGE_SHIFT - 9))
+#define dtop(DD)	(((DD) + NDPP - 1) >> DPPSHFT)
+#define dtopt(DD)	((DD) >> DPPSHFT)
+#define dpoff(DD)	((DD) & (NDPP-1))
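+
+/*
+ * For example, with 4K pages DPPSHFT is 3 and NDPP is 8 (512-byte
+ * disk blocks per page), so dtop(9) = 2 whole pages, dtopt(9) = 1
+ * and dpoff(9) = 1.
+ */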
+
+#define NBBY		8		/* number of bits per byte */
+#define NBPC		PAGE_SIZE	/* Number of bytes per click */
+#define BPCSHIFT	PAGE_SHIFT	/* LOG2(NBPC) if exact */
+
+/*
+ * Size of block device i/o is parameterized here.
+ * Currently the system supports page-sized i/o.
+ */
+#define BLKDEV_IOSHIFT		BPCSHIFT
+#define BLKDEV_IOSIZE		(1<<BLKDEV_IOSHIFT)
+/* number of BB's per block device block */
+#define BLKDEV_BB		BTOBB(BLKDEV_IOSIZE)
+
+/* bytes to clicks */
+#define btoc(x)		(((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
+#define btoct(x)	((__psunsigned_t)(x)>>BPCSHIFT)
+#define btoc64(x)	(((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
+#define btoct64(x)	((__uint64_t)(x)>>BPCSHIFT)
+#define io_btoc(x)	(((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT)
+#define io_btoct(x)	((__psunsigned_t)(x)>>IO_BPCSHIFT)
+
+/* off_t bytes to clicks */
+#define offtoc(x)	(((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
+#define offtoct(x)	((xfs_off_t)(x)>>BPCSHIFT)
+
+/* clicks to off_t bytes */
+#define ctooff(x)	((xfs_off_t)(x)<<BPCSHIFT)
+
+/* clicks to bytes */
+#define ctob(x)		((__psunsigned_t)(x)<<BPCSHIFT)
+#define ctob64(x)	((__uint64_t)(x)<<BPCSHIFT)
+#define io_ctob(x)	((__psunsigned_t)(x)<<IO_BPCSHIFT)
+
+#ifndef CELL_CAPABLE
+#define FSC_NOTIFY_NAME_CHANGED(vp)
+#endif
+
+#ifndef ENOATTR
+#define ENOATTR		ENODATA		/* Attribute not found */
+#endif
+
+/* Note: EWRONGFS never visible outside the kernel */
+#define EWRONGFS	EINVAL		/* Mount with wrong filesystem type */
+
+/*
+ * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't
+ *     return codes out of its known range in errno.
+ * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't
+ *     conflict with any code we use already or any code a driver may use)
+ * XXX Some options (currently we do #2):
+ *	1/ New error code ["Filesystem is corrupted", _after_ glibc updated]
+ *	2/ 990 ["Unknown error 990"]
+ *	3/ EUCLEAN ["Structure needs cleaning"]
+ *	4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace]
+ */
+#define EFSCORRUPTED	990		/* Filesystem is corrupted */
+
+#define SYNCHRONIZE()	barrier()
+#define lbolt		jiffies
+#define rootdev		ROOT_DEV
+#define __return_address __builtin_return_address(0)
+#define LONGLONG_MAX	9223372036854775807LL	/* max "long long int" */
+#define nopkg()		( ENOSYS )
+
+/* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */
+/* we may well need to fine-tune this if it ever becomes an issue.  */
+#define DQUOT_MAX_HEURISTIC	1024	/* NR_DQUOTS */
+#define ndquot			DQUOT_MAX_HEURISTIC
+
+/* IRIX uses the current size of the name cache to guess a good value */
+/* - this isn't the same but is a good enough starting point for now. */
+#define DQUOT_HASH_HEURISTIC	files_stat.nr_files
+
+/* IRIX inodes maintain the project ID also, zero this field on Linux */
+#define DEFAULT_PROJID	0
+#define dfltprid	DEFAULT_PROJID
+
+#define MAXNAMELEN	256
+#define MAXPATHLEN	1024
+
+#define FINVIS		0x0100	/* don't update timestamps - XFS */
+
+#define MIN(a,b)	(min(a,b))
+#define MAX(a,b)	(max(a,b))
+#define howmany(x, y)	(((x)+((y)-1))/(y))
+#define roundup(x, y)	((((x)+((y)-1))/(y))*(y))
+
+/* Move the kernel do_div definition off to one side */
+
+#if defined __i386__
+/* For ia32 we need to pull some tricks to get past various versions
+ * of the compiler which do not like us using do_div in the middle
+ * of large functions.
+ */
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+	__u32	mod;
+
+	switch (n) {
+		case 4:
+			mod = *(__u32 *)a % b;
+			*(__u32 *)a = *(__u32 *)a / b;
+			return mod;
+		case 8:
+			{
+			unsigned long __upper, __low, __high, __mod;
+			__u64	c = *(__u64 *)a;
+			__upper = __high = c >> 32;
+			__low = c;
+			if (__high) {
+				__upper = __high % (b);
+				__high = __high / (b);
+			}
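+			/*
+			 * Divide the remaining (remainder:low) pair by b:
+			 * the quotient lands in __low, the remainder in
+			 * __mod; the second asm repacks the 64-bit
+			 * quotient into c.
+			 */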
+			asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+			asm("":"=A" (c):"a" (__low),"d" (__high));
+			*(__u64 *)a = c;
+			return __mod;
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+	switch (n) {
+		case 4:
+			return *(__u32 *)a % b;
+		case 8:
+			{
+			unsigned long __upper, __low, __high, __mod;
+			__u64	c = *(__u64 *)a;
+			__upper = __high = c >> 32;
+			__low = c;
+			if (__high) {
+				__upper = __high % (b);
+				__high = __high / (b);
+			}
+			asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+			asm("":"=A" (c):"a" (__low),"d" (__high));
+			return __mod;
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+#else
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+	__u32	mod;
+
+	switch (n) {
+		case 4:
+			mod = *(__u32 *)a % b;
+			*(__u32 *)a = *(__u32 *)a / b;
+			return mod;
+		case 8:
+			mod = do_div(*(__u64 *)a, b);
+			return mod;
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+	switch (n) {
+		case 4:
+			return *(__u32 *)a % b;
+		case 8:
+			{
+			__u64	c = *(__u64 *)a;
+			return do_div(c, b);
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+#endif
+
+#undef do_div
+#define do_div(a, b)	xfs_do_div(&(a), (b), sizeof(a))
+#define do_mod(a, b)	xfs_do_mod(&(a), (b), sizeof(a))
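+
+/*
+ * Usage note: do_div() divides the dividend in place and returns the
+ * remainder, so with __uint64_t x = 1000, do_div(x, 512) leaves
+ * x == 1 and returns 488; do_mod() returns the remainder without
+ * modifying its argument.
+ */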
+
+static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
+{
+	x += y - 1;
+	do_div(x, y);
+	return(x * y);
+}
+
+#endif /* __XFS_LINUX__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_lrw.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_lrw.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_lrw.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_lrw.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,1812 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+/*
+ *  fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff)
+ *
+ */
+
+#include <xfs.h>
+#include <linux/pagemap.h>
+#include <linux/capability.h>
+
+
+#define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
+						<< mp->m_writeio_log)
+#define XFS_STRAT_WRITE_IMAPS	2
+
+STATIC int xfs_iomap_read(xfs_iocore_t *, loff_t, size_t, int, pb_bmap_t *,
+			int *, struct pm *);
+STATIC int xfs_iomap_write(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *,
+			int *, int, struct pm *);
+STATIC int xfs_iomap_write_delay(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *,
+			int *, int, int);
+STATIC int xfs_iomap_write_direct(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *,
+			int *, int, int);
+STATIC int _xfs_imap_to_bmap(xfs_iocore_t *, xfs_off_t, xfs_bmbt_irec_t *,
+			pb_bmap_t *, int, int);
+
+
+/*
+ *	xfs_iozero
+ *
+ *	xfs_iozero zeroes the specified byte range of the file supplied,
+ *	and marks all the affected blocks as valid and modified.  If
+ *	an affected block is not allocated, it will be allocated.  If
+ *	an affected block is not completely overwritten, and is not
+ *	valid before the operation, it will be read from disk before
+ *	being partially zeroed.
+ */
+STATIC int
+xfs_iozero(
+	struct inode		*ip,	/* inode 			*/
+	loff_t			pos,	/* offset in file		*/
+	size_t			count,	/* size of data to zero		*/
+	loff_t			end_size)	/* max file size to set */
+{
+	unsigned		bytes;
+	struct page		*page;
+	struct address_space	*mapping;
+	char			*kaddr;
+	int			status;
+
+	mapping = ip->i_mapping;
+	do {
+		unsigned long index, offset;
+
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		status = -ENOMEM;
+		page = grab_cache_page(mapping, index);
+		if (!page)
+			break;
+
+		kaddr = kmap(page);
+		status = mapping->a_ops->prepare_write(NULL, page, offset,
+							offset + bytes);
+		if (status) {
+			goto unlock;
+		}
+
+		memset((void *) (kaddr + offset), 0, bytes);
+		flush_dcache_page(page);
+		status = mapping->a_ops->commit_write(NULL, page, offset,
+							offset + bytes);
+		if (!status) {
+			pos += bytes;
+			count -= bytes;
+			if (pos > ip->i_size)
+				ip->i_size = pos < end_size ? pos : end_size;
+		}
+
+unlock:
+		kunmap(page);
+		unlock_page(page);
+		page_cache_release(page);
+		if (status)
+			break;
+	} while (count);
+
+	return (-status);
+}
+
+ssize_t			/* bytes read, or (-)  error */
+xfs_read(
+	bhv_desc_t	*bdp,
+	struct file	*file,
+	char		*buf,
+	size_t		size,
+	loff_t		*offset,
+	cred_t		*credp)
+{
+	ssize_t		ret;
+	xfs_fsize_t	n;
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp;
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	XFS_STATS_INC(xfsstats.xs_read_calls);
+
+	if (file->f_flags & O_DIRECT) {
+		if (((__psint_t)buf & BBMASK) ||
+		    (*offset & mp->m_blockmask) ||
+		    (size & mp->m_blockmask)) {
+			if (*offset == ip->i_d.di_size) {
+				return (0);
+			}
+			return -XFS_ERROR(EINVAL);
+		}
+	}
+
+
+	n = XFS_MAX_FILE_OFFSET - *offset;
+	if ((n <= 0) || (size == 0))
+		return 0;
+
+	if (n < size)
+		size = n;
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		return -EIO;
+	}
+
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+	if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
+	    !(file->f_mode & FINVIS)) {
+		int error;
+		vrwlock_t locktype = VRWLOCK_READ;
+
+		error = xfs_dm_send_data_event(DM_EVENT_READ, bdp,
+					     *offset, size,
+					     FILP_DELAY_FLAG(file),
+					     &locktype);
+		if (error) {
+			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+			return -error;
+		}
+	}
+
+	ret = generic_file_read(file, buf, size, offset);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	XFS_STATS_ADD(xfsstats.xs_read_bytes, ret);
+
+	if (!(file->f_mode & FINVIS))
+		xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
+
+	return ret;
+}
+
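+	/* e.g. Linux MAY_READ (4) << 6 becomes the octal 0400 owner-read
+	 * style bit that VOP_ACCESS() expects */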
+/*
+ * This routine is called to handle zeroing any space in the last
+ * block of the file that is beyond the EOF.  We do this since the
+ * size is being increased without writing anything to that block
+ * and we don't want anyone to read the garbage on the disk.
+ */
+STATIC int				/* error (positive) */
+xfs_zero_last_block(
+	struct inode	*ip,
+	xfs_iocore_t	*io,
+	xfs_off_t	offset,
+	xfs_fsize_t	isize,
+	xfs_fsize_t	end_size,
+	struct pm	*pmp)
+{
+	xfs_fileoff_t	last_fsb;
+	xfs_mount_t	*mp;
+	int		nimaps;
+	int		zero_offset;
+	int		zero_len;
+	int		isize_fsb_offset;
+	int		error = 0;
+	xfs_bmbt_irec_t imap;
+	loff_t		loff;
+	size_t		lsize;
+
+	ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
+	ASSERT(offset > isize);
+
+	mp = io->io_mount;
+
+	isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
+	if (isize_fsb_offset == 0) {
+		/*
+		 * There are no extra bytes in the last block on disk to
+		 * zero, so return.
+		 */
+		return 0;
+	}
+
+	last_fsb = XFS_B_TO_FSBT(mp, isize);
+	nimaps = 1;
+	error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
+			  &nimaps, NULL);
+	if (error) {
+		return error;
+	}
+	ASSERT(nimaps > 0);
+	/*
+	 * If the block underlying isize is just a hole, then there
+	 * is nothing to zero.
+	 */
+	if (imap.br_startblock == HOLESTARTBLOCK) {
+		return 0;
+	}
+	/*
+	 * Get a pagebuf for the last block, zero the part beyond the
+	 * EOF, and write it out sync.	We need to drop the ilock
+	 * while we do this so we don't deadlock when the buffer cache
+	 * calls back to us.
+	 */
+	XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
+	loff = XFS_FSB_TO_B(mp, last_fsb);
+	lsize = XFS_FSB_TO_B(mp, 1);
+
+	zero_offset = isize_fsb_offset;
+	zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
+
+	error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
+
+	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+	ASSERT(error >= 0);
+	return error;
+}
+
+/*
+ * Zero any on disk space between the current EOF and the new,
+ * larger EOF.	This handles the normal case of zeroing the remainder
+ * of the last block in the file and the unusual case of zeroing blocks
+ * out beyond the size of the file.  This second case only happens
+ * with fixed size extents and when the system crashes before the inode
+ * size was updated but after blocks were allocated.  Holes in the
+ * range are left alone as holes; only allocated blocks are zeroed.
+ */
+
+int					/* error (positive) */
+xfs_zero_eof(
+	vnode_t		*vp,
+	xfs_iocore_t	*io,
+	xfs_off_t	offset,		/* starting I/O offset */
+	xfs_fsize_t	isize,		/* current inode size */
+	xfs_fsize_t	end_size,	/* terminal inode size */
+	struct pm	*pmp)
+{
+	struct inode	*ip = LINVFS_GET_IP(vp);
+	xfs_fileoff_t	start_zero_fsb;
+	xfs_fileoff_t	end_zero_fsb;
+	xfs_fileoff_t	prev_zero_fsb;
+	xfs_fileoff_t	zero_count_fsb;
+	xfs_fileoff_t	last_fsb;
+	xfs_extlen_t	buf_len_fsb;
+	xfs_extlen_t	prev_zero_count;
+	xfs_mount_t	*mp;
+	int		nimaps;
+	int		error = 0;
+	xfs_bmbt_irec_t imap;
+	loff_t		loff;
+	size_t		lsize;
+
+	ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
+	ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+
+	mp = io->io_mount;
+
+	/*
+	 * First handle zeroing the block on which isize resides.
+	 * We only zero a part of that block so it is handled specially.
+	 */
+	error = xfs_zero_last_block(ip, io, offset, isize, end_size, pmp);
+	if (error) {
+		ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
+		ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+		return error;
+	}
+
+	/*
+	 * Calculate the range between the new size and the old
+	 * where blocks needing to be zeroed may exist.	 To get the
+	 * block where the last byte in the file currently resides,
+	 * we need to subtract one from the size and truncate back
+	 * to a block boundary.	 We subtract 1 in case the size is
+	 * exactly on a block boundary.
+	 */
+	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
+	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
+
+	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
+	if (last_fsb == end_zero_fsb) {
+		/*
+		 * The size was only incremented on its last block.
+		 * We took care of that above, so just return.
+		 */
+		return 0;
+	}
+
+	ASSERT(start_zero_fsb <= end_zero_fsb);
+	prev_zero_fsb = NULLFILEOFF;
+	prev_zero_count = 0;
+	/*
+	 * Maybe change this loop to do the bmapi call and
+	 * loop while we split the mappings into pagebufs?
+	 */
+	while (start_zero_fsb <= end_zero_fsb) {
+		nimaps = 1;
+		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+		error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
+				  0, NULL, 0, &imap, &nimaps, NULL);
+		if (error) {
+			ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
+			ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+			return error;
+		}
+		ASSERT(nimaps > 0);
+
+		if (imap.br_startblock == HOLESTARTBLOCK) {
+			/*
+			 * The range sits on a hole, so there is
+			 * nothing to zero - reset the previous-zero
+			 * state and skip ahead to the next extent.
+			 */
+			prev_zero_fsb = NULLFILEOFF;
+			prev_zero_count = 0;
+			start_zero_fsb = imap.br_startoff +
+					 imap.br_blockcount;
+			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+			continue;
+		}
+
+		/*
+		 * There are blocks in the range requested.
+		 * Zero them a single write at a time.	We actually
+		 * don't zero the entire range returned if it is
+		 * too big and simply loop around to get the rest.
+		 * That is not the most efficient thing to do, but it
+		 * is simple and this path should not be exercised often.
+		 */
+		buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
+					      mp->m_writeio_blocks << 8);
+		/*
+		 * Drop the inode lock while we're doing the I/O.
+		 * We'll still have the iolock to protect us.
+		 */
+		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+
+		loff = XFS_FSB_TO_B(mp, start_zero_fsb);
+		lsize = XFS_FSB_TO_B(mp, buf_len_fsb);
+
+		error = xfs_iozero(ip, loff, lsize, end_size);
+
+		if (error) {
+			goto out_lock;
+		}
+
+		prev_zero_fsb = start_zero_fsb;
+		prev_zero_count = buf_len_fsb;
+		start_zero_fsb = imap.br_startoff + buf_len_fsb;
+		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+
+		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+	}
+
+	return 0;
+
+out_lock:
+
+	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+	ASSERT(error >= 0);
+	return error;
+}
+
+ssize_t				/* bytes written, or (-) error */
+xfs_write(
+	bhv_desc_t	*bdp,
+	struct file	*file,
+	const char	*buf,
+	size_t		size,
+	loff_t		*offset,
+	cred_t		*credp)
+{
+	xfs_inode_t	*xip;
+	xfs_mount_t	*mp;
+	ssize_t		ret;
+	int		error = 0;
+	xfs_fsize_t	isize, new_size;
+	xfs_fsize_t	n, limit = XFS_MAX_FILE_OFFSET;
+	xfs_iocore_t	*io;
+	vnode_t		*vp;
+	int		iolock;
+	int		direct = file->f_flags & O_DIRECT;
+	int		eventsent = 0;
+	vrwlock_t	locktype;
+
+	XFS_STATS_INC(xfsstats.xs_write_calls);
+
+	vp = BHV_TO_VNODE(bdp);
+	xip = XFS_BHVTOI(bdp);
+
+	if (size == 0)
+		return 0;
+
+	io = &(xip->i_iocore);
+	mp = io->io_mount;
+
+	xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE);
+
+	if (XFS_FORCED_SHUTDOWN(xip->i_mount)) {
+		return -EIO;
+	}
+
+	if (direct) {
+		if (((__psint_t)buf & BBMASK) ||
+		    (*offset & mp->m_blockmask) ||
+		    (size  & mp->m_blockmask)) {
+			return -XFS_ERROR(EINVAL);
+		}
+		iolock = XFS_IOLOCK_SHARED;
+		locktype = VRWLOCK_WRITE_DIRECT;
+	} else {
+		iolock = XFS_IOLOCK_EXCL;
+		locktype = VRWLOCK_WRITE;
+	}
+
+	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
+	isize = xip->i_d.di_size;
+
+	if (file->f_flags & O_APPEND)
+		*offset = isize;
+
+start:
+	n = limit - *offset;
+	if (n <= 0) {
+		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+		return -EFBIG;
+	}
+	if (n < size)
+		size = n;
+
+	new_size = *offset + size;
+	if (new_size > isize) {
+		io->io_new_size = new_size;
+	}
+
+	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
+	    !(file->f_mode & FINVIS) && !eventsent)) {
+		loff_t		savedsize = *offset;
+
+		xfs_iunlock(xip, XFS_ILOCK_EXCL);
+		error = xfs_dm_send_data_event(DM_EVENT_WRITE, bdp,
+				*offset, size,
+				FILP_DELAY_FLAG(file), &locktype);
+		if (error) {
+			xfs_iunlock(xip, iolock);
+			return -error;
+		}
+		xfs_ilock(xip, XFS_ILOCK_EXCL);
+		eventsent = 1;
+
+		/*
+		 * The iolock was dropped and reacquired in
+		 * xfs_dm_send_data_event, so we have to recheck the
+		 * size when appending.  We will only "goto start;"
+		 * once, since having sent the event prevents another
+		 * call to xfs_dm_send_data_event, which is what allows
+		 * the size to change in the first place.
+		 */
+		if ((file->f_flags & O_APPEND) &&
+		    savedsize != xip->i_d.di_size) {
+			*offset = isize = xip->i_d.di_size;
+			goto start;
+		}
+	}
+
+	/*
+	 * On Linux, generic_file_write updates the times even if
+	 * no data is copied in so long as the write had a size.
+	 *
+	 * We must update the XFS inode times here, since a later
+	 * revalidate will copy the XFS values back over the Linux inode.
+	 */
+	if (size) {
+		if (!(file->f_mode & FINVIS))
+			xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	}
+
+	/*
+	 * If the offset is beyond the size of the file, we have a couple
+	 * of things to do. First, if there is already space allocated
+	 * we need to either create holes or zero the disk or ...
+	 *
+	 * If there is a page where the previous size lands, we need
+	 * to zero it out up to the new size.
+	 */
+
+	if (!direct && (*offset > isize && isize)) {
+		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
+			isize, *offset + size, NULL);
+		if (error) {
+			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+			return(-error);
+		}
+	}
+	xfs_iunlock(xip, XFS_ILOCK_EXCL);
+
+	/*
+	 * If we're writing the file then make sure to clear the
+	 * setuid and setgid bits if the process is not being run
+	 * by root.  This keeps people from modifying setuid and
+	 * setgid binaries.
+	 */
+
+	if (((xip->i_d.di_mode & ISUID) ||
+	    ((xip->i_d.di_mode & (ISGID | (IEXEC >> 3))) ==
+		(ISGID | (IEXEC >> 3)))) &&
+	     !capable(CAP_FSETID)) {
+		error = xfs_write_clear_setuid(xip);
+		if (error) {
+			xfs_iunlock(xip, iolock);
+			return -error;
+		}
+	}
+
+retry:
+	if (direct) {
+		xfs_inval_cached_pages(vp, &xip->i_iocore, *offset, 1, 1);
+	}
+
+	ret = do_generic_file_write(file, buf, size, offset);
+
+	if ((ret == -ENOSPC) &&
+	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
+	    !(file->f_mode & FINVIS)) {
+
+		xfs_rwunlock(bdp, locktype);
+		error = dm_send_namesp_event(DM_EVENT_NOSPACE, bdp,
+				DM_RIGHT_NULL, bdp, DM_RIGHT_NULL, NULL, NULL,
+				0, 0, 0); /* Delay flag intentionally  unused */
+		if (error)
+			return -error;
+		xfs_rwlock(bdp, locktype);
+		*offset = xip->i_d.di_size;
+		goto retry;
+
+	}
+
+	if (ret <= 0) {
+		xfs_rwunlock(bdp, locktype);
+		return ret;
+	}
+
+	XFS_STATS_ADD(xfsstats.xs_write_bytes, ret);
+
+	if (*offset > xip->i_d.di_size) {
+		xfs_ilock(xip, XFS_ILOCK_EXCL);
+		if (*offset > xip->i_d.di_size) {
+			struct inode	*inode = LINVFS_GET_IP(vp);
+
+			inode->i_size = xip->i_d.di_size = *offset;
+			xip->i_update_core = 1;
+			xip->i_update_size = 1;
+		}
+		xfs_iunlock(xip, XFS_ILOCK_EXCL);
+	}
+
+	/* Handle various SYNC-type writes */
+	if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {
+
+		/*
+		 * If we're treating this as O_DSYNC and we have not updated the
+		 * size, force the log.
+		 */
+
+		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
+			&& !(xip->i_update_size)) {
+			/*
+			 * If an allocation transaction occurred
+			 * without extending the size, then we have to force
+			 * the log up the proper point to ensure that the
+			 * allocation is permanent.  We can't count on
+			 * the fact that buffered writes lock out direct I/O
+			 * writes - the direct I/O write could have extended
+			 * the size nontransactionally, then finished before
+			 * we started.	xfs_write_file will think that the file
+			 * didn't grow but the update isn't safe unless the
+			 * size change is logged.
+			 *
+			 * Force the log if we've committed a transaction
+			 * against the inode or if someone else has and
+			 * the commit record hasn't gone to disk (e.g.
+			 * the inode is pinned).  This guarantees that
+			 * all changes affecting the inode are permanent
+			 * when we return.
+			 */
+
+			xfs_inode_log_item_t *iip;
+			xfs_lsn_t lsn;
+
+			iip = xip->i_itemp;
+			if (iip && iip->ili_last_lsn) {
+				lsn = iip->ili_last_lsn;
+				xfs_log_force(mp, lsn,
+						XFS_LOG_FORCE | XFS_LOG_SYNC);
+			} else if (xfs_ipincount(xip) > 0) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+						XFS_LOG_FORCE | XFS_LOG_SYNC);
+			}
+
+		} else {
+			xfs_trans_t	*tp;
+
+			/*
+			 * O_SYNC or O_DSYNC _with_ a size update are handled
+			 * the same way.
+			 *
+			 * If the write was synchronous then we need to make
+			 * sure that the inode modification time is permanent.
+			 * We'll have updated the timestamp above, so here
+			 * we use a synchronous transaction to log the inode.
+			 * It's not fast, but it's necessary.
+			 *
+			 * If this a dsync write and the size got changed
+			 * non-transactionally, then we need to ensure that
+			 * the size change gets logged in a synchronous
+			 * transaction.
+			 */
+
+			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
+			if ((error = xfs_trans_reserve(tp, 0,
+						      XFS_SWRITE_LOG_RES(mp),
+						      0, 0, 0))) {
+				/* Transaction reserve failed */
+				xfs_trans_cancel(tp, 0);
+			} else {
+				/* Transaction reserve successful */
+				xfs_ilock(xip, XFS_ILOCK_EXCL);
+				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(tp, xip);
+				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
+				xfs_trans_set_sync(tp);
+				error = xfs_trans_commit(tp, 0, (xfs_lsn_t)0);
+				xfs_iunlock(xip, XFS_ILOCK_EXCL);
+			}
+		}
+	} /* (file->f_flags & O_SYNC) */
+
+	/*
+	 * If we are coming from an nfsd thread then insert into the
+	 * reference cache.
+	 */
+
+	if (!strcmp(current->comm, "nfsd"))
+		xfs_refcache_insert(xip);
+
+	/* Drop lock this way - the old refcache release is in here */
+	xfs_rwunlock(bdp, locktype);
+
+	return(ret);
+}
+
+/*
+ * xfs_bmap() is the same as the IRIX xfs_bmap from xfs_rw.c,
+ * except for slight changes to the parameters.
+ */
+int
+xfs_bmap(bhv_desc_t	*bdp,
+	xfs_off_t	offset,
+	ssize_t		count,
+	int		flags,
+	struct cred	*cred,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps)
+{
+	xfs_inode_t	*ip;
+	int		error;
+	int		lockmode;
+	int		fsynced = 0;
+	vnode_t		*vp;
+
+	ip = XFS_BHVTOI(bdp);
+	ASSERT((ip->i_d.di_mode & IFMT) == IFREG);
+	ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
+	       ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
+	ASSERT((flags & PBF_READ) || (flags & PBF_WRITE));
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_iocore.io_mount))
+		return XFS_ERROR(EIO);
+
+	if (flags & PBF_READ) {
+		lockmode = xfs_ilock_map_shared(ip);
+		error = xfs_iomap_read(&ip->i_iocore, offset, count,
+				 XFS_BMAPI_ENTIRE, pbmapp, npbmaps, NULL);
+		xfs_iunlock_map_shared(ip, lockmode);
+	} else { /* PBF_WRITE */
+		ASSERT(flags & PBF_WRITE);
+		vp = BHV_TO_VNODE(bdp);
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		/*
+		 * Make sure that the dquots are there. This doesn't hold
+		 * the ilock across a disk read.
+		 */
+
+		if (XFS_IS_QUOTA_ON(ip->i_mount)) {
+			if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
+				if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_ILOCKED))) {
+					xfs_iunlock(ip, XFS_ILOCK_EXCL);
+					return XFS_ERROR(error);
+				}
+			}
+		}
+retry:
+		error = xfs_iomap_write(&ip->i_iocore, offset, count,
+					pbmapp, npbmaps, flags, NULL);
+		/* xfs_iomap_write unlocks/locks/unlocks */
+
+		if (error == ENOSPC) {
+			switch (fsynced) {
+			case 0:
+				if (ip->i_delayed_blks) {
+					fsync_inode_data_buffers(LINVFS_GET_IP(vp));
+					fsynced = 1;
+				} else {
+					fsynced = 2;
+					flags |= PBF_SYNC;
+				}
+				error = 0;
+				xfs_ilock(ip, XFS_ILOCK_EXCL);
+				goto retry;
+			case 1:
+				fsynced = 2;
+				if (!(flags & PBF_SYNC)) {
+					flags |= PBF_SYNC;
+					error = 0;
+					xfs_ilock(ip, XFS_ILOCK_EXCL);
+					goto retry;
+				}
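+				/* PBF_SYNC already set: fall through */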
+			case 2:
+				fsync_no_super(LINVFS_GET_IP(vp)->i_dev);
+				xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+						XFS_LOG_FORCE|XFS_LOG_SYNC);
+
+				error = 0;
+/**
+				delay(HZ);
+**/
+				fsynced++;
+				xfs_ilock(ip, XFS_ILOCK_EXCL);
+				goto retry;
+			}
+		}
+	}
+
+	return XFS_ERROR(error);
+}
+
+int
+xfs_strategy(bhv_desc_t *bdp,
+	xfs_off_t	offset,
+	ssize_t		count,
+	int		flags,
+	struct cred	*cred,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps)
+{
+	xfs_inode_t	*ip;
+	xfs_iocore_t	*io;
+	xfs_mount_t	*mp;
+	int		error;
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	end_fsb;
+	xfs_fileoff_t	map_start_fsb;
+	xfs_fileoff_t	last_block;
+	xfs_fsblock_t	first_block;
+	xfs_bmap_free_t free_list;
+	xfs_filblks_t	count_fsb;
+	int		committed, i, loops, nimaps;
+	int		is_xfs = 1; /* This will be a variable at some point */
+	xfs_bmbt_irec_t imap[XFS_MAX_RW_NBMAPS];
+	xfs_trans_t	*tp;
+
+	ip = XFS_BHVTOI(bdp);
+	io = &ip->i_iocore;
+	mp = ip->i_mount;
+	/* is_xfs = IO_IS_XFS(io); */
+	ASSERT((ip->i_d.di_mode & IFMT) == IFREG);
+	ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
+	       ((io->io_flags & XFS_IOCORE_RT) != 0));
+	ASSERT((flags & PBF_READ) || (flags & PBF_WRITE));
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	ASSERT(flags & PBF_WRITE);
+
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	nimaps = min(XFS_MAX_RW_NBMAPS, *npbmaps);
+	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	first_block = NULLFSBLOCK;
+
+	XFS_ILOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
+	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
+			(xfs_filblks_t)(end_fsb - offset_fsb),
+			XFS_BMAPI_ENTIRE, &first_block, 0, imap,
+			&nimaps, NULL);
+	XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
+	if (error) {
+		return XFS_ERROR(error);
+	}
+
+	if (nimaps && !ISNULLSTARTBLOCK(imap[0].br_startblock)) {
+		*npbmaps = _xfs_imap_to_bmap(&ip->i_iocore, offset, imap,
+				pbmapp, nimaps, *npbmaps);
+		return 0;
+	}
+
+	/*
+	 * Make sure that the dquots are there.
+	 */
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (XFS_NOT_DQATTACHED(mp, ip)) {
+			if ((error = xfs_qm_dqattach(ip, 0))) {
+				return XFS_ERROR(error);
+			}
+		}
+	}
+	XFS_STATS_ADD(xfsstats.xs_xstrat_bytes,
+		XFS_FSB_TO_B(mp, imap[0].br_blockcount));
+
+	offset_fsb = imap[0].br_startoff;
+	count_fsb = imap[0].br_blockcount;
+	map_start_fsb = offset_fsb;
+	while (count_fsb != 0) {
+		/*
+		 * Set up a transaction with which to allocate the
+		 * backing store for the file.	Do allocations in a
+		 * loop until we get some space in the range we are
+		 * interested in.  The other space that might be allocated
+		 * is in the delayed allocation extent on which we sit
+		 * but before our buffer starts.
+		 */
+		nimaps = 0;
+		loops = 0;
+		while (nimaps == 0) {
+			if (is_xfs) {
+				tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+				error = xfs_trans_reserve(tp, 0,
+						XFS_WRITE_LOG_RES(mp),
+						0, XFS_TRANS_PERM_LOG_RES,
+						XFS_WRITE_LOG_COUNT);
+				if (error) {
+					xfs_trans_cancel(tp, 0);
+					goto error0;
+				}
+				xfs_ilock(ip, XFS_ILOCK_EXCL);
+				xfs_trans_ijoin(tp, ip,
+						XFS_ILOCK_EXCL);
+				xfs_trans_ihold(tp, ip);
+			} else {
+				tp = NULL;
+				XFS_ILOCK(mp, io, XFS_ILOCK_EXCL |
+						XFS_EXTSIZE_WR);
+			}
+
+
+			/*
+			 * Allocate the backing store for the file.
+			 */
+			XFS_BMAP_INIT(&(free_list),
+					&(first_block));
+			nimaps = XFS_STRAT_WRITE_IMAPS;
+
+			/*
+			 * Ensure we don't go beyond eof - it is possible
+			 * the extents changed since we did the read call,
+			 * as we dropped the ilock in the interim.
+			 */
+
+			end_fsb = XFS_B_TO_FSB(mp, XFS_SIZE(mp, io));
+			xfs_bmap_last_offset(NULL, ip, &last_block,
+				XFS_DATA_FORK);
+			last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
+			if ((map_start_fsb + count_fsb) > last_block) {
+				count_fsb = last_block - map_start_fsb;
+				if (count_fsb == 0) {
+					if (is_xfs) {
+						xfs_bmap_cancel(&free_list);
+						xfs_trans_cancel(tp,
+						  (XFS_TRANS_RELEASE_LOG_RES |
+							 XFS_TRANS_ABORT));
+					}
+					XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL |
+							    XFS_EXTSIZE_WR);
+					return XFS_ERROR(EAGAIN);
+				}
+			}
+
+			error = XFS_BMAPI(mp, tp, io, map_start_fsb, count_fsb,
+					XFS_BMAPI_WRITE, &first_block, 1,
+					imap, &nimaps, &free_list);
+			if (error) {
+				xfs_bmap_cancel(&free_list);
+				xfs_trans_cancel(tp,
+					(XFS_TRANS_RELEASE_LOG_RES |
+					 XFS_TRANS_ABORT));
+				XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL |
+						    XFS_EXTSIZE_WR);
+
+				goto error0;
+			}
+
+			if (is_xfs) {
+				error = xfs_bmap_finish(&(tp), &(free_list),
+						first_block, &committed);
+				if (error) {
+					xfs_bmap_cancel(&free_list);
+					xfs_trans_cancel(tp,
+						(XFS_TRANS_RELEASE_LOG_RES |
+						XFS_TRANS_ABORT));
+					xfs_iunlock(ip, XFS_ILOCK_EXCL);
+					goto error0;
+				}
+
+				error = xfs_trans_commit(tp,
+						XFS_TRANS_RELEASE_LOG_RES,
+						NULL);
+				if (error) {
+					xfs_iunlock(ip, XFS_ILOCK_EXCL);
+					goto error0;
+				}
+			}
+
+			if (nimaps == 0) {
+				XFS_IUNLOCK(mp, io,
+						XFS_ILOCK_EXCL|XFS_EXTSIZE_WR);
+			} /* else hold 'till we maybe loop again below */
+		}
+
+		/*
+		 * See if we were able to allocate an extent that
+		 * covers at least part of the user's requested size.
+		 */
+
+		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+		for(i = 0; i < nimaps; i++) {
+			int maps;
+			if (offset_fsb >= imap[i].br_startoff &&
+				(offset_fsb < (imap[i].br_startoff + imap[i].br_blockcount))) {
+				XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL | XFS_EXTSIZE_WR);
+				maps = min(nimaps, *npbmaps);
+				*npbmaps = _xfs_imap_to_bmap(io, offset, &imap[i],
+					pbmapp, maps, *npbmaps);
+				XFS_STATS_INC(xfsstats.xs_xstrat_quick);
+				return 0;
+			}
+			count_fsb -= imap[i].br_blockcount; /* for next bmapi,
+								if needed. */
+		}
+
+		/*
+		 * We didn't get an extent the caller can write into so
+		 * loop around and try starting after the last imap we got back.
+		 */
+
+		nimaps--; /* Index of last entry  */
+		ASSERT(nimaps >= 0);
+		ASSERT(offset_fsb >= imap[nimaps].br_startoff + imap[nimaps].br_blockcount);
+		ASSERT(count_fsb);
+		offset_fsb = imap[nimaps].br_startoff + imap[nimaps].br_blockcount;
+		map_start_fsb = offset_fsb;
+		XFS_STATS_INC(xfsstats.xs_xstrat_split);
+		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_WR);
+	}
+
+	ASSERT(0);	/* Should never get here */
+
+ error0:
+	if (error) {
+		ASSERT(count_fsb != 0);
+		ASSERT(is_xfs || XFS_FORCED_SHUTDOWN(mp));
+	}
+
+	return XFS_ERROR(error);
+}
+
+
+STATIC int
+_xfs_imap_to_bmap(
+	xfs_iocore_t	*io,
+	xfs_off_t	offset,
+	xfs_bmbt_irec_t *imap,
+	pb_bmap_t	*pbmapp,
+	int		imaps,			/* Number of imap entries */
+	int		pbmaps)			/* Number of pbmap entries */
+{
+	xfs_mount_t	*mp;
+	xfs_fsize_t	nisize;
+	int		im, pbm;
+	xfs_fsblock_t	start_block;
+
+	mp = io->io_mount;
+	nisize = XFS_SIZE(mp, io);
+	if (io->io_new_size > nisize)
+		nisize = io->io_new_size;
+
+	for (im=0, pbm=0; im < imaps && pbm < pbmaps; im++,pbmapp++,imap++,pbm++) {
+		pbmapp->pbm_target = io->io_flags & XFS_IOCORE_RT ?
+			mp->m_rtdev_targp :
+			mp->m_ddev_targp;
+		pbmapp->pbm_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+		pbmapp->pbm_delta = offset - pbmapp->pbm_offset;
+		pbmapp->pbm_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
+		pbmapp->pbm_flags = 0;
+
+		start_block = imap->br_startblock;
+		if (start_block == HOLESTARTBLOCK) {
+			pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL;
+			pbmapp->pbm_flags = PBMF_HOLE;
+		} else if (start_block == DELAYSTARTBLOCK) {
+			pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL;
+			pbmapp->pbm_flags = PBMF_DELAY;
+		} else {
+			pbmapp->pbm_bn = XFS_FSB_TO_DB_IO(io, start_block);
+			if (imap->br_state == XFS_EXT_UNWRITTEN)
+				pbmapp->pbm_flags |= PBMF_UNWRITTEN;
+		}
+
+		if ((pbmapp->pbm_offset + pbmapp->pbm_bsize) >= nisize) {
+			pbmapp->pbm_flags |= PBMF_EOF;
+		}
+
+		offset += pbmapp->pbm_bsize - pbmapp->pbm_delta;
+	}
+	return(pbm);	/* Return the number filled */
+}
+
+STATIC int
+xfs_iomap_read(
+	xfs_iocore_t	*io,
+	loff_t		offset,
+	size_t		count,
+	int		flags,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps,
+	struct pm	*pmp)
+{
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	end_fsb;
+	int		nimaps;
+	int		error;
+	xfs_mount_t	*mp;
+	xfs_bmbt_irec_t imap[XFS_MAX_RW_NBMAPS];
+
+	ASSERT(ismrlocked(io->io_lock, MR_UPDATE | MR_ACCESS) != 0);
+/**	ASSERT(ismrlocked(io->io_iolock, MR_UPDATE | MR_ACCESS) != 0); **/
+/*	xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count); */
+
+	mp = io->io_mount;
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	nimaps = sizeof(imap) / sizeof(imap[0]);
+	nimaps = min(nimaps, *npbmaps); /* Don't ask for more than caller has */
+	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
+				(xfs_filblks_t)(end_fsb - offset_fsb),
+				flags, NULL, 0, imap,
+				&nimaps, NULL);
+	if (error) {
+		return XFS_ERROR(error);
+	}
+
+	if(nimaps) {
+		*npbmaps = _xfs_imap_to_bmap(io, offset, imap, pbmapp, nimaps,
+			*npbmaps);
+	} else
+		*npbmaps = 0;
+	return XFS_ERROR(error);
+}
+
+/*
+ * xfs_iomap_write: return pagebuf_bmap_t's telling higher layers
+ *	where to write.
+ * There are 2 main cases:
+ *	1. the extents already exist
+ *	2. we must allocate.
+ *	There are 3 cases when we allocate:
+ *		delayed allocation (doesn't really allocate or use transactions)
+ *		direct allocation (no previous delayed allocation)
+ *		converting delayed allocations to real allocations
+ */
+
+STATIC int
+xfs_iomap_write(
+	xfs_iocore_t	*io,
+	loff_t		offset,
+	size_t		count,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps,
+	int		ioflag,
+	struct pm	*pmp)
+{
+	int		maps;
+	int		error = 0;
+	int		found;
+	int		flags = 0;
+
+	maps = *npbmaps;
+	if (!maps)
+		goto out;
+
+	/*
+	 * If we have extents that are allocated for this range,
+	 * return them.
+	 */
+
+	found = 0;
+	error = xfs_iomap_read(io, offset, count, flags, pbmapp, npbmaps, NULL);
+	if (error)
+		goto out;
+
+	/*
+	 * If we found mappings and they can just have data written
+	 * without conversion,
+	 * let the caller write these and call us again.
+	 *
+	 * If we have a HOLE or UNWRITTEN, proceed down lower to
+	 * get the space or to convert to written.
+	 */
+
+	if (*npbmaps) {
+		if (!(pbmapp->pbm_flags & PBMF_HOLE)) {
+			*npbmaps = 1; /* Only checked the first one. */
+					/* We could check more, ... */
+			goto out;
+		}
+	}
+	found = *npbmaps;
+	*npbmaps = maps; /* Restore to original requested */
+
+	if (ioflag & PBF_DIRECT) {
+		error = xfs_iomap_write_direct(io, offset, count, pbmapp,
+					npbmaps, ioflag, found);
+	} else {
+		error = xfs_iomap_write_delay(io, offset, count, pbmapp,
+				npbmaps, ioflag, found);
+	}
+
+out:
+	XFS_IUNLOCK(io->io_mount, io, XFS_ILOCK_EXCL);
+	return XFS_ERROR(error);
+}
+
+/*
+ * Map the given I/O size and I/O alignment over the given extent.
+ * If we're at the end of the file and the underlying extent is
+ * delayed alloc, make sure we extend out to the
+ * next m_writeio_blocks boundary.  Otherwise make sure that we
+ * are confined to the given extent.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_write_bmap(
+	xfs_mount_t	*mp,
+	xfs_iocore_t	*io,
+	xfs_bmbt_irec_t *imapp,
+	pb_bmap_t	*pbmapp,
+	int		iosize,
+	xfs_fileoff_t	ioalign,
+	xfs_fsize_t	isize)
+{
+	__int64_t	extra_blocks;
+	xfs_fileoff_t	size_diff;
+	xfs_fileoff_t	ext_offset;
+	xfs_fsblock_t	start_block;
+	int		length;		/* length of this mapping in blocks */
+	xfs_off_t	offset;		/* logical block offset of this mapping */
+
+	if (ioalign < imapp->br_startoff) {
+		/*
+		 * The desired alignment doesn't end up on this
+		 * extent.  Move up to the beginning of the extent.
+		 * Subtract whatever we drop from the iosize so that
+		 * we stay aligned on iosize boundaries.
+		 */
+		size_diff = imapp->br_startoff - ioalign;
+		iosize -= (int)size_diff;
+		ASSERT(iosize > 0);
+		ext_offset = 0;
+		offset = imapp->br_startoff;
+		pbmapp->pbm_offset = XFS_FSB_TO_B(mp, imapp->br_startoff);
+	} else {
+		/*
+		 * The alignment requested fits on this extent,
+		 * so use it.
+		 */
+		ext_offset = ioalign - imapp->br_startoff;
+		offset = ioalign;
+		pbmapp->pbm_offset = XFS_FSB_TO_B(mp, ioalign);
+	}
+	start_block = imapp->br_startblock;
+	ASSERT(start_block != HOLESTARTBLOCK);
+	if (start_block != DELAYSTARTBLOCK) {
+		pbmapp->pbm_bn = XFS_FSB_TO_DB_IO(io, start_block + ext_offset);
+		if (imapp->br_state == XFS_EXT_UNWRITTEN) {
+			pbmapp->pbm_flags = PBMF_UNWRITTEN;
+		}
+	} else {
+		pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL;
+		pbmapp->pbm_flags = PBMF_DELAY;
+	}
+	pbmapp->pbm_target = io->io_flags & XFS_IOCORE_RT ?
+		mp->m_rtdev_targp :
+		mp->m_ddev_targp;
+	length = iosize;
+
+	/*
+	 * If the iosize from our offset extends beyond the end of
+	 * the extent, then trim down length to match that of the extent.
+	 */
+	extra_blocks = (xfs_off_t)(offset + length) -
+		       (__uint64_t)(imapp->br_startoff +
+				    imapp->br_blockcount);
+	if (extra_blocks > 0) {
+		length -= extra_blocks;
+		ASSERT(length > 0);
+	}
+
+	pbmapp->pbm_bsize = XFS_FSB_TO_B(mp, length);
+}
+
+STATIC int
+xfs_iomap_write_delay(
+	xfs_iocore_t	*io,
+	loff_t		offset,
+	size_t		count,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps,
+	int		ioflag,
+	int		found)
+{
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	ioalign;
+	xfs_fileoff_t	last_fsb;
+	xfs_fileoff_t	start_fsb;
+	xfs_filblks_t	count_fsb;
+	xfs_off_t	aligned_offset;
+	xfs_fsize_t	isize;
+	xfs_fsblock_t	firstblock;
+	__uint64_t	last_page_offset;
+	int		nimaps;
+	int		error;
+	int		n;
+	unsigned int	iosize;
+	short		small_write;
+	xfs_mount_t	*mp;
+#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
+	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
+	int		aeof;
+
+	ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
+
+/*	xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count); */
+
+	mp = io->io_mount;
+/***
+	ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
+***/
+
+	isize = XFS_SIZE(mp, io);
+	if (io->io_new_size > isize) {
+		isize = io->io_new_size;
+	}
+
+	aeof = 0;
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	/*
+	 * If the caller is doing a write at the end of the file,
+	 * then extend the allocation (and the buffer used for the write)
+	 * out to the file system's write iosize.  We clean up any extra
+	 * space left over when the file is closed in xfs_inactive().
+	 * We can only do this if we are sure that we will create buffers
+	 * over all of the space we allocate beyond the end of the file.
+	 * Not doing so would allow us to create delalloc blocks with
+	 * no pages in memory covering them.  So, we need to check that
+	 * there are not any real blocks in the area beyond the end of
+	 * the file which we are optimistically going to preallocate. If
+	 * there are then our buffers will stop when they encounter them
+	 * and we may accidentally create delalloc blocks beyond them
+	 * that we never cover with a buffer.  All of this is because
+	 * we are not actually going to write the extra blocks preallocated
+	 * at this point.
+	 *
+	 * We don't bother with this for sync writes, because we need
+	 * to minimize the amount we write for good performance.
+	 */
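+	/*
+	 * The loop below probes the m_writeio_blocks beyond EOF for real
+	 * allocations; only when none are found is last_fsb extended.
+	 */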
+	if (!(ioflag & PBF_SYNC) && ((offset + count) > XFS_SIZE(mp, io))) {
+		start_fsb = XFS_B_TO_FSBT(mp,
+				  ((xfs_ufsize_t)(offset + count - 1)));
+		count_fsb = mp->m_writeio_blocks;
+		while (count_fsb > 0) {
+			nimaps = XFS_WRITE_IMAPS;
+			error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
+					  0, NULL, 0, imap, &nimaps,
+					  NULL);
+			if (error) {
+				return error;
+			}
+			for (n = 0; n < nimaps; n++) {
+				if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
+				    (imap[n].br_startblock != DELAYSTARTBLOCK)) {
+					goto write_map;
+				}
+				start_fsb += imap[n].br_blockcount;
+				count_fsb -= imap[n].br_blockcount;
+				ASSERT(count_fsb < 0xffff000);
+			}
+		}
+		iosize = mp->m_writeio_blocks;
+		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
+		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+		last_fsb = ioalign + iosize;
+		aeof = 1;
+	}
+ write_map:
+	nimaps = XFS_WRITE_IMAPS;
+	firstblock = NULLFSBLOCK;
+
+	/*
+	 * Round up the allocation request to an m_dalign boundary if the
+	 * file size is at least the stripe unit and we are allocating
+	 * past the allocation eof.
+	 */
+	if (mp->m_dalign && (XFS_SIZE(mp, io) >= mp->m_dalign) && aeof) {
+		int eof;
+		xfs_fileoff_t new_last_fsb;
+		new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
+		error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
+		if (error) {
+			return error;
+		}
+		if (eof) {
+			last_fsb = new_last_fsb;
+		}
+	}
+
+	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
+			  (xfs_filblks_t)(last_fsb - offset_fsb),
+			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
+			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
+			  &nimaps, NULL);
+	/*
+	 * This can be EDQUOT, if nimaps == 0
+	 */
+	if (error) {
+		return XFS_ERROR(error);
+	}
+	/*
+	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
+	 * then we must have run out of space.
+	 */
+	if (nimaps == 0) {
+/*		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
+				      io, offset, count); */
+		return XFS_ERROR(ENOSPC);
+	}
+
+	if (!(ioflag & PBF_SYNC) ||
+	    ((last_fsb - offset_fsb) >= mp->m_writeio_blocks)) {
+		/*
+		 * For normal or large sync writes, align everything
+		 * into i_writeio_blocks sized chunks.
+		 */
+		iosize = mp->m_writeio_blocks;
+		aligned_offset = XFS_WRITEIO_ALIGN(mp, offset);
+		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+		small_write = 0;
+		/* XXX - Are we shrinking? XXXXX  */
+	} else {
+		/*
+		 * For small sync writes try to minimize the amount
+		 * of I/O we do.  Round down and up to the larger of
+		 * page or block boundaries.  Set the small_write
+		 * variable to 1 to indicate to the code below that
+		 * we are not using the normal buffer alignment scheme.
+		 */
+		if (NBPP > mp->m_sb.sb_blocksize) {
+			aligned_offset = ctooff(offtoct(offset));
+			ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+			last_page_offset = ctob64(btoc64(offset + count));
+			iosize = XFS_B_TO_FSBT(mp, last_page_offset -
+					       aligned_offset);
+		} else {
+			ioalign = offset_fsb;
+			iosize = last_fsb - offset_fsb;
+		}
+		small_write = 1;
+		/* XXX - Are we shrinking? XXXXX  */
+	}
+
+	/*
+	 * Now map our desired I/O size and alignment over the
+	 * extents returned by xfs_bmapi().
+	 */
+	xfs_write_bmap(mp, io, imap, pbmapp, iosize, ioalign, isize);
+	pbmapp->pbm_delta = offset - pbmapp->pbm_offset;
+
+	ASSERT((pbmapp->pbm_bsize > 0)
+		&& (pbmapp->pbm_bsize - pbmapp->pbm_delta > 0));
+
+	/*
+	 * A bmap is the EOF bmap when it extends to or beyond the new
+	 * inode size.
+	 */
+	if ((pbmapp->pbm_offset + pbmapp->pbm_bsize) >= isize) {
+		pbmapp->pbm_flags |= PBMF_EOF;
+	}
+
+/*	xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP,
+			    io, offset, count, bmapp, imap);	     */
+
+	/*
+	 * On IRIX, we walk more imaps, filling in more bmaps.  On Linux,
+	 * just handle one for now.  To find the code on IRIX, look in
+	 * xfs_iomap_write() in xfs_rw.c.
+	 */
+
+	*npbmaps = 1;
+	return 0;
+}
+
+STATIC int
+xfs_iomap_write_direct(
+	xfs_iocore_t	*io,
+	loff_t		offset,
+	size_t		count,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps,
+	int		ioflag,
+	int		found)
+{
+	xfs_inode_t	*ip = XFS_IO_INODE(io);
+	xfs_mount_t	*mp;
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	last_fsb;
+	xfs_filblks_t	count_fsb;
+	xfs_fsize_t	isize;
+	xfs_fsblock_t	firstfsb;
+	int		nimaps, maps;
+	int		error;
+	xfs_trans_t	*tp;
+
+#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
+	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
+	xfs_bmap_free_t free_list;
+	int		aeof;
+	int		bmapi_flags;
+	xfs_filblks_t	datablocks;
+	int		rt;
+	int		committed;
+	int		numrtextents;
+	uint		resblks;
+	int		rtextsize;
+
+	maps = min(XFS_WRITE_IMAPS, *npbmaps);
+	nimaps = maps;
+
+	mp = io->io_mount;
+	isize = XFS_SIZE(mp, io);
+	if (io->io_new_size > isize)
+		isize = io->io_new_size;
+
+	if ((offset + count) > isize) {
+		aeof = 1;
+	} else {
+		aeof = 0;
+	}
+
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	count_fsb = last_fsb - offset_fsb;
+	if (found && (pbmapp->pbm_flags & PBMF_HOLE)) {
+		xfs_fileoff_t	map_last_fsb;
+		map_last_fsb = XFS_B_TO_FSB(mp,
+			(pbmapp->pbm_bsize + pbmapp->pbm_offset));
+
+		if (map_last_fsb < last_fsb) {
+			last_fsb = map_last_fsb;
+			count_fsb = last_fsb - offset_fsb;
+		}
+		ASSERT(count_fsb > 0);
+	}
+
+	/*
+	 * Round up the allocation request to an m_dalign boundary if the
+	 * file size is at least 512K and we are allocating past the
+	 * allocation eof.
+	 */
+	if (!found && mp->m_dalign && (isize >= 524288) && aeof) {
+		int eof;
+		xfs_fileoff_t new_last_fsb;
+		new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
+		printk("xfs_iomap_write_direct: about to XFS_BMAP_EOF %Ld\n",
+			new_last_fsb);
+		error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
+		if (error) {
+			goto error_out;
+		}
+		if (eof)
+			last_fsb = new_last_fsb;
+	}
+
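+	/*
+	 * Note: XFS_BMAPI_DIRECT_IO is set and then cleared again below,
+	 * so the direct I/O hint is not actually passed to bmapi.
+	 */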
+	bmapi_flags = XFS_BMAPI_WRITE|XFS_BMAPI_DIRECT_IO|XFS_BMAPI_ENTIRE;
+	bmapi_flags &= ~XFS_BMAPI_DIRECT_IO;
+
+	/*
+	 * determine if this is a realtime file
+	 */
+	if ((rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) != 0) {
+		rtextsize = mp->m_sb.sb_rextsize;
+	} else
+		rtextsize = 0;
+
+	error = 0;
+
+	/*
+	 * allocate file space for the bmapp entries passed in.
+	 */
+
+	/*
+	 * determine if reserving space on
+	 * the data or realtime partition.
+	 */
+	if (rt) {
+		numrtextents = (count_fsb + rtextsize - 1);
+		do_div(numrtextents, rtextsize);
+		datablocks = 0;
+	} else {
+		datablocks = count_fsb;
+		numrtextents = 0;
+	}
+
+	/*
+	 * allocate and setup the transaction
+	 */
+	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	error = xfs_trans_reserve(tp,
+				  resblks,
+				  XFS_WRITE_LOG_RES(mp),
+				  numrtextents,
+				  XFS_TRANS_PERM_LOG_RES,
+				  XFS_WRITE_LOG_COUNT);
+
+	/*
+	 * check for running out of space
+	 */
+	if (error) {
+		/*
+		 * Free the transaction structure.
+		 */
+		xfs_trans_cancel(tp, 0);
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	if (error) {
+		/*
+		 * We could not return from the transaction-reserve failure
+		 * above: the inode lock must be reacquired before returning.
+		 */
+		goto error_out;
+	}
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (xfs_trans_reserve_quota(tp,
+					    ip->i_udquot,
+					    ip->i_gdquot,
+					    resblks, 0, 0)) {
+			error = (EDQUOT);
+			goto error1;
+		}
+		nimaps = 1;
+	} else {
+		nimaps = 2;
+	}
+
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+
+	/*
+	 * issue the bmapi() call to allocate the blocks
+	 */
+	XFS_BMAP_INIT(&free_list, &firstfsb);
+	imapp = &imap[0];
+	error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb,
+		bmapi_flags, &firstfsb, 1, imapp, &nimaps, &free_list);
+	if (error) {
+		goto error0;
+	}
+
+	/*
+	 * complete the transaction
+	 */
+
+	error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
+	if (error) {
+		goto error0;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (error) {
+		goto error_out;
+	}
+
+	/* copy any maps to caller's array and return any error. */
+	if (nimaps == 0) {
+		error = (ENOSPC);
+		goto error_out;
+	}
+
+	maps = min(nimaps, maps);
+	*npbmaps = _xfs_imap_to_bmap(io, offset, &imap[0], pbmapp, maps,
+								*npbmaps);
+	if (*npbmaps) {
+		/*
+		 * This mapping is new, since xfs_iomap_read
+		 * didn't find it.
+		 */
+		if (*npbmaps != 1) {
+			printk("NEED MORE WORK FOR MULTIPLE BMAPS (which are new)\n");
+		}
+	}
+	goto out;
+
+ error0:	/* Cancel bmap, unlock inode, and cancel trans */
+	xfs_bmap_cancel(&free_list);
+
+ error1:	/* Just cancel transaction */
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	*npbmaps = 0;	/* nothing set-up here */
+
+error_out:
+out:	/* Just return error and any tracing at end of routine */
+	return XFS_ERROR(error);
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer that has been prematurely
+ * unpinned as part of a forced shutdown of the filesystem.
+ */
+int
+xfs_bdstrat_cb(struct xfs_buf *bp)
+{
+	xfs_mount_t	*mp;
+
+	mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
+	if (!XFS_FORCED_SHUTDOWN(mp)) {
+		pagebuf_iorequest(bp);
+		return 0;
+	} else {
+		xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
+		/*
+		 * Metadata write that didn't get logged but was
+		 * written delayed anyway.  These aren't associated
+		 * with a transaction, and can be ignored.
+		 */
+		if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
+		    (XFS_BUF_ISREAD(bp)) == 0)
+			return (xfs_bioerror_relse(bp));
+		else
+			return (xfs_bioerror(bp));
+	}
+}
+/*
+ * Wrapper around bdstrat so that we can stop data
+ * from going to disk in case we are shutting down the filesystem.
+ * Typically user data goes through this path; one of the exceptions
+ * is the superblock.
+ */
+int
+xfsbdstrat(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	ASSERT(mp);
+	if (!XFS_FORCED_SHUTDOWN(mp)) {
+		/* Grio redirection would go here
+		 * if (XFS_BUF_IS_GRIO(bp)) {
+		 */
+
+		pagebuf_iorequest(bp);
+		return 0;
+	}
+
+	xfs_buftrace("XFSBDSTRAT IOERROR", bp);
+	return (xfs_bioerror_relse(bp));
+}
+
+
+void
+XFS_bflush(xfs_buftarg_t *target)
+{
+	pagebuf_delwri_flush(target, PBDF_WAIT, NULL);
+}
+
+
+/*
+ * Push all fs state out to disk.
+ */
+
+void
+XFS_log_write_unmount_ro(bhv_desc_t	*bdp)
+{
+	xfs_mount_t	*mp;
+	int pincount = 0;
+	int count = 0;
+	int error;
+
+	mp = XFS_BHVTOM(bdp);
+	xfs_refcache_purge_mp(mp);
+	xfs_binval(mp->m_ddev_targp);
+
+	do {
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+		VFS_SYNC(XFS_MTOVFS(mp), SYNC_ATTR|SYNC_WAIT, NULL, error);
+		pagebuf_delwri_flush(mp->m_ddev_targp,
+				PBDF_WAIT, &pincount);
+		if (pincount == 0) {
+			delay(50);
+			count++;
+		}
+	} while (count < 2);
+
+	/* Ok now write out an unmount record */
+	xfs_log_unmount_write(mp);
+	xfs_unmountfs_writesb(mp);
+}
+
+/*
+ * In these two situations (log recovery and quotacheck, below) we
+ * disregard the readonly mount flag and temporarily enable writes
+ * (we must, to ensure metadata integrity).
+ */
+STATIC int
+xfs_is_read_only(xfs_mount_t *mp)
+{
+	if (is_read_only(mp->m_dev) ||
+	    is_read_only(mp->m_logdev_targp->pbr_dev)) {
+		cmn_err(CE_NOTE,
+			"XFS: write access unavailable, cannot proceed.");
+		return EROFS;
+	}
+	cmn_err(CE_NOTE,
+		"XFS: write access will be enabled during mount.");
+	XFS_MTOVFS(mp)->vfs_flag &= ~VFS_RDONLY;
+	return 0;
+}
+
+int
+xfs_recover_read_only(xlog_t *log)
+{
+	cmn_err(CE_NOTE, "XFS: WARNING: "
+		"recovery required on readonly filesystem.");
+	return xfs_is_read_only(log->l_mp);
+}
+
+int
+xfs_quotacheck_read_only(xfs_mount_t *mp)
+{
+	cmn_err(CE_NOTE, "XFS: WARNING: "
+		"quotacheck required on readonly filesystem.");
+	return xfs_is_read_only(mp);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_lrw.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_lrw.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_lrw.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_lrw.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef __XFS_LRW_H__
+#define __XFS_LRW_H__
+
+#define XFS_IOMAP_READ_ENTER	3
+/*
+ * Maximum count of bmaps used by read and write paths.
+ */
+#define XFS_MAX_RW_NBMAPS	4
+
+extern int xfs_bmap (bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, pb_bmap_t *, int *);
+extern int xfs_strategy (bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, pb_bmap_t *, int *);
+extern int xfsbdstrat (struct xfs_mount *, struct xfs_buf *);
+extern int xfs_bdstrat_cb (struct xfs_buf *);
+
+extern int xfs_zero_eof (vnode_t *, struct xfs_iocore *, xfs_off_t,
+				xfs_fsize_t, xfs_fsize_t, struct pm *);
+extern ssize_t xfs_read (
+	struct bhv_desc	       *bdp,
+	struct file		*file,
+	char			*buf,
+	size_t			size,
+	loff_t			*offset,
+	struct cred	       *credp);
+
+extern ssize_t xfs_write (
+	struct bhv_desc		*bdp,
+	struct file		*file,
+	const char		*buf,
+	size_t			size,
+	loff_t			*offset,
+	struct cred		*credp);
+
+extern int xfs_recover_read_only (xlog_t *);
+extern int xfs_quotacheck_read_only (xfs_mount_t *);
+
+extern void XFS_log_write_unmount_ro (bhv_desc_t *);
+
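+/*
+ * Convert a filesystem block number to a disk (basic block) address,
+ * using the realtime device mapping when the iocore is realtime.
+ */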
+#define XFS_FSB_TO_DB_IO(io,fsb) \
+		(((io)->io_flags & XFS_IOCORE_RT) ? \
+		 XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \
+		 XFS_FSB_TO_DADDR((io)->io_mount, (fsb)))
+
+#endif	/* __XFS_LRW_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_seq.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_seq.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_seq.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_seq.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,31 @@
+#include <xfs.h>
+#include <linux/seq_file.h>
+
+static void *stats_start(struct seq_file *s, loff_t *pos)
+{
+	if (*pos >= ARRAY_SIZE(xstats))
+		return NULL;
+	return xstats + *pos;
+}
+
+static void *stats_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return stats_start(m, pos);
+}
+
+static void stats_stop(struct seq_file *m, void *v)
+{
+	/* nothing to release */
+}
+
+static int stats_show(struct seq_file *m, void *v)
+{
+	struct xstats_entry *e = v;
+	int i;
+
+	seq_printf(m, "%s", e->desc);
+	for (i = e->start; i < e->end; i++)
+		seq_printf(m, " %u", *(((__u32*)&xfsstats) + i));
+	seq_printf(m, "\n");
+	return 0;
+}
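+
+/*
+ * A minimal sketch of how these callbacks could be wired together for
+ * seq_file use; the xstats table and its entry type are assumed to be
+ * defined elsewhere.
+ */
+static struct seq_operations stats_sops = {
+	.start	= stats_start,
+	.next	= stats_next,
+	.stop	= stats_stop,
+	.show	= stats_show,
+};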
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_stats.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_stats.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_stats.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_stats.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/proc_fs.h>
+
+static int
+xfs_read_xfsstats(char *buffer, char **start, off_t offset,
+			int count, int *eof, void *data)
+{
+	int	i, j, len;
+	static struct xstats_entry {
+		char	*desc;
+		int	endpoint;
+	} xstats[] = {
+		{ "extent_alloc",	XFSSTAT_END_EXTENT_ALLOC	},
+		{ "abt",		XFSSTAT_END_ALLOC_BTREE		},
+		{ "blk_map",		XFSSTAT_END_BLOCK_MAPPING	},
+		{ "bmbt",		XFSSTAT_END_BLOCK_MAP_BTREE	},
+		{ "dir",		XFSSTAT_END_DIRECTORY_OPS	},
+		{ "trans",		XFSSTAT_END_TRANSACTIONS	},
+		{ "ig",			XFSSTAT_END_INODE_OPS		},
+		{ "log",		XFSSTAT_END_LOG_OPS		},
+		{ "push_ail",		XFSSTAT_END_TAIL_PUSHING	},
+		{ "xstrat",		XFSSTAT_END_WRITE_CONVERT	},
+		{ "rw",			XFSSTAT_END_READ_WRITE_OPS	},
+		{ "attr",		XFSSTAT_END_ATTRIBUTE_OPS	},
+		{ "qm",			XFSSTAT_END_QUOTA_OPS		},
+		{ "icluster",		XFSSTAT_END_INODE_CLUSTER	},
+		{ "vnodes",		XFSSTAT_END_VNODE_OPS		},
+	};
+
+	for (i = j = len = 0; i < sizeof(xstats)/sizeof(struct xstats_entry); i++) {
+		len += sprintf(buffer + len, "%s", xstats[i].desc);
+		/* inner loop does each group */
+		while (j < xstats[i].endpoint) {
+			len += sprintf(buffer + len, " %u",
+					*(((__u32*)&xfsstats) + j));
+			j++;
+		}
+		buffer[len++] = '\n';
+	}
+	/* extra precision counters */
+	len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
+			xfsstats.xs_xstrat_bytes,
+			xfsstats.xs_write_bytes,
+			xfsstats.xs_read_bytes);
+
+	if (offset >= len) {
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)
+		return count;
+	*eof = 1;
+
+	return len;
+}
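+
+/*
+ * Illustrative output (field counts per group vary):
+ *	$ cat /proc/fs/xfs/stat
+ *	extent_alloc 0 0 ...
+ *	...
+ *	xpc 0 0 0
+ */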
+
+static int
+xfs_read_xfsquota(char *buffer, char **start, off_t offset,
+			int count, int *eof, void *data)
+{
+	int	len;
+
+	/* maximum; incore; ratio free to inuse; freelist */
+	len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
+			ndquot,
+			xfs_Gqm ? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
+			xfs_Gqm ? xfs_Gqm->qm_dqfree_ratio : 0,
+			xfs_Gqm ? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
+
+	if (offset >= len) {
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)
+		return count;
+	*eof = 1;
+
+	return len;
+}
+
+void
+xfs_init_procfs(void)
+{
+	if (!proc_mkdir("fs/xfs", NULL))
+		return;
+	create_proc_read_entry("fs/xfs/stat", 0, 0, xfs_read_xfsstats, NULL);
+	create_proc_read_entry("fs/xfs/xqm", 0, 0, xfs_read_xfsquota, NULL);
+}
+
+void
+xfs_cleanup_procfs(void)
+{
+	remove_proc_entry("fs/xfs/stat", NULL);
+	remove_proc_entry("fs/xfs/xqm", NULL);
+	remove_proc_entry("fs/xfs", NULL);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_stats.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_stats.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_stats.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_stats.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_STATS_H__
+#define __XFS_STATS_H__
+
+/*
+ * procfs interface
+ */
+#ifdef CONFIG_PROC_FS
+extern void xfs_init_procfs(void);
+extern void xfs_cleanup_procfs(void);
+#else
+static __inline void xfs_init_procfs(void) { };
+static __inline void xfs_cleanup_procfs(void) { };
+#endif
+
+#endif /* __XFS_STATS_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_super.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_super.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_super.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_super.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,935 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/ctype.h>
+#include <linux/seq_file.h>
+#include "xfs_version.h"
+
+/* xfs_vfs[ops].c */
+extern int  xfs_init(void);
+extern void xfs_cleanup(void);
+
+/* For kernels which have the s_maxbytes field - set it */
+#ifdef MAX_NON_LFS
+# define set_max_bytes(sb)	((sb)->s_maxbytes = XFS_MAX_FILE_OFFSET)
+#else
+# define set_max_bytes(sb)	do { } while (0)
+#endif
+
+#ifdef CONFIG_FS_POSIX_ACL
+# define set_posix_acl(sb)	((sb)->s_flags |= MS_POSIXACL)
+#else
+# define set_posix_acl(sb)	do { } while (0)
+#endif
+
+#ifdef CONFIG_XFS_QUOTA
+static struct quotactl_ops linvfs_qops = {
+	.get_xstate		= linvfs_getxstate,
+	.set_xstate		= linvfs_setxstate,
+	.get_xquota		= linvfs_getxquota,
+	.set_xquota		= linvfs_setxquota,
+};
+# define set_quota_ops(sb)	((sb)->s_qcop = &linvfs_qops)
+#else
+# define set_quota_ops(sb)	do { } while (0)
+#endif
+
+#ifdef CONFIG_XFS_DMAPI
+int dmapi_init(void);
+void dmapi_uninit(void);
+#else
+#define dmapi_init()
+#define dmapi_uninit()
+#endif
+
+static struct super_operations linvfs_sops;
+
+#define MNTOPT_LOGBUFS	"logbufs"	/* number of XFS log buffers */
+#define MNTOPT_LOGBSIZE "logbsize"	/* size of XFS log buffers */
+#define MNTOPT_LOGDEV	"logdev"	/* log device */
+#define MNTOPT_RTDEV	"rtdev"		/* realtime I/O device */
+#define MNTOPT_DMAPI	"dmapi"		/* DMI enabled (DMAPI / XDSM) */
+#define MNTOPT_XDSM	"xdsm"		/* DMI enabled (DMAPI / XDSM) */
+#define MNTOPT_BIOSIZE	"biosize"	/* log2 of preferred buffered io size */
+#define MNTOPT_WSYNC	"wsync"		/* safe-mode nfs compatible mount */
+#define MNTOPT_INO64	"ino64"		/* force inodes into 64-bit range */
+#define MNTOPT_NOALIGN	"noalign"	/* turn off stripe alignment */
+#define MNTOPT_SUNIT	"sunit"		/* data volume stripe unit */
+#define MNTOPT_SWIDTH	"swidth"	/* data volume stripe width */
+#define MNTOPT_NORECOVERY "norecovery"	/* don't run XFS recovery */
+#define MNTOPT_OSYNCISDSYNC "osyncisdsync" /* o_sync == o_dsync on this fs */
+					   /* (this is now the default!) */
+#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
+#define MNTOPT_QUOTA	"quota"		/* disk quotas */
+#define MNTOPT_MRQUOTA	"mrquota"	/* don't turnoff if SB has quotas on */
+#define MNTOPT_NOQUOTA	"noquota"	/* no quotas */
+#define MNTOPT_UQUOTA	"usrquota"	/* user quota enabled */
+#define MNTOPT_GQUOTA	"grpquota"	/* group quota enabled */
+#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
+#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
+#define MNTOPT_QUOTANOENF  "qnoenforce" /* same as uqnoenforce */
+#define MNTOPT_NOUUID	"nouuid"	/* Ignore FS uuid */
+#define MNTOPT_IRIXSGID "irixsgid"	/* Irix-style sgid inheritance */
+#define MNTOPT_NOLOGFLUSH  "nologflush"	/* Don't use hard flushes in
+					   log writing */
+#define MNTOPT_MTPT	"mtpt"		/* filesystem mount point */
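+
+/*
+ * Illustrative use of several of the options above (hypothetical
+ * devices):
+ *	mount -t xfs -o logbufs=8,logbsize=32k,logdev=/dev/sdb1 \
+ *		/dev/sda1 /mnt
+ */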
+
+STATIC int
+xfs_parseargs(
+	char		*options,
+	int		flags,
+	struct xfs_mount_args *args)
+{
+	char		*this_char, *value, *eov;
+	int		logbufs = -1;
+	int		logbufsize = -1;
+	int		dsunit, dswidth, vol_dsunit, vol_dswidth;
+	int		iosize;
+	int		rval = 1;	/* failure is default */
+
+	iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
+	memset(args, 0, sizeof(struct xfs_mount_args));
+	args->slcount = args->stimeout = args->ctimeout = -1;
+	args->mtpt[0] = '\0';
+
+	/* Copy the already-parsed mount(2) flags we're interested in */
+	if (flags & MS_NOATIME)
+		args->flags |= XFSMNT_NOATIME;
+
+	if (!options) {
+		args->logbufs = logbufs;
+		args->logbufsize = logbufsize;
+		return 0;
+	}
+
+	while ((this_char = strsep(&options, ",")) != NULL) {
+		if (!*this_char)
+			continue;
+		if ((value = strchr(this_char, '=')) != NULL)
+			*value++ = 0;
+
+		if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
+			logbufs = simple_strtoul(value, &eov, 10);
+		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
+			int in_kilobytes = 0;
+
+			if (toupper(value[strlen(value)-1]) == 'K') {
+				in_kilobytes = 1;
+				value[strlen(value)-1] = '\0';
+			}
+			logbufsize = simple_strtoul(value, &eov, 10);
+			if (in_kilobytes)
+				logbufsize = logbufsize * 1024;
+		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
+			strncpy(args->logname, value, MAXNAMELEN);
+		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
+			strncpy(args->mtpt, value, MAXNAMELEN);
+#ifdef CONFIG_XFS_DMAPI
+		} else if (!strcmp(this_char, MNTOPT_DMAPI)) {
+			args->flags |= XFSMNT_DMAPI;
+		} else if (!strcmp(this_char, MNTOPT_XDSM)) {
+			args->flags |= XFSMNT_DMAPI;
+#else
+		} else if (!strcmp(this_char, MNTOPT_DMAPI) ||
+			   !strcmp(this_char, MNTOPT_XDSM)) {
+			printk("XFS: this kernel does not support dmapi/xdsm.\n");
+			return rval;
+#endif
+		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
+			strncpy(args->rtname, value, MAXNAMELEN);
+		} else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
+			iosize = simple_strtoul(value, &eov, 10);
+			args->flags |= XFSMNT_IOSIZE;
+			args->iosizelog = (uint8_t) iosize;
+		} else if (!strcmp(this_char, MNTOPT_WSYNC)) {
+			args->flags |= XFSMNT_WSYNC;
+		} else if (!strcmp(this_char, MNTOPT_OSYNCISDSYNC)) {
+			/* no-op, this is now the default */
+			printk("XFS: osyncisdsync is now the default, "
+				"and will soon be deprecated.\n");
+		} else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
+			args->flags |= XFSMNT_OSYNCISOSYNC;
+		} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
+			args->flags |= XFSMNT_NORECOVERY;
+		} else if (!strcmp(this_char, MNTOPT_INO64)) {
+#ifdef XFS_BIG_FILESYSTEMS
+			args->flags |= XFSMNT_INO64;
+#else
+			printk("XFS: ino64 option not allowed on this system\n");
+			return rval;
+#endif
+		} else if (!strcmp(this_char, MNTOPT_UQUOTA)) {
+			args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_QUOTA)) {
+			args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_UQUOTANOENF)) {
+			args->flags |= XFSMNT_UQUOTA;
+			args->flags &= ~XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_QUOTANOENF)) {
+			args->flags |= XFSMNT_UQUOTA;
+			args->flags &= ~XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_MRQUOTA)) {
+			args->flags |= XFSMNT_QUOTAMAYBE;
+		} else if (!strcmp(this_char, MNTOPT_GQUOTA)) {
+			args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
+			args->flags |= XFSMNT_GQUOTA;
+			args->flags &= ~XFSMNT_GQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
+			args->flags |= XFSMNT_NOALIGN;
+		} else if (!strcmp(this_char, MNTOPT_SUNIT)) {
+			dsunit = simple_strtoul(value, &eov, 10);
+		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
+			dswidth = simple_strtoul(value, &eov, 10);
+		} else if (!strcmp(this_char, MNTOPT_NOUUID)) {
+			args->flags |= XFSMNT_NOUUID;
+		} else if (!strcmp(this_char, MNTOPT_IRIXSGID)) {
+			args->flags |= XFSMNT_IRIXSGID;
+		} else if (!strcmp(this_char, MNTOPT_NOLOGFLUSH)) {
+			args->flags |= XFSMNT_NOLOGFLUSH;
+		} else {
+			printk("XFS: unknown mount option [%s].\n", this_char);
+			return rval;
+		}
+	}
+
+	if (args->flags & XFSMNT_NORECOVERY) {
+		if ((flags & MS_RDONLY) == 0) {
+			printk("XFS: no-recovery mounts must be read-only.\n");
+			return rval;
+		}
+	}
+
+	if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) {
+		printk(
+	"XFS: sunit and swidth options incompatible with the noalign option\n");
+		return rval;
+	}
+
+	if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
+		printk("XFS: sunit and swidth must be specified together\n");
+		return rval;
+	}
+
+	if (dsunit && (dswidth % dsunit != 0)) {
+		printk(
+	"XFS: stripe width (%d) must be a multiple of the stripe unit (%d)\n",
+			dswidth, dsunit);
+		return rval;
+	}
+
+	if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
+		if (dsunit) {
+			args->sunit = dsunit;
+			args->flags |= XFSMNT_RETERR;
+		} else
+			args->sunit = vol_dsunit;
+		args->swidth = dswidth ? dswidth : vol_dswidth;
+	} else
+		args->sunit = args->swidth = 0;
+
+	args->logbufs = logbufs;
+	args->logbufsize = logbufsize;
+
+	return 0;
+}
+
+/*
+ * Convert one device special file to a dev_t.
+ * Helper routine, used only by spectodevs below.
+ */
+STATIC int
+spectodev(
+	const char	*name,
+	const char	*id,
+	dev_t		*dev)
+{
+	struct nameidata nd;
+	int		rval = 0;
+
+	if (path_init(name, LOOKUP_FOLLOW, &nd))
+		rval = path_walk(name, &nd);
+	if (!rval) {
+		/* Watch out for negative dentries */
+		if (!nd.dentry->d_inode)
+			rval = -ENOENT;
+		else
+			*dev = nd.dentry->d_inode->i_rdev;
+		/* only release nd after a successful path_walk */
+		path_release(&nd);
+	}
+	if (rval)
+		printk("XFS: Invalid %s device [%s], err=%d\n", id, name, rval);
+	return rval;
+}
+
+/*
+ * Convert device special files to dev_t for data, log, realtime.
+ */
+int
+spectodevs(
+	struct super_block *sb,
+	struct xfs_mount_args *args,
+	dev_t		*ddevp,
+	dev_t		*logdevp,
+	dev_t		*rtdevp)
+{
+	int		rval = 0;
+
+	*ddevp = sb->s_dev;
+
+	if (args->logname[0])
+		rval = spectodev(args->logname, "log", logdevp);
+	else
+		*logdevp = sb->s_dev;
+
+	if (args->rtname[0] && !rval)
+		rval = spectodev(args->rtname, "realtime", rtdevp);
+	else
+		*rtdevp = 0;
+	return rval;
+}
+
+
+static kmem_cache_t * linvfs_inode_cachep;
+
+static __inline__ unsigned int gfp_mask(void)
+{
+	/* Avoid recursing into the filesystem if we are in a transaction */
+	if (current->flags & PF_FSTRANS)
+		return GFP_NOFS;
+	return GFP_KERNEL;
+}
+
+
+static struct inode *linvfs_alloc_inode(struct super_block *sb)
+{
+	vnode_t *vp;
+
+	vp = (vnode_t *)kmem_cache_alloc(linvfs_inode_cachep, gfp_mask());
+	if (!vp)
+		return NULL;
+	return LINVFS_GET_IP(vp);
+}
+
+static void linvfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(linvfs_inode_cachep, LINVFS_GET_VP(inode));
+}
+
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+	vnode_t *vp = (vnode_t *)foo;
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(LINVFS_GET_IP(vp));
+}
+
+static int init_inodecache(void)
+{
+	linvfs_inode_cachep = kmem_cache_create("linvfs_icache",
+				sizeof(vnode_t), 0, SLAB_HWCACHE_ALIGN,
+				init_once, NULL);
+
+	if (linvfs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	if (kmem_cache_destroy(linvfs_inode_cachep))
+		printk(KERN_INFO "linvfs_inode_cache: not all structures were freed\n");
+}
+
+struct super_block *
+linvfs_read_super(
+	struct super_block *sb,
+	void		*data,
+	int		silent)
+{
+	vfs_t		*vfsp;
+	vfsops_t	*vfsops;
+	vnode_t		*rootvp;
+	struct inode	*ip;
+	struct xfs_mount_args *args;
+	struct statfs	statvfs;
+	int		error;
+
+	args = kmalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
+	if (!args)
+		return NULL;
+
+	if (xfs_parseargs((char *)data, sb->s_flags, args))
+		goto out_null;
+	strncpy(args->fsname, bdevname(sb->s_dev), MAXNAMELEN);
+	/* args->rtdev and args->logdev done in xfs_parseargs */
+
+	/*  Kludge in XFS until we have other VFS/VNODE FSs  */
+	vfsops = &xfs_vfsops;
+
+	/*  Set up the vfs_t structure	*/
+	vfsp = vfs_allocate();
+	if (!vfsp)
+		goto out_null;
+
+	if (sb->s_flags & MS_RDONLY)
+		vfsp->vfs_flag |= VFS_RDONLY;
+
+	vfsp->vfs_super = sb;
+	set_blocksize(sb->s_dev, BBSIZE);
+	set_max_bytes(sb);
+	set_quota_ops(sb);
+	sb->s_op = &linvfs_sops;
+
+	LINVFS_SET_VFS(sb, vfsp);
+
+	VFSOPS_MOUNT(vfsops, vfsp, args, NULL, error);
+	if (error)
+		goto fail_vfsop;
+
+	VFS_STATVFS(vfsp, &statvfs, NULL, error);
+	if (error)
+		goto fail_unmount;
+
+	sb->s_magic = XFS_SB_MAGIC;
+	sb->s_dirt = 1;
+	sb->s_blocksize = statvfs.f_bsize;
+	sb->s_blocksize_bits = ffs(statvfs.f_bsize) - 1;
+
+	VFS_ROOT(vfsp, &rootvp, error);
+	if (error)
+		goto fail_unmount;
+
+	ip = LINVFS_GET_IP(rootvp);
+	linvfs_revalidate_core(ip, ATTR_COMM);
+
+	sb->s_root = d_alloc_root(ip);
+	if (!sb->s_root)
+		goto fail_vnrele;
+	if (is_bad_inode(sb->s_root->d_inode))
+		goto fail_vnrele;
+
+	/* Don't set the VFS_DMI flag until here because we don't want
+	 * to send events while replaying the log.
+	 */
+	if (args->flags & XFSMNT_DMAPI) {
+		vfsp->vfs_flag |= VFS_DMI;
+		VFSOPS_DMAPI_MOUNT(vfsops, vfsp, args->mtpt, args->fsname,
+				   error);
+		if (error) {
+			if (atomic_read(&sb->s_active) == 1)
+				vfsp->vfs_flag &= ~VFS_DMI;
+			goto fail_vnrele;
+		}
+	}
+	set_posix_acl(sb);
+
+	vn_trace_exit(rootvp, "linvfs_read_super", (inst_t *)__return_address);
+
+	kfree(args);
+	return sb;
+
+fail_vnrele:
+	if (sb->s_root) {
+		dput(sb->s_root);
+		sb->s_root = NULL;
+	} else {
+		VN_RELE(rootvp);
+	}
+
+fail_unmount:
+	VFS_UNMOUNT(vfsp, 0, NULL, error);
+
+fail_vfsop:
+	vfs_deallocate(vfsp);
+
+out_null:
+	kfree(args);
+	return NULL;
+}
+
+void
+linvfs_set_inode_ops(
+	struct inode	*inode)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	inode->i_mode = VTTOIF(vp->v_type);
+
+	/* If this isn't a new inode, nothing to do */
+	if (!(inode->i_state & I_NEW))
+		return;
+
+	if (vp->v_type == VNON) {
+		make_bad_inode(inode);
+	} else if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &linvfs_file_inode_operations;
+		inode->i_fop = &linvfs_file_operations;
+		inode->i_mapping->a_ops = &linvfs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &linvfs_dir_inode_operations;
+		inode->i_fop = &linvfs_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &linvfs_symlink_inode_operations;
+		if (inode->i_blocks)
+			inode->i_mapping->a_ops = &linvfs_aops;
+	} else {
+		inode->i_op = &linvfs_file_inode_operations;
+		init_special_inode(inode, inode->i_mode,
+					kdev_t_to_nr(inode->i_rdev));
+	}
+
+	unlock_new_inode(inode);
+}
+
+/*
+ * We do not actually write the inode here, just mark the
+ * super block dirty so that sync_supers calls us and
+ * forces the flush.
+ */
+void
+linvfs_write_inode(
+	struct inode	*inode,
+	int		sync)
+{
+	vnode_t *vp = LINVFS_GET_VP(inode);
+	int	error, flags = FLUSH_INODE;
+
+	if (vp) {
+		vn_trace_entry(vp, "linvfs_write_inode",
+					(inst_t *)__return_address);
+
+		if (sync)
+			flags |= FLUSH_SYNC;
+		VOP_IFLUSH(vp, flags, error);
+		if (error == EAGAIN)
+			inode->i_sb->s_dirt = 1;
+	}
+}
+
+void
+linvfs_clear_inode(
+	struct inode	*inode)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	if (vp) {
+		vn_rele(vp);
+		vn_trace_entry(vp, "linvfs_clear_inode",
+					(inst_t *)__return_address);
+		/*
+		 * Do all our cleanup, and remove this vnode.
+		 */
+		vp->v_flag |= VPURGE;
+		vn_remove(vp);
+	}
+}
+
+void
+linvfs_put_inode(
+	struct inode	*ip)
+{
+	vnode_t		*vp = LINVFS_GET_VP(ip);
+	int		error;
+
+	if (vp && vp->v_fbhv && (atomic_read(&ip->i_count) == 1))
+		VOP_RELEASE(vp, error);
+}
+
+void
+linvfs_put_super(
+	struct super_block *sb)
+{
+	int		error;
+	int		sector_size = BBSIZE;
+	kdev_t		dev = sb->s_dev;
+	vfs_t		*vfsp = LINVFS_GET_VFS(sb);
+
+	VFS_DOUNMOUNT(vfsp, 0, NULL, NULL, error);
+	if (error) {
+		printk("XFS unmount got error %d\n", error);
+		printk("linvfs_put_super: vfsp/0x%p left dangling!\n", vfsp);
+		return;
+	}
+
+	vfs_deallocate(vfsp);
+
+	/* Reset device block size */
+	if (hardsect_size[MAJOR(dev)])
+		sector_size = hardsect_size[MAJOR(dev)][MINOR(dev)];
+	set_blocksize(dev, sector_size);
+}
+
+void
+linvfs_write_super(
+	struct super_block *sb)
+{
+	vfs_t		*vfsp = LINVFS_GET_VFS(sb);
+	int		error;
+
+	sb->s_dirt = 0;
+	if (sb->s_flags & MS_RDONLY)
+		return;
+	VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR,
+		NULL, error);
+}
+
+int
+linvfs_statfs(
+	struct super_block *sb,
+	struct statfs	*statp)
+{
+	vfs_t		*vfsp = LINVFS_GET_VFS(sb);
+	int		error;
+
+	VFS_STATVFS(vfsp, statp, NULL, error);
+
+	return error;
+}
+
+int
+linvfs_remount(
+	struct super_block *sb,
+	int		*flags,
+	char		*options)
+{
+	struct xfs_mount_args *args;
+	vfs_t		*vfsp;
+	xfs_mount_t	*mp;
+	int		error = 0;
+
+	args = kmalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+
+	if (xfs_parseargs(options, *flags, args)) {
+		error = -EINVAL;
+		goto out;
+	}
+	set_posix_acl(sb);
+
+	if (args->flags & XFSMNT_NOATIME)
+		mp->m_flags |= XFS_MOUNT_NOATIME;
+	else
+		mp->m_flags &= ~XFS_MOUNT_NOATIME;
+
+	linvfs_write_super(sb);
+
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		goto out;
+
+	if (*flags & MS_RDONLY) {
+		sb->s_flags |= MS_RDONLY;
+		XFS_log_write_unmount_ro(vfsp->vfs_fbhv);
+		vfsp->vfs_flag |= VFS_RDONLY;
+	} else {
+		vfsp->vfs_flag &= ~VFS_RDONLY;
+	}
+
+out:
+	kfree(args);
+	return error;
+}
+
+void
+linvfs_freeze_fs(
+	struct super_block *sb)
+{
+	vfs_t		*vfsp;
+	vnode_t		*vp;
+	int		error;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	if (sb->s_flags & MS_RDONLY)
+		return;
+	VFS_ROOT(vfsp, &vp, error);
+	VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, XFS_IOC_FREEZE, 0, error);
+	VN_RELE(vp);
+}
+
+void
+linvfs_unfreeze_fs(
+	struct super_block *sb)
+{
+	vfs_t		*vfsp;
+	vnode_t		*vp;
+	int		error;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	VFS_ROOT(vfsp, &vp, error);
+	VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, XFS_IOC_THAW, 0, error);
+	VN_RELE(vp);
+}
+
+
+STATIC int linvfs_dentry_to_fh(
+	struct dentry *dentry,
+	__u32 *data,
+	int *lenp,
+	int need_parent)
+{
+	struct inode *inode = dentry->d_inode;
+	vnode_t *vp = LINVFS_GET_VP(inode);
+	int maxlen = *lenp;
+	xfs_fid2_t fid;
+	int error;
+
+	if (maxlen < 3)
+		return 255;
+
+	VOP_FID2(vp, (struct fid *)&fid, error);
+	data[0] = (__u32)fid.fid_ino;	/* 32 bits of inode is OK */
+	data[1] = fid.fid_gen;
+
+	*lenp = 2;
+	if (maxlen < 4 || !need_parent)
+		return 2;
+
+	inode = dentry->d_parent->d_inode;
+	vp = LINVFS_GET_VP(inode);
+
+	VOP_FID2(vp, (struct fid *)&fid, error);
+	data[2] = (__u32)fid.fid_ino;	/* 32 bits of inode is OK */
+	*lenp = 3;
+	if (maxlen < 4)
+		return 3;
+	data[3] = fid.fid_gen;
+	*lenp = 4;
+	return 4;
+}
+
+STATIC struct dentry *linvfs_fh_to_dentry(
+	struct super_block *sb,
+	__u32 *data,
+	int len,
+	int fhtype,
+	int parent)
+{
+	vnode_t *vp;
+	struct inode *inode = NULL;
+	struct list_head *lp;
+	struct dentry *result;
+	xfs_fid2_t xfid;
+	vfs_t *vfsp = LINVFS_GET_VFS(sb);
+	int error;
+
+	xfid.fid_len = sizeof(xfs_fid2_t) - sizeof(xfid.fid_len);
+	xfid.fid_pad = 0;
+
+	if (!parent) {
+		xfid.fid_gen = data[1];
+		xfid.fid_ino = (__u64)data[0];
+	} else {
+		if (fhtype == 4)
+			xfid.fid_gen = data[3];
+		else
+			xfid.fid_gen = 0;
+		xfid.fid_ino = (__u64)data[2];
+	}
+
+	VFS_VGET(vfsp, &vp, (fid_t *)&xfid, error);
+	if (error || vp == NULL)
+		return ERR_PTR(-ESTALE);
+
+	inode = LINVFS_GET_IP(vp);
+	spin_lock(&dcache_lock);
+	for (lp = inode->i_dentry.next; lp != &inode->i_dentry; lp = lp->next) {
+		result = list_entry(lp, struct dentry, d_alias);
+		if (!(result->d_flags & DCACHE_NFSD_DISCONNECTED)) {
+			dget_locked(result);
+			result->d_vfs_flags |= DCACHE_REFERENCED;
+			spin_unlock(&dcache_lock);
+			iput(inode);
+			return result;
+		}
+	}
+	spin_unlock(&dcache_lock);
+	result = d_alloc_root(inode);
+	if (result == NULL) {
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+	result->d_flags |= DCACHE_NFSD_DISCONNECTED;
+	return result;
+}
+
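+/*
+ * Illustrative /proc/mounts entry (hypothetical devices) built from
+ * the option strings emitted below:
+ *	/dev/sda1 /mnt xfs rw,uquota,logdev=/dev/sdb1 0 0
+ */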
+int
+linvfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	vfs_t		*vfsp;
+	xfs_mount_t	*mp;
+	static struct proc_xfs_info {
+		int flag;
+		char *str;
+	} xfs_info[] = {
+		/* the few simple ones we can get from the mount struct */
+		{ XFS_MOUNT_NOALIGN,		",noalign" },
+		{ XFS_MOUNT_NORECOVERY,		",norecovery" },
+		{ XFS_MOUNT_OSYNCISOSYNC,	",osyncisosync" },
+		{ XFS_MOUNT_NOUUID,		",nouuid" },
+		{ XFS_MOUNT_IRIXSGID,		",irixsgid" },
+		{ 0, NULL }
+	};
+
+	struct proc_xfs_info *xfs_infop;
+
+	vfsp = LINVFS_GET_VFS(mnt->mnt_sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+
+	for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) {
+		if (mp->m_flags & xfs_infop->flag)
+			seq_puts(m, xfs_infop->str);
+	}
+
+	if (mp->m_qflags & XFS_UQUOTA_ACCT) {
+		seq_puts(m, ",uquota");
+		if (!(mp->m_qflags & XFS_UQUOTA_ENFD))
+			seq_puts(m, ",uqnoenforce");
+	}
+
+	if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+		seq_puts(m, ",gquota");
+		if (!(mp->m_qflags & XFS_GQUOTA_ENFD))
+			seq_puts(m, ",gqnoenforce");
+	}
+
+	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+		seq_printf(m, ",biosize=%d", mp->m_writeio_log);
+
+	if (mp->m_logbufs > 0)
+		seq_printf(m, ",logbufs=%d", mp->m_logbufs);
+
+	if (mp->m_logbsize > 0)
+		seq_printf(m, ",logbsize=%d", mp->m_logbsize);
+
+	if (mp->m_ddev_targp->pbr_dev != mp->m_logdev_targp->pbr_dev)
+		seq_printf(m, ",logdev=%s",
+				bdevname(mp->m_logdev_targp->pbr_dev));
+
+	if (mp->m_rtdev_targp &&
+	    mp->m_ddev_targp->pbr_dev != mp->m_rtdev_targp->pbr_dev)
+		seq_printf(m, ",rtdev=%s",
+				bdevname(mp->m_rtdev_targp->pbr_dev));
+
+	if (mp->m_dalign > 0)
+		seq_printf(m, ",sunit=%d",
+				(int)XFS_FSB_TO_BB(mp, mp->m_dalign));
+
+	if (mp->m_swidth > 0)
+		seq_printf(m, ",swidth=%d",
+				(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
+
+	if (vfsp->vfs_flag & VFS_DMI)
+		seq_puts(m, ",dmapi");
+
+	return 0;
+}
+
+static struct super_operations linvfs_sops = {
+	.alloc_inode		= linvfs_alloc_inode,
+	.destroy_inode		= linvfs_destroy_inode,
+	.write_inode		= linvfs_write_inode,
+	.put_inode		= linvfs_put_inode,
+	.clear_inode		= linvfs_clear_inode,
+	.put_super		= linvfs_put_super,
+	.write_super		= linvfs_write_super,
+	.write_super_lockfs	= linvfs_freeze_fs,
+	.unlockfs		= linvfs_unfreeze_fs,
+	.statfs			= linvfs_statfs,
+	.remount_fs		= linvfs_remount,
+	.fh_to_dentry		= linvfs_fh_to_dentry,
+	.dentry_to_fh		= linvfs_dentry_to_fh,
+	.show_options		= linvfs_show_options,
+};
+
+static struct file_system_type xfs_fs_type = {
+	.owner			= THIS_MODULE,
+	.name			= "xfs",
+	.read_super		= linvfs_read_super,
+	.fs_flags		= FS_REQUIRES_DEV,
+};
+
+static int __init init_xfs_fs(void)
+{
+	int error;
+	struct sysinfo	si;
+	static char message[] __initdata =
+		KERN_INFO "SGI XFS " XFS_VERSION_STRING " with " 
+		XFS_BUILD_OPTIONS " enabled\n";
+
+	error = init_inodecache();
+	if (error < 0)
+		return error;
+
+	error = pagebuf_init();
+	if (error < 0)
+		goto out;
+
+	si_meminfo(&si);
+	xfs_physmem = si.totalram;
+
+	printk(message);
+
+	vn_init();
+	xfs_init();
+	dmapi_init();
+
+	error = register_filesystem(&xfs_fs_type);
+	if (error)
+		goto out;
+	return 0;
+
+out:
+	destroy_inodecache();
+	return error;
+}
+
+
+static void __exit exit_xfs_fs(void)
+{
+	dmapi_uninit();
+	xfs_cleanup();
+	unregister_filesystem(&xfs_fs_type);
+	pagebuf_terminate();
+	destroy_inodecache();
+}
+
+module_init(init_xfs_fs);
+module_exit(exit_xfs_fs);
+
+MODULE_AUTHOR("SGI <sgi.com>");
+MODULE_DESCRIPTION("SGI XFS " XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
+MODULE_LICENSE("GPL");
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_super.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_super.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_super.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_super.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPER_H__
+#define __XFS_SUPER_H__
+
+#ifdef CONFIG_FS_POSIX_ACL
+# define XFS_ACL_STRING		"ACLs, "
+#else
+# define XFS_ACL_STRING
+#endif
+
+#ifdef CONFIG_XFS_DMAPI
+# define XFS_DMAPI_STRING	"DMAPI, "
+#else
+# define XFS_DMAPI_STRING
+#endif
+
+#ifdef CONFIG_XFS_QUOTA
+# define XFS_QUOTA_STRING	"quota, "
+#else
+# define XFS_QUOTA_STRING
+#endif
+
+#ifdef CONFIG_XFS_RT
+# define XFS_RT_STRING		"realtime, "
+#else
+# define XFS_RT_STRING
+#endif
+
+#ifdef CONFIG_XFS_VNODE_TRACING
+# define XFS_VNTRACE_STRING	"VN-trace, "
+#else
+# define XFS_VNTRACE_STRING
+#endif
+
+#ifdef XFSDEBUG
+# define XFS_DBG_STRING		"debug"
+#else
+# define XFS_DBG_STRING		"no debug"
+#endif
+
+#define XFS_BUILD_OPTIONS	XFS_ACL_STRING XFS_DMAPI_STRING \
+				XFS_RT_STRING \
+				XFS_QUOTA_STRING XFS_VNTRACE_STRING \
+				XFS_DBG_STRING /* DBG must be last */
+
+
+#define LINVFS_GET_VFS(s) \
+	(vfs_t *)((s)->u.generic_sbp)
+#define LINVFS_SET_VFS(s, vfsp) \
+	((s)->u.generic_sbp = vfsp)
+
+
+struct xfs_mount_args;
+
+extern void
+linvfs_set_inode_ops(
+	struct inode	*inode);
+
+extern int
+spectodevs(
+	struct super_block *sb,
+	struct xfs_mount_args *args,
+	dev_t		*ddevp,
+	dev_t		*logdevp,
+	dev_t		*rtdevp);
+
+#endif	/* __XFS_SUPER_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_sysctl.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_sysctl.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_sysctl.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_sysctl.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+/*
+ * Tunable xfs parameters
+ */
+
+extern struct xfsstats xfsstats;
+
+unsigned long xfs_min[XFS_PARAM] = {			 0,			 0, 0 };
+unsigned long xfs_max[XFS_PARAM] = { XFS_REFCACHE_SIZE_MAX,  XFS_REFCACHE_SIZE_MAX, 1 };
+
+xfs_param_t xfs_params = { 128, 32, 0 };
+
+static struct ctl_table_header *xfs_table_header;
+
+/* proc handlers */
+
+extern void xfs_refcache_resize(int xfs_refcache_new_size);
+
+static int
+xfs_refcache_resize_proc_handler(ctl_table *ctl, int write, struct file * filp,
+		       void *buffer, size_t *lenp)
+{
+	int	ret;
+	int	*valp = ctl->data;
+	int	xfs_refcache_new_size;
+	int	xfs_refcache_old_size = *valp;
+
+	ret = proc_doulongvec_minmax(ctl, write, filp, buffer, lenp);
+	xfs_refcache_new_size = *valp;
+
+	if (!ret && write && xfs_refcache_new_size != xfs_refcache_old_size) {
+		xfs_refcache_resize(xfs_refcache_new_size);
+		/* Don't purge more than size of the cache */
+		if (xfs_refcache_new_size < xfs_params.refcache_purge)
+			xfs_params.refcache_purge = xfs_refcache_new_size;
+	}
+
+	return ret;
+}
+
+static int
+xfs_stats_clear_proc_handler(ctl_table *ctl, int write, struct file * filp,
+		       void *buffer, size_t *lenp)
+{
+	int		ret;
+	int		*valp = ctl->data;
+	__uint32_t	vn_active;
+
+	ret = proc_doulongvec_minmax(ctl, write, filp, buffer, lenp);
+
+	if (!ret && write && *valp) {
+		printk("XFS: Clearing xfsstats\n");
+		/* save vn_active, it's a universal truth! */
+		vn_active = xfsstats.vn_active;
+		memset(&xfsstats, 0, sizeof(xfsstats));
+		xfsstats.vn_active = vn_active;
+		xfs_params.stats_clear = 0;
+	}
+
+	return ret;
+}
+
+static ctl_table xfs_table[] = {
+	{XFS_REFCACHE_SIZE, "refcache_size", &xfs_params.refcache_size,
+	sizeof(ulong), 0644, NULL, &xfs_refcache_resize_proc_handler,
+	&sysctl_intvec, NULL, &xfs_min[0], &xfs_max[0]},
+
+	{XFS_REFCACHE_PURGE, "refcache_purge", &xfs_params.refcache_purge,
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_minmax,
+	&sysctl_intvec, NULL, &xfs_min[1], &xfs_params.refcache_size},
+
+	{XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear,
+	sizeof(ulong), 0644, NULL, &xfs_stats_clear_proc_handler,
+	&sysctl_intvec, NULL, &xfs_min[2], &xfs_max[2]},
+
+	{0}
+};
+
+static ctl_table xfs_dir_table[] = {
+	{FS_XFS, "xfs", NULL, 0, 0555, xfs_table},
+	{0}
+};
+
+static ctl_table xfs_root_table[] = {
+	{CTL_FS, "fs",	NULL, 0, 0555, xfs_dir_table},
+	{0}
+};
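+
+/*
+ * Once registered, the tunables appear under /proc/sys/fs/xfs;
+ * illustrative usage:
+ *	echo 1 > /proc/sys/fs/xfs/stats_clear
+ *	sysctl -w fs.xfs.refcache_size=512
+ */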
+
+void
+xfs_sysctl_register(void)
+{
+	xfs_table_header = register_sysctl_table(xfs_root_table, 1);
+}
+
+void
+xfs_sysctl_unregister(void)
+{
+	if (xfs_table_header)
+		unregister_sysctl_table(xfs_table_header);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_sysctl.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_sysctl.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_sysctl.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_sysctl.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef __XFS_SYSCTL_H__
+#define __XFS_SYSCTL_H__
+
+#include <linux/sysctl.h>
+
+/*
+ * Tunable xfs parameters
+ */
+
+#define XFS_PARAM	3
+
+typedef struct xfs_param {
+	ulong	refcache_size;	/* Size of nfs refcache */
+	ulong	refcache_purge; /* # of entries to purge each time */
+	ulong	stats_clear;	/* reset all xfs stats to 0 */
+} xfs_param_t;
+
+enum {
+	XFS_REFCACHE_SIZE = 1,
+	XFS_REFCACHE_PURGE = 2,
+	XFS_STATS_CLEAR = 3,
+};
+
+extern xfs_param_t	xfs_params;
+
+#ifdef CONFIG_SYSCTL
+extern void xfs_sysctl_register(void);
+extern void xfs_sysctl_unregister(void);
+#else
+static __inline void xfs_sysctl_register(void) { }
+static __inline void xfs_sysctl_unregister(void) { }
+#endif
+
+#endif /* __XFS_SYSCTL_H__ */
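
For context, a sketch of how these hooks are typically wired into the module
init/exit paths; the xfs_init/xfs_exit names here are illustrative only, the
real call sites live elsewhere in this patch.  When CONFIG_SYSCTL is off,
the inline stubs above make both calls compile away:

	#include <linux/init.h>

	/* Hypothetical wiring, not the actual init code from this patch. */
	static int __init xfs_init(void)
	{
		xfs_sysctl_register();
		return 0;
	}

	static void __exit xfs_exit(void)
	{
		xfs_sysctl_unregister();
	}
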
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_version.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_version.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_version.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_version.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Dummy file that can contain a timestamp to put into the
+ * XFS init string, to help users keep track of what they're
+ * running.
+ */
+
+#ifndef __XFS_VERSION_H__
+#define __XFS_VERSION_H__
+
+#define XFS_VERSION_STRING "CVS"
+
+#endif /* __XFS_VERSION_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_vfs.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_vfs.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_vfs.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_vfs.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_VFS_H__
+#define __XFS_VFS_H__
+
+#include <linux/vfs.h>
+
+struct statfs;
+struct vnode;
+struct cred;
+struct super_block;
+struct fid;
+struct dm_fcntl_vector;
+struct xfs_mount_args;
+
+typedef struct vfs {
+	u_int		vfs_flag;	/* flags */
+	fsid_t		vfs_fsid;	/* file system id */
+	fsid_t		*vfs_altfsid;	/* An ID fixed for life of FS */
+	bhv_head_t	vfs_bh;		/* head of vfs behavior chain */
+	struct super_block *vfs_super;	/* pointer to super block structure */
+} vfs_t;
+
+#define vfs_fbhv	vfs_bh.bh_first		/* 1st on vfs behavior chain */
+#define VFS_FOPS(vfsp)	\
+	((vfsops_t *)((vfsp)->vfs_fbhv->bd_ops))/* ops for 1st behavior */
+
+
+#define bhvtovfs(bdp)	((struct vfs *)BHV_VOBJ(bdp))
+#define VFS_BHVHEAD(vfsp) (&(vfsp)->vfs_bh)
+
+
+#define VFS_RDONLY	0x0001		/* read-only vfs */
+#define VFS_GRPID	0x0002		/* group-ID assigned from directory */
+#define VFS_DMI		0x0004		/* filesystem has the DMI enabled */
+
+#define SYNC_ATTR	0x0001		/* sync attributes */
+#define SYNC_CLOSE	0x0002		/* close file system down */
+#define SYNC_DELWRI	0x0004		/* look at delayed writes */
+#define SYNC_WAIT	0x0008		/* wait for i/o to complete */
+#define SYNC_BDFLUSH	0x0010		/* BDFLUSH is calling -- don't block */
+#define SYNC_FSDATA	0x0020		/* flush fs data (e.g. superblocks) */
+
+
+typedef struct vfsops {
+	int	(*vfs_mount)(struct vfs *, struct xfs_mount_args *,
+					struct cred *);
+					/* mount file system */
+	int	(*vfs_dounmount)(bhv_desc_t *, int, struct vnode *,
+				 struct cred *);
+					/* preparation and unmount */
+	int	(*vfs_unmount)(bhv_desc_t *, int, struct cred *);
+					/* unmount file system */
+	int	(*vfs_root)(bhv_desc_t *, struct vnode **);
+					/* get root vnode */
+	int	(*vfs_statvfs)(bhv_desc_t *, struct statfs *, struct vnode *);
+					/* get file system statistics */
+	int	(*vfs_sync)(bhv_desc_t *, int, struct cred *);
+					/* flush files */
+	int	(*vfs_vget)(bhv_desc_t *, struct vnode **, struct fid *);
+					/* get vnode from fid */
+	int	(*vfs_dmapi_mount)(struct vfs *, char *, char *);
+					/* send dmapi mount event */
+	int	(*vfs_dmapi_fsys_vector)(bhv_desc_t *,
+					 struct dm_fcntl_vector *);
+	void	(*vfs_force_shutdown)(bhv_desc_t *,
+					int, char *, int);
+} vfsops_t;
+
+#define VFS_DOUNMOUNT(vfsp,f,vp,cr, rv) \
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_dounmount))((vfsp)->vfs_fbhv, f, vp, cr);	\
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+#define VFS_UNMOUNT(vfsp,f,cr, rv)	\
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_unmount))((vfsp)->vfs_fbhv, f, cr);		\
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+#define VFS_ROOT(vfsp, vpp, rv)		\
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_root))((vfsp)->vfs_fbhv, vpp);	\
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+#define VFS_STATVFS(vfsp, sp, vp, rv)	\
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_statvfs))((vfsp)->vfs_fbhv, sp, vp);	\
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+#define VFS_SYNC(vfsp, flag, cr, rv) \
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_sync))((vfsp)->vfs_fbhv, flag, cr); \
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+#define VFS_VGET(vfsp, vpp, fidp, rv) \
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_vget))((vfsp)->vfs_fbhv, vpp, fidp);  \
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+/* No behavior lock here */
+#define VFS_FORCE_SHUTDOWN(vfsp, flags) \
+	(*(VFS_FOPS(vfsp)->vfs_force_shutdown))((vfsp)->vfs_fbhv, flags, __FILE__, __LINE__);
+
+#define VFS_DMAPI_FSYS_VECTOR(vfsp, df, rv) \
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_dmapi_fsys_vector))((vfsp)->vfs_fbhv, df);	      \
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+
+
+#define VFSOPS_DMAPI_MOUNT(vfs_op, vfsp, dir_name, fsname, rv) \
+	rv = (*(vfs_op)->vfs_dmapi_mount)(vfsp, dir_name, fsname)
+#define VFSOPS_MOUNT(vfs_op, vfsp, args, cr, rv) \
+	rv = (*(vfs_op)->vfs_mount)(vfsp, args, cr)
+
+#define VFS_REMOVEBHV(vfsp, bdp)\
+{	\
+	bhv_remove(VFS_BHVHEAD(vfsp), bdp); \
+}
+
+#define PVFS_UNMOUNT(bdp,f,cr, rv)	\
+{	\
+	rv = (*((vfsops_t *)(bdp)->bd_ops)->vfs_unmount)(bdp, f, cr);	\
+}
+
+#define PVFS_SYNC(bdp, flag, cr, rv) \
+{	\
+	rv = (*((vfsops_t *)(bdp)->bd_ops)->vfs_sync)(bdp, flag, cr);	\
+}
+
+
+static __inline vfs_t *
+vfs_allocate(void)
+{
+	vfs_t	*vfsp;
+
+	vfsp = kmalloc(sizeof(vfs_t), GFP_KERNEL);
+	if (vfsp) {
+		memset(vfsp, 0, sizeof(vfs_t));
+		bhv_head_init(VFS_BHVHEAD(vfsp), "vfs");
+	}
+	return (vfsp);
+}
+
+static __inline void
+vfs_deallocate(
+	vfs_t		*vfsp)
+{
+	bhv_head_destroy(VFS_BHVHEAD(vfsp));
+	kfree(vfsp);
+}
+
+/*
+ * Called by fs dependent VFS_MOUNT code to link the VFS base file system
+ * dependent behavior with the VFS virtual object.
+ */
+static __inline void
+vfs_insertbhv(
+	vfs_t		*vfsp,
+	bhv_desc_t	*bdp,
+	vfsops_t	*vfsops,
+	void		*mount)
+{
+	/*
+	 * Initialize behavior desc with ops and data and then
+	 * attach it to the vfs.
+	 */
+	bhv_desc_init(bdp, mount, vfsp, vfsops);
+	bhv_insert_initial(&vfsp->vfs_bh, bdp);
+}
+
+#endif	/* __XFS_VFS_H__ */
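
Note that the VFS_* dispatch macros above return status through an
out-parameter rather than a return value, taking the behavior-chain read
lock around each call.  A minimal caller sketch, assuming vfsp is an
already-mounted vfs_t (VN_RELE comes from xfs_vnode.h):

	int		error;
	struct vnode	*rootvp;

	/* Look up the root vnode, sync the filesystem, drop the ref. */
	VFS_ROOT(vfsp, &rootvp, error);
	if (!error) {
		VFS_SYNC(vfsp, SYNC_ATTR | SYNC_WAIT, NULL, error);
		VN_RELE(rootvp);
	}
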
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_vnode.c linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_vnode.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_vnode.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_vnode.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/pagemap.h>
+
+
+uint64_t vn_generation;		/* vnode generation number */
+
+spinlock_t	vnumber_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Dedicated vnode inactive/reclaim sync semaphores.
+ * Prime number of hash buckets since address is used as the key.
+ */
+#define NVSYNC			37
+#define vptosync(v)		(&vsync[((unsigned long)v) % NVSYNC])
+sv_t vsync[NVSYNC];
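+/*
+ * For example, the vnode at address vp synchronizes on
+ * vsync[(unsigned long)vp % 37]; a prime bucket count keeps the
+ * aligned (power-of-two) low bits of kernel pointers from funneling
+ * every vnode into a handful of buckets.
+ */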
+
+/*
+ * Translate stat(2) file types to vnode types and vice versa.
+ * Aware of numeric order of S_IFMT and vnode type values.
+ */
+enum vtype iftovt_tab[] = {
+	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
+};
+
+u_short vttoif_tab[] = {
+	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK
+};
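+
+/*
+ * Worked example: S_IFDIR is 0040000, so IFTOVT(S_IFDIR) is
+ * iftovt_tab[(S_IFDIR & S_IFMT) >> 12] == iftovt_tab[4] == VDIR,
+ * and VTTOIF(VDIR) is vttoif_tab[2] == S_IFDIR.
+ */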
+
+#define VN_LOCK(vp)		spin_lock(&(vp)->v_lock)
+#define VN_UNLOCK(vp)		spin_unlock(&(vp)->v_lock)
+
+
+void
+vn_init(void)
+{
+	register sv_t *svp;
+	register int i;
+
+	for (svp = vsync, i = 0; i < NVSYNC; i++, svp++)
+		init_sv(svp, SV_DEFAULT, "vsy", i);
+}
+
+
+/*
+ * Clean a vnode of filesystem-specific data and prepare it for reuse.
+ */
+STATIC int
+vn_reclaim(struct vnode *vp)
+{
+	int error;
+
+	XFS_STATS_INC(xfsstats.vn_reclaim);
+
+	vn_trace_entry(vp, "vn_reclaim", (inst_t *)__return_address);
+
+	/*
+	 * Only make the VOP_RECLAIM call if there are behaviors
+	 * to call.
+	 */
+	if (vp->v_fbhv != NULL) {
+		VOP_RECLAIM(vp, error);
+		if (error)
+			return -error;
+	}
+	ASSERT(vp->v_fbhv == NULL);
+
+	VN_LOCK(vp);
+
+	vp->v_flag &= (VRECLM|VWAIT);
+	VN_UNLOCK(vp);
+
+	vp->v_type = VNON;
+	vp->v_fbhv = NULL;
+
+#ifdef CONFIG_XFS_VNODE_TRACING
+	ktrace_free(vp->v_trace);
+	vp->v_trace = NULL;
+#endif
+
+	return 0;
+}
+
+STATIC void
+vn_wakeup(struct vnode *vp)
+{
+	VN_LOCK(vp);
+	if (vp->v_flag & VWAIT) {
+		sv_broadcast(vptosync(vp));
+	}
+	vp->v_flag &= ~(VRECLM|VWAIT|VMODIFIED);
+	VN_UNLOCK(vp);
+}
+
+int
+vn_wait(struct vnode *vp)
+{
+	VN_LOCK(vp);
+	if (vp->v_flag & (VINACT | VRECLM)) {
+		vp->v_flag |= VWAIT;
+		sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
+		return 1;
+	}
+	VN_UNLOCK(vp);
+	return 0;
+}
+
+struct vnode *
+vn_initialize(struct inode *inode)
+{
+	struct vnode	*vp = LINVFS_GET_VP(inode);
+
+	XFS_STATS_INC(xfsstats.vn_active);
+
+	vp->v_flag = VMODIFIED;
+	spinlock_init(&vp->v_lock, "v_lock");
+
+	spin_lock(&vnumber_lock);
+	if (!++vn_generation)	/* v_number shouldn't be zero */
+		vn_generation++;
+	vp->v_number = vn_generation;
+	spin_unlock(&vnumber_lock);
+
+	ASSERT(VN_CACHED(vp) == 0);
+
+	/* Initialize the first behavior and the behavior chain head. */
+	vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode");
+
+#ifdef	CONFIG_XFS_VNODE_TRACING
+	vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
+#endif	/* CONFIG_XFS_VNODE_TRACING */
+
+	vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address);
+	return vp;
+}
+
+/*
+ * Get a reference on a vnode.
+ */
+vnode_t *
+vn_get(struct vnode *vp, vmap_t *vmap)
+{
+	struct inode	*inode;
+
+	XFS_STATS_INC(xfsstats.vn_get);
+	inode = LINVFS_GET_IP(vp);
+	if (inode->i_state & I_FREEING)
+		return NULL;
+
+	inode = iget_locked(vmap->v_vfsp->vfs_super, vmap->v_ino);
+	if (inode == NULL)		/* Inode not present */
+		return NULL;
+
+	/* We do not want to create new inodes via vn_get;
+	 * returning NULL here is OK.
+	 */
+	if (inode->i_state & I_NEW) {
+		make_bad_inode(inode);
+		unlock_new_inode(inode);
+		iput(inode);
+		return NULL;
+	}
+
+	vn_trace_exit(vp, "vn_get", (inst_t *)__return_address);
+	ASSERT((vp->v_flag & VPURGE) == 0);
+
+	return vp;
+}
+
+/*
+ * "revalidate" the linux inode.
+ */
+int
+vn_revalidate(struct vnode *vp, int flags)
+{
+	int		error;
+	struct inode	*inode;
+	vattr_t		va;
+
+	vn_trace_entry(vp, "vn_revalidate", (inst_t *)__return_address);
+
+	va.va_mask = AT_STAT|AT_GENCOUNT;
+
+	ASSERT(vp->v_bh.bh_first != NULL);
+
+	VOP_GETATTR(vp, &va, flags & ATTR_LAZY, NULL, error);
+
+	if (! error) {
+		inode = LINVFS_GET_IP(vp);
+		ASSERT(inode);
+
+		inode->i_mode	    = VTTOIF(va.va_type) | va.va_mode;
+		inode->i_nlink	    = va.va_nlink;
+		inode->i_uid	    = va.va_uid;
+		inode->i_gid	    = va.va_gid;
+		inode->i_rdev	    = mk_kdev(MAJOR(va.va_rdev),
+						MINOR(va.va_rdev));
+		inode->i_blksize    = PAGE_CACHE_SIZE;
+		inode->i_generation = va.va_gencount;
+		if ((flags & ATTR_COMM) ||
+		    S_ISREG(inode->i_mode) ||
+		    S_ISDIR(inode->i_mode) ||
+		    S_ISLNK(inode->i_mode)) {
+			inode->i_size	    = va.va_size;
+			inode->i_blocks	    = va.va_nblocks;
+			inode->i_atime	    = va.va_atime.tv_sec;
+			inode->i_mtime	    = va.va_mtime.tv_sec;
+			inode->i_ctime	    = va.va_ctime.tv_sec;
+		}
+		if (flags & ATTR_LAZY)
+			vp->v_flag &= ~VMODIFIED;
+		else
+			VUNMODIFY(vp);
+	} else {
+		vn_trace_exit(vp, "vn_revalidate.error",
+					(inst_t *)__return_address);
+	}
+
+	return -error;
+}
+
+
+/*
+ * purge a vnode from the cache
+ * At this point the vnode is guaranteed to have no references (vn_count == 0)
+ * The caller has to make sure that there are no ways someone could
+ * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock).
+ */
+void
+vn_purge(struct vnode *vp, vmap_t *vmap)
+{
+	vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address);
+
+	ASSERT(vp->v_flag & VPURGE);
+
+again:
+	/*
+	 * Check whether vp has already been reclaimed since our caller
+	 * sampled its version while holding a filesystem cache lock that
+	 * its VOP_RECLAIM function acquires.
+	 */
+	VN_LOCK(vp);
+	if (vp->v_number != vmap->v_number) {
+		VN_UNLOCK(vp);
+		return;
+	}
+
+	/*
+	 * If vp is being reclaimed or inactivated, wait until it is inert,
+	 * then proceed.  Can't assume that vnode is actually reclaimed
+	 * just because the reclaimed flag is asserted -- a vn_alloc
+	 * reclaim can fail.
+	 */
+	if (vp->v_flag & (VINACT | VRECLM)) {
+		ASSERT(vn_count(vp) == 0);
+		vp->v_flag |= VWAIT;
+		sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
+		goto again;
+	}
+
+	/*
+	 * Another process could have raced in and gotten this vnode...
+	 */
+	if (vn_count(vp) > 0) {
+		VN_UNLOCK(vp);
+		return;
+	}
+
+	XFS_STATS_DEC(xfsstats.vn_active);
+	vp->v_flag |= VRECLM;
+	VN_UNLOCK(vp);
+
+	/*
+	 * Call VOP_RECLAIM and clean vp. The FSYNC_INVAL flag tells
+	 * vp's filesystem to flush and invalidate all cached resources.
+	 * When vn_reclaim returns, vp should have no private data,
+	 * either in a system cache or attached to v_data.
+	 */
+	if (vn_reclaim(vp) != 0)
+		panic("vn_purge: cannot reclaim");
+
+	/*
+	 * Wakeup anyone waiting for vp to be reclaimed.
+	 */
+	vn_wakeup(vp);
+}
+
+/*
+ * Add a reference to a referenced vnode.
+ */
+struct vnode *
+vn_hold(struct vnode *vp)
+{
+	struct inode *inode;
+
+	XFS_STATS_INC(xfsstats.vn_hold);
+
+	VN_LOCK(vp);
+	inode = igrab(LINVFS_GET_IP(vp));
+	ASSERT(inode);
+	VN_UNLOCK(vp);
+
+	return vp;
+}
+
+/*
+ *  Call VOP_INACTIVE on last reference.
+ */
+void
+vn_rele(struct vnode *vp)
+{
+	int	vcnt;
+	/* REFERENCED */
+	int cache;
+
+	XFS_STATS_INC(xfsstats.vn_rele);
+
+	VN_LOCK(vp);
+
+	vn_trace_entry(vp, "vn_rele", (inst_t *)__return_address);
+	vcnt = vn_count(vp);
+
+	/*
+	 * Since we always get called from put_inode we know
+	 * that i_count won't be decremented after we
+	 * return.
+	 */
+	if (vcnt == 0) {
+		/*
+		 * As soon as we turn this on, no one can find us in vn_get
+		 * until we turn off VINACT or VRECLM
+		 */
+		vp->v_flag |= VINACT;
+		VN_UNLOCK(vp);
+
+		/*
+		 * Do not make the VOP_INACTIVE call if there
+		 * are no behaviors attached to the vnode to call.
+		 */
+		if (vp->v_fbhv != NULL) {
+			VOP_INACTIVE(vp, NULL, cache);
+		}
+
+		VN_LOCK(vp);
+		if (vp->v_flag & VWAIT)
+			sv_broadcast(vptosync(vp));
+
+		vp->v_flag &= ~(VINACT|VWAIT|VRECLM|VMODIFIED);
+
+	}
+
+	VN_UNLOCK(vp);
+
+	vn_trace_exit(vp, "vn_rele", (inst_t *)__return_address);
+}
+
+
+/*
+ * Finish the removal of a vnode.
+ */
+void
+vn_remove(struct vnode *vp)
+{
+	/* REFERENCED */
+	vmap_t	vmap;
+
+	/* Make sure we don't do this to the same vnode twice */
+	if (!(vp->v_fbhv))
+		return;
+
+	XFS_STATS_INC(xfsstats.vn_remove);
+
+	vn_trace_exit(vp, "vn_remove", (inst_t *)__return_address);
+
+	/*
+	 * After the following purge the vnode
+	 * will no longer exist.
+	 */
+	VMAP(vp, XFS_BHVTOI(vp->v_fbhv), vmap);
+
+	vn_purge(vp, &vmap);
+}
+
+
+#ifdef	CONFIG_XFS_VNODE_TRACING
+
+#define KTRACE_ENTER(vp, vk, s, line, ra)			\
+	ktrace_enter(	(vp)->v_trace,				\
+/*  0 */		(void *)(__psint_t)(vk),		\
+/*  1 */		(void *)(s),				\
+/*  2 */		(void *)(__psint_t) line,		\
+/*  3 */		(void *)(__psint_t)vn_count(vp),	\
+/*  4 */		(void *)(ra),				\
+/*  5 */		(void *)(__psunsigned_t)(vp)->v_flag,	\
+/*  6 */		(void *)(__psint_t)smp_processor_id(),	\
+/*  7 */		(void *)(__psint_t)(current->pid),	\
+/*  8 */		(void *)__return_address,		\
+/*  9 */		0, 0, 0, 0, 0, 0, 0)
+
+/*
+ * Vnode tracing code.
+ */
+void
+vn_trace_entry(vnode_t *vp, char *func, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra);
+}
+
+void
+vn_trace_exit(vnode_t *vp, char *func, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra);
+}
+
+void
+vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra);
+}
+
+void
+vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra);
+}
+
+void
+vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra);
+}
+#endif	/* CONFIG_XFS_VNODE_TRACING */
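
The vmap protocol documented in xfs_vnode.h below pairs with vn_get() and
vn_purge() above.  A minimal caller sketch, where vp and ip are assumed to
be a live vnode and its XFS inode, sampled while holding the filesystem
lock that VOP_RECLAIM acquires:

	vmap_t		vmap;
	struct vnode	*nvp;

	VMAP(vp, ip, vmap);		/* sample identity under the lock */
	/* ... the lock is dropped; vp may be reclaimed here ... */
	nvp = vn_get(vp, &vmap);	/* NULL if vp was recycled meanwhile */
	if (nvp != NULL)
		VN_RELE(nvp);
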
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_vnode.h linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_vnode.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/linux/xfs_vnode.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/linux/xfs_vnode.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,764 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_VNODE_H__
+#define __XFS_VNODE_H__
+
+/*
+ * Vnode types (unrelated to on-disk inodes).  VNON means no type.
+ */
+typedef enum vtype {
+	VNON	= 0,
+	VREG	= 1,
+	VDIR	= 2,
+	VBLK	= 3,
+	VCHR	= 4,
+	VLNK	= 5,
+	VFIFO	= 6,
+	VBAD	= 7,
+	VSOCK	= 8
+} vtype_t;
+
+typedef __u64	vnumber_t;
+
+/*
+ * Define the type of behavior head used by vnodes.
+ */
+#define vn_bhv_head_t	bhv_head_t
+
+/*
+ * MP locking protocols:
+ *	v_flag, v_count				VN_LOCK/VN_UNLOCK
+ *	v_vfsp					VN_LOCK/VN_UNLOCK
+ *	v_type					read-only or fs-dependent
+ *	v_list, v_hashp, v_hashn		freelist lock
+ */
+typedef struct vnode {
+	__u32		v_flag;			/* vnode flags (see below) */
+	enum vtype	v_type;			/* vnode type		*/
+	struct vfs	*v_vfsp;		/* ptr to containing VFS*/
+	vnumber_t	v_number;		/* in-core vnode number */
+	vn_bhv_head_t	v_bh;			/* behavior head */
+
+	spinlock_t	v_lock;			/* don't use VLOCK on Linux */
+	struct inode	v_inode;		/* linux inode */
+#ifdef	CONFIG_XFS_VNODE_TRACING
+	struct ktrace	*v_trace;		/* trace header structure    */
+#endif	/* CONFIG_XFS_VNODE_TRACING */
+} vnode_t;
+
+/*
+ * Vnode to Linux inode mapping.
+ */
+#define LINVFS_GET_VP(inode)	((vnode_t *)list_entry(inode, vnode_t, v_inode))
+#define LINVFS_GET_IP(vp)	(&(vp)->v_inode)
+
+/*
+ * Conversion between vnode types/modes and encoded type/mode as
+ * seen by stat(2) and mknod(2).
+ */
+extern enum vtype	iftovt_tab[];
+extern ushort		vttoif_tab[];
+#define IFTOVT(M)	(iftovt_tab[((M) & S_IFMT) >> 12])
+#define VTTOIF(T)	(vttoif_tab[(int)(T)])
+#define MAKEIMODE(T, M) (VTTOIF(T) | ((M) & ~S_IFMT))
+
+/*
+ * Vnode flags.
+ *
+ * The vnode flags fall into two categories:
+ * 1) Local only -
+ *    Flags that are relevant only to a particular cell
+ * 2) Single system image -
+ *    Flags that must be maintained coherent across all cells
+ */
+ /* Local only flags */
+#define VINACT		       0x1	/* vnode is being inactivated	*/
+#define VRECLM		       0x2	/* vnode is being reclaimed	*/
+#define VWAIT		       0x4	/* waiting for VINACT
+					   or VRECLM to finish */
+#define VMODIFIED	       0x8	/* xfs inode state possibly different
+					 * from linux inode state.
+					 */
+
+/* Single system image flags */
+#define VROOT		  0x100000	/* root of its file system	*/
+#define VNOSWAP		  0x200000	/* cannot be used as virt swap device */
+#define VISSWAP		  0x400000	/* vnode is part of virt swap device */
+#define VREPLICABLE	  0x800000	/* Vnode can have replicated pages */
+#define VNONREPLICABLE	 0x1000000	/* Vnode has writers. Don't replicate */
+#define VDOCMP		 0x2000000	/* Vnode has special VOP_CMP impl. */
+#define VSHARE		 0x4000000	/* vnode part of global cache	*/
+					/* VSHARE applies to local cell only */
+#define VFRLOCKS	 0x8000000	/* vnode has FR locks applied	*/
+#define VENF_LOCKING	0x10000000	/* enf. mode FR locking in effect */
+#define VOPLOCK		0x20000000	/* oplock set on the vnode	*/
+#define VPURGE		0x40000000	/* In the linux 'put' thread	*/
+
+typedef enum vrwlock	{ VRWLOCK_NONE, VRWLOCK_READ,
+			  VRWLOCK_WRITE, VRWLOCK_WRITE_DIRECT,
+			  VRWLOCK_TRY_READ, VRWLOCK_TRY_WRITE } vrwlock_t;
+
+/*
+ * Return values for VOP_INACTIVE.  A return value of
+ * VN_INACTIVE_NOCACHE implies that the file system behavior
+ * has disassociated its state and bhv_desc_t from the vnode.
+ */
+#define VN_INACTIVE_CACHE	0
+#define VN_INACTIVE_NOCACHE	1
+
+/*
+ * Values for the cmd code given to VOP_VNODE_CHANGE.
+ */
+typedef enum vchange {
+	VCHANGE_FLAGS_FRLOCKS		= 0,
+	VCHANGE_FLAGS_ENF_LOCKING	= 1,
+	VCHANGE_FLAGS_TRUNCATED		= 2,
+	VCHANGE_FLAGS_PAGE_DIRTY	= 3,
+	VCHANGE_FLAGS_IOEXCL_COUNT	= 4
+} vchange_t;
+
+/*
+ * Macros for dealing with the behavior descriptor inside of the vnode.
+ */
+#define BHV_TO_VNODE(bdp)	((vnode_t *)BHV_VOBJ(bdp))
+#define BHV_TO_VNODE_NULL(bdp)	((vnode_t *)BHV_VOBJNULL(bdp))
+
+#define VNODE_TO_FIRST_BHV(vp)		(BHV_HEAD_FIRST(&(vp)->v_bh))
+#define VN_BHV_HEAD(vp)			((vn_bhv_head_t *)(&((vp)->v_bh)))
+#define VN_BHV_READ_LOCK(bhp)		BHV_READ_LOCK(bhp)
+#define VN_BHV_READ_UNLOCK(bhp)		BHV_READ_UNLOCK(bhp)
+#define VN_BHV_WRITE_LOCK(bhp)		BHV_WRITE_LOCK(bhp)
+#define VN_BHV_NOT_READ_LOCKED(bhp)	BHV_NOT_READ_LOCKED(bhp)
+#define VN_BHV_NOT_WRITE_LOCKED(bhp)	BHV_NOT_WRITE_LOCKED(bhp)
+#define vn_bhv_head_init(bhp,name)	bhv_head_init(bhp,name)
+#define vn_bhv_head_reinit(bhp)		bhv_head_reinit(bhp)
+#define vn_bhv_insert_initial(bhp,bdp)	bhv_insert_initial(bhp,bdp)
+#define vn_bhv_remove(bhp,bdp)		bhv_remove(bhp,bdp)
+#define vn_bhv_lookup(bhp,ops)		bhv_lookup(bhp,ops)
+#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops)
+
+#define v_fbhv		v_bh.bh_first	       /* first behavior */
+#define v_fops		v_bh.bh_first->bd_ops  /* ops for first behavior */
+
+
+union rval;
+struct uio;
+struct file;
+struct vattr;
+struct page_buf_bmap_s;
+struct attrlist_cursor_kern;
+
+typedef int	(*vop_open_t)(bhv_desc_t *, struct cred *);
+typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct file *, char *, size_t,
+				loff_t *, struct cred *);
+typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct file *, const char *, size_t,
+				loff_t *, struct cred *);
+typedef int	(*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, unsigned int, unsigned long);
+typedef int	(*vop_getattr_t)(bhv_desc_t *, struct vattr *, int,
+				struct cred *);
+typedef int	(*vop_setattr_t)(bhv_desc_t *, struct vattr *, int,
+				struct cred *);
+typedef int	(*vop_access_t)(bhv_desc_t *, int, struct cred *);
+typedef int	(*vop_lookup_t)(bhv_desc_t *, struct dentry *, vnode_t **,
+				int, vnode_t *, struct cred *);
+typedef int	(*vop_create_t)(bhv_desc_t *, struct dentry *, struct vattr *,
+				vnode_t **, struct cred *);
+typedef int	(*vop_remove_t)(bhv_desc_t *, struct dentry *, struct cred *);
+typedef int	(*vop_link_t)(bhv_desc_t *, vnode_t *, struct dentry *,
+				struct cred *);
+typedef int	(*vop_rename_t)(bhv_desc_t *, struct dentry *, vnode_t *,
+				struct dentry *, struct cred *);
+typedef int	(*vop_mkdir_t)(bhv_desc_t *, struct dentry *, struct vattr *,
+				vnode_t **, struct cred *);
+typedef int	(*vop_rmdir_t)(bhv_desc_t *, struct dentry *, struct cred *);
+typedef int	(*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *,
+				int *);
+typedef int	(*vop_symlink_t)(bhv_desc_t *, struct dentry *,
+				struct vattr *, char *,
+				vnode_t **, struct cred *);
+typedef int	(*vop_readlink_t)(bhv_desc_t *, struct uio *, struct cred *);
+typedef int	(*vop_fsync_t)(bhv_desc_t *, int, struct cred *, xfs_off_t, xfs_off_t);
+typedef int	(*vop_inactive_t)(bhv_desc_t *, struct cred *);
+typedef int	(*vop_fid2_t)(bhv_desc_t *, struct fid *);
+typedef int	(*vop_release_t)(bhv_desc_t *);
+typedef int	(*vop_rwlock_t)(bhv_desc_t *, vrwlock_t);
+typedef void	(*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t);
+typedef int	(*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, struct page_buf_bmap_s *, int *);
+typedef int	(*vop_strategy_t)(bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, struct page_buf_bmap_s *, int *);
+typedef int	(*vop_reclaim_t)(bhv_desc_t *);
+typedef int	(*vop_attr_get_t)(bhv_desc_t *, char *, char *, int *, int,
+				struct cred *);
+typedef int	(*vop_attr_set_t)(bhv_desc_t *, char *, char *, int, int,
+				struct cred *);
+typedef int	(*vop_attr_remove_t)(bhv_desc_t *, char *, int, struct cred *);
+typedef int	(*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
+				struct attrlist_cursor_kern *, struct cred *);
+typedef void	(*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int);
+typedef void	(*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t);
+typedef void	(*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+typedef void	(*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+typedef int	(*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int);
+typedef int	(*vop_iflush_t)(bhv_desc_t *, int);
+
+
+typedef struct vnodeops {
+	bhv_position_t	vn_position;	/* position within behavior chain */
+	vop_open_t		vop_open;
+	vop_read_t		vop_read;
+	vop_write_t		vop_write;
+	vop_ioctl_t		vop_ioctl;
+	vop_getattr_t		vop_getattr;
+	vop_setattr_t		vop_setattr;
+	vop_access_t		vop_access;
+	vop_lookup_t		vop_lookup;
+	vop_create_t		vop_create;
+	vop_remove_t		vop_remove;
+	vop_link_t		vop_link;
+	vop_rename_t		vop_rename;
+	vop_mkdir_t		vop_mkdir;
+	vop_rmdir_t		vop_rmdir;
+	vop_readdir_t		vop_readdir;
+	vop_symlink_t		vop_symlink;
+	vop_readlink_t		vop_readlink;
+	vop_fsync_t		vop_fsync;
+	vop_inactive_t		vop_inactive;
+	vop_fid2_t		vop_fid2;
+	vop_rwlock_t		vop_rwlock;
+	vop_rwunlock_t		vop_rwunlock;
+	vop_bmap_t		vop_bmap;
+	vop_strategy_t		vop_strategy;
+	vop_reclaim_t		vop_reclaim;
+	vop_attr_get_t		vop_attr_get;
+	vop_attr_set_t		vop_attr_set;
+	vop_attr_remove_t	vop_attr_remove;
+	vop_attr_list_t		vop_attr_list;
+	vop_link_removed_t	vop_link_removed;
+	vop_vnode_change_t	vop_vnode_change;
+	vop_ptossvp_t		vop_tosspages;
+	vop_pflushinvalvp_t	vop_flushinval_pages;
+	vop_pflushvp_t		vop_flush_pages;
+	vop_release_t		vop_release;
+	vop_iflush_t		vop_iflush;
+} vnodeops_t;
+
+/*
+ * VOP's.
+ */
+#define _VOP_(op, vp)	(*((vnodeops_t *)(vp)->v_fops)->op)
+
+/*
+ * Be careful with VOP_OPEN, since we're holding the chain lock on the
+ * original vnode and the VOP_OPEN semantics allow a new vnode to be returned
+ * in vpp. The practice of passing &vp for vpp just doesn't work.
+ */
+#define VOP_READ(vp,file,buf,size,offset,cr,rv)				\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,buf,size,offset,cr); \
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_WRITE(vp,file,buf,size,offset,cr,rv)			\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,buf,size,offset,cr);\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_BMAP(vp,of,sz,rw,cr,b,n,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,cr,b,n);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_STRATEGY(vp,of,sz,rw,cr,b,n,rv)				\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_strategy, vp)((vp)->v_fbhv,of,sz,rw,cr,b,n);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_OPEN(vp, cr, rv)						\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr);			\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_GETATTR(vp, vap, f, cr, rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_SETATTR(vp, vap, f, cr, rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_ACCESS(vp, mode, cr, rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv)				\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_CREATE(dvp,d,vap,vpp,cr,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(dvp)->v_bh);					\
+	rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr);	\
+	VN_BHV_READ_UNLOCK(&(dvp)->v_bh);				\
+}
+#define VOP_REMOVE(dvp,d,cr,rv)						\
+{									\
+	VN_BHV_READ_LOCK(&(dvp)->v_bh);					\
+	rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr);		\
+	VN_BHV_READ_UNLOCK(&(dvp)->v_bh);				\
+}
+#define VOP_LINK(tdvp,fvp,d,cr,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(tdvp)->v_bh);				\
+	rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr);		\
+	VN_BHV_READ_UNLOCK(&(tdvp)->v_bh);				\
+}
+#define VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv)				\
+{									\
+	VN_BHV_READ_LOCK(&(fvp)->v_bh);					\
+	rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr);	\
+	VN_BHV_READ_UNLOCK(&(fvp)->v_bh);				\
+}
+#define VOP_MKDIR(dp,d,vap,vpp,cr,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(dp)->v_bh);					\
+	rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr);		\
+	VN_BHV_READ_UNLOCK(&(dp)->v_bh);				\
+}
+#define VOP_RMDIR(dp,d,cr,rv)						\
+{									\
+	VN_BHV_READ_LOCK(&(dp)->v_bh);					\
+	rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr);			\
+	VN_BHV_READ_UNLOCK(&(dp)->v_bh);				\
+}
+#define VOP_READDIR(vp,uiop,cr,eofp,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv)				\
+{									\
+	VN_BHV_READ_LOCK(&(dvp)->v_bh);					\
+	rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr); \
+	VN_BHV_READ_UNLOCK(&(dvp)->v_bh);				\
+}
+#define VOP_READLINK(vp,uiop,cr,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,cr);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_FSYNC(vp,f,cr,b,e,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_INACTIVE(vp, cr, rv)					\
+{	/* vnode not reference-able, so no need to lock chain */	\
+	rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr);			\
+}
+#define VOP_RELEASE(vp, rv)						\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_release, vp)((vp)->v_fbhv);			\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_FID2(vp, fidp, rv)						\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp);			\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_RWLOCK(vp,i)						\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	(void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i);			\
+	/* "allow" is done by rwunlock */				\
+}
+#define VOP_RWLOCK_TRY(vp,i)						\
+	_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
+
+#define VOP_RWUNLOCK(vp,i)						\
+{	/* "prevent" was done by rwlock */				\
+	(void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i);			\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_RECLAIM(vp, rv)						\
+{	/* vnode not reference-able, so no need to lock chain */	\
+	rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv);			\
+}
+#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv)		\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred); \
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv)		\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred); \
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_ATTR_REMOVE(vp, name, flags, cred, rv)			\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv)		\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred);\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_LINK_REMOVED(vp, dvp, linkzero)				\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	(void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero); \
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_VNODE_CHANGE(vp, cmd, val)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	(void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+/*
+ * These are page cache functions that now go through VOPs.
+ * 'last' parameter is unused and left in for IRIX compatibility
+ */
+#define VOP_TOSS_PAGES(vp, first, last, fiopt)				\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	_VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+/*
+ * 'last' parameter is unused and left in for IRIX compatibility
+ */
+#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt)			\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	_VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt); \
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+/*
+ * 'last' parameter is unused and left in for IRIX compatibility
+ */
+#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv)		\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt);\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_IOCTL(vp, inode, filp, cmd, arg, rv)			\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,cmd,arg);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_IFLUSH(vp, flags, rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+
+/*
+ * Flags for VOP_IFLUSH call
+ */
+
+#define FLUSH_SYNC		1	/* wait for flush to complete	*/
+#define FLUSH_INODE		2	/* flush the inode itself	*/
+#define FLUSH_LOG		4	/* force the last log entry for
+					 * this inode out to disk	*/
+
+/*
+ * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and
+ *	VOP_FLUSH_PAGES.
+ */
+#define FI_NONE			0	/* none */
+#define FI_REMAPF		1	/* Do a remapf prior to the operation */
+#define FI_REMAPF_LOCKED	2	/* Do a remapf prior to the operation.
+					   Prevent VM access to the pages until
+					   the operation completes. */
+
+/*
+ * Vnode attributes.  va_mask indicates those attributes the caller
+ * wants to set (setattr) or extract (getattr).
+ */
+typedef struct vattr {
+	int		va_mask;	/* bit-mask of attributes */
+	vtype_t		va_type;	/* vnode type (for create) */
+	mode_t		va_mode;	/* file access mode */
+	uid_t		va_uid;		/* owner user id */
+	gid_t		va_gid;		/* owner group id */
+	dev_t		va_fsid;	/* file system id (dev for now) */
+	xfs_ino_t	va_nodeid;	/* node id */
+	nlink_t		va_nlink;	/* number of references to file */
+	xfs_off_t	va_size;	/* file size in bytes */
+	timespec_t	va_atime;	/* time of last access */
+	timespec_t	va_mtime;	/* time of last modification */
+	timespec_t	va_ctime;	/* time file ``created'' */
+	dev_t		va_rdev;	/* device the file represents */
+	u_long		va_blksize;	/* fundamental block size */
+	__int64_t	va_nblocks;	/* # of blocks allocated */
+	u_long		va_vcode;	/* version code */
+	u_long		va_xflags;	/* random extended file flags */
+	u_long		va_extsize;	/* file extent size */
+	u_long		va_nextents;	/* number of extents in file */
+	u_long		va_anextents;	/* number of attr extents in file */
+	int		va_projid;	/* project id */
+	u_int		va_gencount;	/* object generation count */
+} vattr_t;
+
+/*
+ * setattr or getattr attributes
+ */
+#define AT_TYPE		0x00000001
+#define AT_MODE		0x00000002
+#define AT_UID		0x00000004
+#define AT_GID		0x00000008
+#define AT_FSID		0x00000010
+#define AT_NODEID	0x00000020
+#define AT_NLINK	0x00000040
+#define AT_SIZE		0x00000080
+#define AT_ATIME	0x00000100
+#define AT_MTIME	0x00000200
+#define AT_CTIME	0x00000400
+#define AT_RDEV		0x00000800
+#define AT_BLKSIZE	0x00001000
+#define AT_NBLOCKS	0x00002000
+#define AT_VCODE	0x00004000
+#define AT_MAC		0x00008000
+#define AT_UPDATIME	0x00010000
+#define AT_UPDMTIME	0x00020000
+#define AT_UPDCTIME	0x00040000
+#define AT_ACL		0x00080000
+#define AT_CAP		0x00100000
+#define AT_INF		0x00200000
+#define AT_XFLAGS	0x00400000
+#define AT_EXTSIZE	0x00800000
+#define AT_NEXTENTS	0x01000000
+#define AT_ANEXTENTS	0x02000000
+#define AT_PROJID	0x04000000
+#define AT_SIZE_NOPERM	0x08000000
+#define AT_GENCOUNT	0x10000000
+
+#define AT_ALL	(AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\
+		AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|\
+		AT_BLKSIZE|AT_NBLOCKS|AT_VCODE|AT_MAC|AT_ACL|AT_CAP|\
+		AT_INF|AT_XFLAGS|AT_EXTSIZE|AT_NEXTENTS|AT_ANEXTENTS|\
+		AT_PROJID|AT_GENCOUNT)
+
+#define AT_STAT (AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\
+		AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|AT_BLKSIZE|\
+		AT_NBLOCKS|AT_PROJID)
+
+#define AT_TIMES (AT_ATIME|AT_MTIME|AT_CTIME)
+
+#define AT_UPDTIMES (AT_UPDATIME|AT_UPDMTIME|AT_UPDCTIME)
+
+#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
+		 AT_BLKSIZE|AT_NBLOCKS|AT_VCODE|AT_NEXTENTS|AT_ANEXTENTS|\
+		 AT_GENCOUNT)
+
+#define VREAD		00400
+#define VWRITE		00200
+#define VEXEC		00100
+#define VSGID		02000		/* set group id on execution */
+#define MODEMASK	07777		/* mode bits plus permission bits */
+
+/*
+ * Check whether mandatory file locking is enabled.
+ */
+#define MANDLOCK(vp, mode)	\
+	((vp)->v_type == VREG && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
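+
+/*
+ * For example, a VREG vnode with mode 02644 (setgid on, group execute
+ * off) satisfies MANDLOCK, while 02755 does not: group execute being
+ * set means the setgid bit carries its normal meaning instead of
+ * enabling mandatory locking.
+ */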
+
+extern void	vn_init(void);
+extern int	vn_wait(struct vnode *);
+extern vnode_t	*vn_initialize(struct inode *);
+
+/*
+ * Acquiring and invalidating vnodes:
+ *
+ *	if (vn_get(vp, version, 0))
+ *		...;
+ *	vn_purge(vp, version);
+ *
+ * vn_get and vn_purge must be called with vmap_t arguments, sampled
+ * while a lock that the vnode's VOP_RECLAIM function acquires is
+ * held, to ensure that the vnode sampled with the lock held isn't
+ * recycled (VOP_RECLAIMed) or deallocated between the release of the lock
+ * and the subsequent vn_get or vn_purge.
+ */
+
+/*
+ * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
+ */
+typedef struct vnode_map {
+	vfs_t		*v_vfsp;
+	vnumber_t	v_number;		/* in-core vnode number */
+	xfs_ino_t	v_ino;			/* inode #	*/
+} vmap_t;
+
+#define VMAP(vp, ip, vmap)	{(vmap).v_vfsp	 = (vp)->v_vfsp,	\
+				 (vmap).v_number = (vp)->v_number,	\
+				 (vmap).v_ino	 = (ip)->i_ino; }
+extern void	vn_purge(struct vnode *, vmap_t *);
+extern vnode_t	*vn_get(struct vnode *, vmap_t *);
+extern int	vn_revalidate(struct vnode *, int);
+extern void	vn_remove(struct vnode *);
+
+static inline int vn_count(struct vnode *vp)
+{
+	return atomic_read(&LINVFS_GET_IP(vp)->i_count);
+}
+
+/*
+ * Vnode reference counting functions (and macros for compatibility).
+ */
+extern vnode_t	*vn_hold(struct vnode *);
+extern void	vn_rele(struct vnode *);
+
+#if defined(CONFIG_XFS_VNODE_TRACING)
+
+#define VN_HOLD(vp)		\
+	((void)vn_hold(vp), \
+	  vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address))
+#define VN_RELE(vp)		\
+	  (vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address), \
+	   iput(LINVFS_GET_IP(vp)))
+
+#else	/* ! (defined(CONFIG_XFS_VNODE_TRACING)) */
+
+#define VN_HOLD(vp)		((void)vn_hold(vp))
+#define VN_RELE(vp)		(iput(LINVFS_GET_IP(vp)))
+
+#endif	/* ! (defined(CONFIG_XFS_VNODE_TRACING) */
+
+/*
+ * Vnode spinlock manipulation.
+ */
+#define VN_FLAGSET(vp,b)	vn_flagset(vp,b)
+#define VN_FLAGCLR(vp,b)	vn_flagclr(vp,b)
+
+static __inline__ void vn_flagset(struct vnode *vp, uint flag)
+{
+	spin_lock(&vp->v_lock);
+	vp->v_flag |= flag;
+	spin_unlock(&vp->v_lock);
+}
+
+static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
+{
+	spin_lock(&vp->v_lock);
+	vp->v_flag &= ~flag;
+	spin_unlock(&vp->v_lock);
+}
+
+/*
+ * Some useful predicates.
+ */
+#define VN_MAPPED(vp)	((LINVFS_GET_IP(vp)->i_mapping->i_mmap != NULL) || \
+			 (LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared != NULL))
+#define VN_CACHED(vp)	(LINVFS_GET_IP(vp)->i_mapping->nrpages)
+#define VN_DIRTY(vp)	(!list_empty(&(LINVFS_GET_IP(vp)->i_dirty_data_buffers)))
+#define VMODIFY(vp)	{ VN_FLAGSET(vp, VMODIFIED); \
+			mark_inode_dirty(LINVFS_GET_IP(vp)); }
+#define VUNMODIFY(vp)	VN_FLAGCLR(vp, VMODIFIED)
+
+/*
+ * Flags to VOP_SETATTR/VOP_GETATTR.
+ */
+#define ATTR_UTIME	0x01	/* non-default utime(2) request */
+#define ATTR_EXEC	0x02	/* invocation from exec(2) */
+#define ATTR_COMM	0x04	/* yield common vp attributes */
+#define ATTR_DMI	0x08	/* invocation from a DMI function */
+#define ATTR_LAZY	0x80	/* set/get attributes lazily */
+#define ATTR_NONBLOCK	0x100	/* return EAGAIN if operation would block */
+#define ATTR_NOLOCK	0x200	/* Don't grab any conflicting locks */
+#define ATTR_NOSIZETOK	0x400	/* Don't get the DVN_SIZE_READ token */
+
+/*
+ * Flags to VOP_FSYNC and VOP_RECLAIM.
+ */
+#define FSYNC_NOWAIT	0	/* asynchronous flush */
+#define FSYNC_WAIT	0x1	/* synchronous fsync or forced reclaim */
+#define FSYNC_INVAL	0x2	/* flush and invalidate cached data */
+#define FSYNC_DATA	0x4	/* synchronous fsync of data only */
+
+#if (defined(CONFIG_XFS_VNODE_TRACING))
+
+#define VNODE_TRACE_SIZE	16		/* number of trace entries */
+
+/*
+ * Tracing entries.
+ */
+#define VNODE_KTRACE_ENTRY	1
+#define VNODE_KTRACE_EXIT	2
+#define VNODE_KTRACE_HOLD	3
+#define VNODE_KTRACE_REF	4
+#define VNODE_KTRACE_RELE	5
+
+extern void vn_trace_entry(struct vnode *, char *, inst_t *);
+extern void vn_trace_exit(struct vnode *, char *, inst_t *);
+extern void vn_trace_hold(struct vnode *, char *, int, inst_t *);
+extern void vn_trace_ref(struct vnode *, char *, int, inst_t *);
+extern void vn_trace_rele(struct vnode *, char *, int, inst_t *);
+#define VN_TRACE(vp)		\
+	vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address)
+
+#else	/* ! (defined(CONFIG_XFS_VNODE_TRACING)) */
+
+#define vn_trace_entry(a,b,c)
+#define vn_trace_exit(a,b,c)
+#define vn_trace_hold(a,b,c,d)
+#define vn_trace_ref(a,b,c,d)
+#define vn_trace_rele(a,b,c,d)
+#define VN_TRACE(vp)
+
+#endif	/* ! (defined(CONFIG_XFS_VNODE_TRACING)) */
+
+#endif	/* __XFS_VNODE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/Makefile linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/Makefile
--- linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/Makefile	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/Makefile	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,56 @@
+#
+# Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+# Makefile for the linux pagebuf routines.
+#
+
+# Debug options:
+#	-DPAGEBUF_TRACKING
+#	-DPAGEBUF_LOCKING
+#	-DPAGEBUF_DEBUG
+#	-DPAGEBUF_TRACE
+ifeq ($(CONFIG_PAGEBUF_DEBUG),y)
+	EXTRA_CFLAGS += -g -DSTATIC="" \
+			-DPAGEBUF_DEBUG -DPAGEBUF_TRACKING -DPAGEBUF_TRACE
+endif
+EXTRA_CFLAGS += -I..
+
+O_TARGET			:= pagebuf.o
+
+ifneq ($(MAKECMDGOALS),modules_install)
+  obj-m				:= $(O_TARGET)
+endif
+
+export-objs			+= page_buf.o
+obj-y				+= page_buf.o \
+				   page_buf_locking.o
+
+include $(TOPDIR)/Rules.make
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/Makefile.in linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/Makefile.in
--- linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/Makefile.in	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/Makefile.in	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,50 @@
+#
+# Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+# Makefile for the linux pagebuf routines.
+#
+
+expsyms(page_buf.o)
+objlink(pagebuf.o page_buf.o page_buf_io.o page_buf_locking.o)
+# No select() for pagebuf.o in this directory.	It is a sub-component of XFS,
+# see fs/xfs/Makefile.in for the objlink.
+
+# Debug options:
+#	-DPAGEBUF_TRACKING
+#	-DPAGEBUF_LOCKING
+#	-DPAGEBUF_DEBUG
+#	-DPAGEBUF_TRACE
+ifsel(CONFIG_PAGEBUF_DEBUG)
+	extra_cflags_all(-g -DSTATIC="" -DPAGEBUF_DEBUG -DPAGEBUF_TRACKING -DPAGEBUF_TRACE)
+endif
+
+# FIXME: page_buf.c does #include <support/xxx.h>
+extra_cflags(page_buf.o $(src_includelist /fs/xfs))
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf.c linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,2524 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ *	page_buf.c
+ *
+ *	The page_buf module provides an abstract buffer cache model on top of
+ *	the Linux page cache.  Cached blocks for a file are hashed to the
+ *	inode for that file, and can be held dirty in delayed write mode in
+ *	the page cache.	 Cached metadata blocks for a file system are hashed
+ *	to the inode for the mounted device.  The page_buf module assembles
+ *	buffer (page_buf_t) objects on demand to aggregate such cached pages
+ *	for I/O.
+ *
+ *
+ *	Written by Steve Lord, Jim Mostek, Russell Cattelan
+ *		    and Rajagopal Ananthanarayanan ("ananth") at SGI.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/locks.h>
+#include <asm/softirq.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include <support/kmem.h>
+#include "page_buf_internal.h"
+
+#define SECTOR_SHIFT	9
+#define SECTOR_SIZE	(1<<SECTOR_SHIFT)
+#define SECTOR_MASK	(SECTOR_SIZE - 1)
+#define BN_ALIGN_MASK	((1 << (PAGE_CACHE_SHIFT - SECTOR_SHIFT)) - 1)
+
+#ifndef GFP_READAHEAD
+#define GFP_READAHEAD	0
+#endif
+
+/*
+ * Debug code
+ */
+
+#ifdef PAGEBUF_TRACE
+static	spinlock_t		pb_trace_lock = SPIN_LOCK_UNLOCKED;
+struct pagebuf_trace_buf	pb_trace;
+EXPORT_SYMBOL(pb_trace);
+EXPORT_SYMBOL(pb_trace_func);
+#define CIRC_INC(i)	(((i) + 1) & (PB_TRACE_BUFSIZE - 1))
+
+void
+pb_trace_func(
+	page_buf_t	*pb,
+	int		event,
+	void		*misc,
+	void		*ra)
+{
+	int		j;
+	unsigned long	flags;
+
+	if (!pb_params.p_un.debug) return;
+
+	if (ra == NULL) ra = (void *)__builtin_return_address(0);
+
+	spin_lock_irqsave(&pb_trace_lock, flags);
+	j = pb_trace.start;
+	pb_trace.start = CIRC_INC(j);
+	spin_unlock_irqrestore(&pb_trace_lock, flags);
+
+	pb_trace.buf[j].pb = (unsigned long) pb;
+	pb_trace.buf[j].event = event;
+	pb_trace.buf[j].flags = pb->pb_flags;
+	pb_trace.buf[j].hold = pb->pb_hold.counter;
+	pb_trace.buf[j].lock_value = PBP(pb)->pb_sema.count.counter;
+	pb_trace.buf[j].task = (void *)current;
+	pb_trace.buf[j].misc = misc;
+	pb_trace.buf[j].ra = ra;
+	pb_trace.buf[j].offset = pb->pb_file_offset;
+	pb_trace.buf[j].size = pb->pb_buffer_length;
+}
+#endif	/* PAGEBUF_TRACE */
+
+#ifdef PAGEBUF_TRACKING
+#define MAX_PB	10000
+page_buf_t	*pb_array[MAX_PB];
+EXPORT_SYMBOL(pb_array);
+
+void
+pb_tracking_get(
+	page_buf_t	*pb)
+{
+	int		i;
+
+	for (i = 0; (i < MAX_PB) && (pb_array[i] != 0); i++) { }
+	if (i == MAX_PB)
+		printk("pb 0x%p not recorded in pb_array\n", pb);
+	else {
+		//printk("pb_get 0x%p in pb_array[%d]\n", pb, i);
+		pb_array[i] = pb;
+	}
+}
+
+void
+pb_tracking_free(
+	page_buf_t	*pb)
+{
+	int		i;
+
+	for (i = 0; (i < MAX_PB) && (pb_array[i] != pb); i++) { }
+	if (i < MAX_PB) {
+		//printk("pb_free 0x%p from pb_array[%d]\n", pb, i);
+		pb_array[i] = NULL;
+	} else
+		printk("Freed unmonitored pagebuf 0x%p\n", pb);
+}
+#else
+#define pb_tracking_get(pb)	do { } while (0)
+#define pb_tracking_free(pb)	do { } while (0)
+#endif	/* PAGEBUF_TRACKING */
+
+/*
+ *	File wide globals
+ */
+
+STATIC kmem_cache_t *pagebuf_cache;
+STATIC pagebuf_daemon_t *pb_daemon;
+STATIC struct list_head pagebuf_iodone_tq[NR_CPUS];
+STATIC wait_queue_head_t pagebuf_iodone_wait[NR_CPUS];
+
+/*
+ *	For pre-allocated buffer head pool
+ */
+
+#define NR_RESERVED_BH	64
+static wait_queue_head_t	pb_resv_bh_wait;
+static spinlock_t		pb_resv_bh_lock = SPIN_LOCK_UNLOCKED;
+struct buffer_head		*pb_resv_bh = NULL;	/* list of bh */
+int				pb_resv_bh_cnt = 0;	/* # of bh available */
+
+STATIC void pagebuf_daemon_wakeup(int);
+STATIC int _pagebuf_segment_apply(page_buf_t *);
+
+/*
+ * Pagebuf module configuration parameters, exported via
+ * /proc/sys/vm/pagebuf
+ */
+
+unsigned long pagebuf_min[P_PARAM] = {	HZ/2,	1*HZ, 0, 0 };
+unsigned long pagebuf_max[P_PARAM] = { HZ*30, HZ*300, 1, 1 };
+
+pagebuf_param_t pb_params = {{ HZ, 15 * HZ, 0, 0 }};
+
+/*
+ * Pagebuf statistics variables
+ */
+
+struct pbstats pbstats;
+
+/*
+ * Pagebuf allocation / freeing.
+ */
+
+#define pb_to_gfp(flags) \
+	(((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \
+	 ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)
+
+#define pagebuf_allocate(flags) \
+	kmem_cache_alloc(pagebuf_cache, pb_to_gfp(flags))
+#define pagebuf_deallocate(pb) \
+	kmem_cache_free(pagebuf_cache, (pb));
+
+/*
+ * Pagebuf hashing
+ */
+
+#define NBITS	5
+#define NHASH	(1<<NBITS)
+
+typedef struct {
+	struct list_head	pb_hash;
+	int			pb_count;
+	spinlock_t		pb_hash_lock;
+} pb_hash_t;
+
+STATIC pb_hash_t	pbhash[NHASH];
+#define pb_hash(pb)	&pbhash[pb->pb_hash_index]
+
+STATIC int
+_bhash(
+	dev_t		dev,
+	loff_t		base)
+{
+	int		bit, hval;
+
+	base >>= 9;
+	/*
+	 * dev_t is 16 bits, loff_t is always 64 bits
+	 */
+	base ^= dev;
+	for (bit = hval = 0; base != 0 && bit < sizeof(base) * 8; bit += NBITS) {
+		hval ^= (int)base & (NHASH-1);
+		base >>= NBITS;
+	}
+	return hval;
+}
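+
+/*
+ * Worked example (illustrative values): dev = 0x13, base = 16384.
+ * base >> 9 = 0x20, and 0x20 ^ 0x13 = 0x33; folding NBITS at a time,
+ * hval = 0x33 & 0x1f = 0x13, then hval ^= (0x33 >> 5), i.e.
+ * 0x13 ^ 0x01 = 0x12, so the buffer lands in bucket 0x12 of the
+ * NHASH (32) buckets.
+ */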
+
+/*
+ * Mapping of multi-page buffers into contiguous virtual space
+ */
+
+STATIC void *pagebuf_mapout_locked(page_buf_t *);
+
+STATIC	spinlock_t		as_lock = SPIN_LOCK_UNLOCKED;
+typedef struct a_list {
+	void	*vm_addr;
+	struct a_list	*next;
+} a_list_t;
+STATIC	a_list_t	*as_free_head;
+STATIC	int		as_list_len;
+
+STATIC void
+free_address(
+	void		*addr)
+{
+	a_list_t	*aentry;
+
+	spin_lock(&as_lock);
+	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
+	aentry->next = as_free_head;
+	aentry->vm_addr = addr;
+	as_free_head = aentry;
+	as_list_len++;
+	spin_unlock(&as_lock);
+}
+
+STATIC void
+purge_addresses(void)
+{
+	a_list_t	*aentry, *old;
+
+	if (as_free_head == NULL) return;
+
+	spin_lock(&as_lock);
+	aentry = as_free_head;
+	as_free_head = NULL;
+	as_list_len = 0;
+	spin_unlock(&as_lock);
+
+	while ((old = aentry) != NULL) {
+		vunmap(aentry->vm_addr);
+		aentry = aentry->next;
+		kfree(old);
+	}
+}
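+
+/*
+ * The free list above defers vunmap() calls: mappings are torn down in
+ * contexts that hold a spinlock (see pagebuf_mapout_locked below), so
+ * the addresses are parked here and reclaimed later, either from the
+ * pagebuf daemon or when _pagebuf_lookup_pages sees the list grow.
+ */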
+
+/*
+ *	Locking model:
+ *
+ *	Buffers associated with inodes for which buffer locking
+ *	is not enabled are not protected by semaphores, and are
+ *	assumed to be exclusively owned by the caller.	There is
+ *	spinlock in the buffer, for use by the caller when concurrent
+ *	access is possible.
+ */
+
+/*
+ *	Internal pagebuf object manipulation
+ */
+
+STATIC void
+_pagebuf_initialize(
+	page_buf_t		*pb,
+	pb_target_t		*target,
+	loff_t			range_base,
+	size_t			range_length,
+	page_buf_flags_t	flags)
+{
+	/*
+	 * We don't want certain flags to appear in pb->pb_flags.
+	 */
+	flags &= ~(PBF_LOCK|PBF_ENTER_PAGES|PBF_MAPPED);
+	flags &= ~(PBF_DONT_BLOCK|PBF_READ_AHEAD);
+
+	pb_tracking_get(pb);
+
+	memset(pb, 0, sizeof(page_buf_private_t));
+	atomic_set(&pb->pb_hold, 1);
+	init_MUTEX_LOCKED(&pb->pb_iodonesema);
+	INIT_LIST_HEAD(&pb->pb_list);
+	INIT_LIST_HEAD(&pb->pb_hash_list);
+	init_MUTEX_LOCKED(&PBP(pb)->pb_sema); /* held, no waiters */
+	PB_SET_OWNER(pb);
+	pb->pb_target = target;
+	pb->pb_file_offset = range_base;
+	/*
+	 * Set buffer_length and count_desired to the same value initially.
+	 * IO routines should use count_desired, which will be the same in
+	 * most cases but may be reset (e.g. XFS recovery).
+	 */
+	pb->pb_buffer_length = pb->pb_count_desired = range_length;
+	pb->pb_flags = flags | PBF_NONE;
+	pb->pb_bn = PAGE_BUF_DADDR_NULL;
+	atomic_set(&PBP(pb)->pb_pin_count, 0);
+	init_waitqueue_head(&PBP(pb)->pb_waiters);
+
+	PB_STATS_INC(pbstats.pb_create);
+	PB_TRACE(pb, PB_TRACE_REC(get), target);
+}
+
+/*
+ * Allocate a page array capable of holding a specified number
+ * of pages, and point the page buf at it.
+ */
+STATIC int
+_pagebuf_get_pages(
+	page_buf_t		*pb,
+	int			page_count,
+	int			flags)
+{
+	int			gpf_mask = pb_to_gfp(flags);
+
+	/* Make sure that we have a page list */
+	if (pb->pb_pages == NULL) {
+		pb->pb_offset = page_buf_poff(pb->pb_file_offset);
+		pb->pb_page_count = page_count;
+		if (page_count <= PB_PAGES) {
+			pb->pb_pages = pb->pb_page_array;
+		} else {
+			pb->pb_pages = kmalloc(sizeof(struct page *) *
+					page_count, gpf_mask);
+			if (pb->pb_pages == NULL)
+				return -ENOMEM;
+		}
+		memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
+	}
+	return 0;
+}
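+
+/*
+ * Sizing note (illustrative): with 4K pages, a 6K buffer starting 1K
+ * into its first page touches bytes 1K..7K and so needs a two-entry
+ * page array; callers derive page_count from the rounded difference
+ * page_buf_btoc(end) - page_buf_btoct(start), as in
+ * _pagebuf_lookup_pages below.
+ */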
+
+/*
+ * Walk a pagebuf releasing all the pages contained within it.
+ */
+STATIC inline void
+_pagebuf_freepages(
+	page_buf_t		*pb)
+{
+	int			buf_index;
+
+	for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) {
+		struct page	*page = pb->pb_pages[buf_index];
+
+		if (page) {
+			pb->pb_pages[buf_index] = NULL;
+			page_cache_release(page);
+		}
+	}
+
+	if (pb->pb_pages != pb->pb_page_array)
+		kfree(pb->pb_pages);
+}
+
+/*
+ *	_pagebuf_free_object
+ *
+ *	_pagebuf_free_object releases the contents of the specified buffer.
+ *	The modification state of any associated pages is left unchanged.
+ */
+void
+_pagebuf_free_object(
+	pb_hash_t	*hash,	/* hash bucket for buffer	*/
+	page_buf_t	*pb)	/* buffer to deallocate		*/
+{
+	int		pb_flags = pb->pb_flags;
+
+	PB_TRACE(pb, PB_TRACE_REC(free_obj), 0);
+	pb->pb_flags |= PBF_FREED;
+
+	if (hash) {
+		if (!list_empty(&pb->pb_hash_list)) {
+			hash->pb_count--;
+			list_del_init(&pb->pb_hash_list);
+		}
+		spin_unlock(&hash->pb_hash_lock);
+	}
+
+	if (!(pb_flags & PBF_FREED)) {
+		/* release any virtual mapping */
+		if (pb->pb_flags & _PBF_ADDR_ALLOCATED) {
+			void *vaddr = pagebuf_mapout_locked(pb);
+			if (vaddr) {
+				free_address(vaddr);
+			}
+		}
+
+		if (pb->pb_flags & _PBF_MEM_ALLOCATED) {
+			if (pb->pb_pages) {
+				/* release the pages in the address list */
+				if (pb->pb_pages[0] &&
+				    PageSlab(pb->pb_pages[0])) {
+					/*
+					 * This came from the slab
+					 * allocator; free it as such.
+					 */
+					kfree(pb->pb_addr);
+				} else {
+					_pagebuf_freepages(pb);
+				}
+
+				pb->pb_pages = NULL;
+			}
+			pb->pb_flags &= ~_PBF_MEM_ALLOCATED;
+		}
+	}
+
+	pb_tracking_free(pb);
+	pagebuf_deallocate(pb);
+}
+
+/*
+ *	_pagebuf_lookup_pages
+ *
+ *	_pagebuf_lookup_pages finds all pages which match the buffer
+ *	in question and the range of file offsets supplied,
+ *	and builds the page list for the buffer, if the
+ *	page list is not already formed or if not all of the pages are
+ *	already in the list. Invalid pages (pages which have not yet been
+ *	read in from disk) are assigned for any pages which are not found.
+ */
+STATIC int
+_pagebuf_lookup_pages(
+	page_buf_t		*pb,
+	struct address_space	*aspace,
+	page_buf_flags_t	flags)
+{
+	loff_t			next_buffer_offset;
+	unsigned long		page_count, pi, index;
+	struct page		*page;
+	int			gfp_mask, retry_count = 5, rval = 0;
+	int			all_mapped, good_pages;
+	size_t			blocksize;
+
+	/* For pagebufs where we want to map an address, do not use
+	 * highmem pages - so that we do not need to use kmap resources
+	 * to access the data.
+	 *
+	 * For pages where the caller has indicated there may be resource
+	 * contention (e.g. called from a transaction) do not flush
+	 * delalloc pages to obtain memory.
+	 */
+
+	if (flags & PBF_READ_AHEAD) {
+		gfp_mask = GFP_READAHEAD;
+		retry_count = 0;
+	} else if (flags & PBF_DONT_BLOCK) {
+		gfp_mask = GFP_NOFS;
+	} else if (flags & PBF_MAPPABLE) {
+		gfp_mask = GFP_KERNEL;
+	} else {
+		gfp_mask = GFP_HIGHUSER;
+	}
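+
+	/* To summarize the above:
+	 *	PBF_READ_AHEAD	- GFP_READAHEAD, no retries on failure
+	 *	PBF_DONT_BLOCK	- GFP_NOFS, no filesystem recursion
+	 *	PBF_MAPPABLE	- GFP_KERNEL, no highmem so no kmap needed
+	 *	otherwise	- GFP_HIGHUSER, highmem pages allowed
+	 */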
+
+	next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length;
+
+	good_pages = page_count = (page_buf_btoc(next_buffer_offset) -
+				   page_buf_btoct(pb->pb_file_offset));
+
+	if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) {
+		/* Bring pages forward in cache */
+		for (pi = 0; pi < page_count; pi++) {
+			mark_page_accessed(pb->pb_pages[pi]);
+		}
+		if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) {
+			all_mapped = 1;
+			goto mapit;
+		}
+		return 0;
+	}
+
+	/* Ensure pb_pages field has been initialised */
+	rval = _pagebuf_get_pages(pb, page_count, flags);
+	if (rval)
+		return rval;
+
+	rval = pi = 0;
+	blocksize = pb->pb_target->pbr_blocksize;
+
+	/* Enter the pages in the page list */
+	index = (pb->pb_file_offset - pb->pb_offset) >> PAGE_CACHE_SHIFT;
+	for (all_mapped = 1; pi < page_count; pi++, index++) {
+		if (pb->pb_pages[pi] == 0) {
+		      retry:
+			page = find_or_create_page(aspace, index, gfp_mask);
+			if (!page) {
+				if (--retry_count > 0) {
+					PB_STATS_INC(pbstats.pb_page_retries);
+					pagebuf_daemon_wakeup(1);
+					current->state = TASK_UNINTERRUPTIBLE;
+					schedule_timeout(10);
+					goto retry;
+				}
+				rval = -ENOMEM;
+				all_mapped = 0;
+				continue;
+			}
+			PB_STATS_INC(pbstats.pb_page_found);
+			mark_page_accessed(page);
+			pb->pb_pages[pi] = page;
+		} else {
+			page = pb->pb_pages[pi];
+			lock_page(page);
+		}
+
+		/* If we need to do I/O on a page record the fact */
+		if (!Page_Uptodate(page)) {
+			good_pages--;
+			if ((blocksize == PAGE_CACHE_SIZE) &&
+			    (flags & PBF_READ))
+				pb->pb_locked = 1;
+		}
+	}
+
+	if (!pb->pb_locked) {
+		for (pi = 0; pi < page_count; pi++) {
+			unlock_page(pb->pb_pages[pi]);
+		}
+	}
+
+mapit:
+	pb->pb_flags |= _PBF_MEM_ALLOCATED;
+	if (all_mapped) {
+		pb->pb_flags |= _PBF_ALL_PAGES_MAPPED;
+
+		/* A single page buffer is always mappable */
+		if (page_count == 1) {
+			pb->pb_addr = (caddr_t)
+			    	page_address(pb->pb_pages[0]) + pb->pb_offset;
+			pb->pb_flags |= PBF_MAPPED;
+		} else if (flags & PBF_MAPPED) {
+			if (as_list_len > 64)
+				purge_addresses();
+			pb->pb_addr = vmap(pb->pb_pages, page_count);
+			if (!pb->pb_addr)
+				BUG();
+			pb->pb_addr += pb->pb_offset;
+			pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED;
+		}
+	}
+	/* If some pages were found with data in them
+	 * we are not in PBF_NONE state.
+	 */
+	if (good_pages != 0) {
+		pb->pb_flags &= ~(PBF_NONE);
+		if (good_pages != page_count) {
+			pb->pb_flags |= PBF_PARTIAL;
+		}
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(look_pg), good_pages);
+
+	return rval;
+}
+
+
+/*
+ *	Pre-allocation of a pool of buffer heads for use in
+ *	low-memory situations.
+ */
+
+/*
+ *	_pagebuf_prealloc_bh
+ *
+ *	Pre-allocate a pool of "count" buffer heads at startup.
+ *	Puts them on a list at "pb_resv_bh"
+ *	Returns number of bh actually allocated to pool.
+ */
+STATIC int
+_pagebuf_prealloc_bh(
+	int			count)
+{
+	struct buffer_head	*bh;
+	int			i;
+
+	for (i = 0; i < count; i++) {
+		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
+		if (!bh)
+			break;
+		bh->b_pprev = &pb_resv_bh;
+		bh->b_next = pb_resv_bh;
+		pb_resv_bh = bh;
+		pb_resv_bh_cnt++;
+	}
+	return i;
+}
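+
+/*
+ * Presumably invoked once at initialization time, along the lines of
+ * (sketch only; the real call site is elsewhere):
+ *
+ *	if (_pagebuf_prealloc_bh(NR_RESERVED_BH) < NR_RESERVED_BH)
+ *		printk("pagebuf: buffer head pool only partially filled\n");
+ */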
+
+/*
+ *	_pagebuf_get_prealloc_bh
+ *
+ *	Get one buffer head from our pre-allocated pool.
+ *	If pool is empty, sleep 'til one comes back in.
+ *	Returns aforementioned buffer head.
+ */
+STATIC struct buffer_head *
+_pagebuf_get_prealloc_bh(void)
+{
+	unsigned long		flags;
+	struct buffer_head	*bh = NULL;
+	struct task_struct	*tsk = current;
+	DECLARE_WAITQUEUE	(wait, tsk);
+
+	spin_lock_irqsave(&pb_resv_bh_lock, flags);
+
+	if (pb_resv_bh_cnt < 1) {
+
+		add_wait_queue(&pb_resv_bh_wait, &wait);
+		do {
+			run_task_queue(&tq_disk);
+			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+			spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
+			schedule();
+			spin_lock_irqsave(&pb_resv_bh_lock, flags);
+		} while (pb_resv_bh_cnt < 1);
+		tsk->state = TASK_RUNNING;
+		remove_wait_queue(&pb_resv_bh_wait, &wait);
+	}
+
+	if (pb_resv_bh_cnt < 1)
+		BUG();
+
+	bh = pb_resv_bh;
+
+	if (!bh)
+		BUG();
+
+	pb_resv_bh = bh->b_next;
+	bh->b_state = 0;
+	pb_resv_bh_cnt--;
+
+	spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
+
+	return bh;
+}
+
+/*
+ *	_pagebuf_free_bh
+ *
+ *	Take care of buffer heads that we're finished with.
+ *	Call this instead of just kmem_cache_free(bh_cachep, bh)
+ *	when you're done with a bh.
+ *
+ *	If our pre-allocated pool is full, just free the buffer head.
+ *	Otherwise, put it back in the pool, and wake up anybody
+ *	waiting for one.
+ */
+STATIC inline void
+_pagebuf_free_bh(
+	struct buffer_head	*bh)
+{
+	unsigned long		flags;
+
+	if (pb_resv_bh_cnt == NR_RESERVED_BH){
+		kmem_cache_free(bh_cachep, bh);
+	} else {
+		spin_lock_irqsave(&pb_resv_bh_lock, flags);
+
+		bh->b_pprev = &pb_resv_bh;
+		bh->b_next = pb_resv_bh;
+		pb_resv_bh = bh;
+		pb_resv_bh_cnt++;
+
+		if (waitqueue_active(&pb_resv_bh_wait)) {
+			wake_up(&pb_resv_bh_wait);
+		}
+
+		spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
+	}
+}
+
+/*
+ *	Finding and Reading Buffers
+ */
+
+/*
+ *	_pagebuf_find
+ *
+ *	Looks up, and creates if absent, a lockable buffer for
+ *	a given range of an inode.  The buffer is returned
+ *	locked.	 If other overlapping buffers exist, they are
+ *	released before the new buffer is created and locked,
+ *	which may imply that this call will block until those buffers
+ *	are unlocked.  No I/O is implied by this call.
+ */
+STATIC page_buf_t *
+_pagebuf_find(				/* find buffer for block	*/
+	pb_target_t		*target,/* target for block		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags,	/* PBF_TRYLOCK			*/
+	page_buf_t		*new_pb)/* newly allocated buffer	*/
+{
+	loff_t			range_base;
+	size_t			range_length;
+	int			hval;
+	pb_hash_t		*h;
+	struct list_head	*p;
+	page_buf_t		*pb;
+	int			not_locked;
+
+	range_base = (ioff << SECTOR_SHIFT);
+	range_length = (isize << SECTOR_SHIFT);
+
+	hval = _bhash(target->pbr_bdev->bd_dev, range_base);
+	h = &pbhash[hval];
+
+	spin_lock(&h->pb_hash_lock);
+	list_for_each(p, &h->pb_hash) {
+		pb = list_entry(p, page_buf_t, pb_hash_list);
+
+		if ((target == pb->pb_target) &&
+		    (pb->pb_file_offset == range_base) &&
+		    (pb->pb_buffer_length == range_length)) {
+			if (pb->pb_flags & PBF_FREED)
+				break;
+			/* If we look at something bring it to the
+			 * front of the list for next time
+			 */
+			list_del(&pb->pb_hash_list);
+			list_add(&pb->pb_hash_list, &h->pb_hash);
+			goto found;
+		}
+	}
+
+	/* No match found */
+	if (new_pb) {
+		_pagebuf_initialize(new_pb, target, range_base,
+				range_length, flags | _PBF_LOCKABLE);
+		new_pb->pb_hash_index = hval;
+		h->pb_count++;
+		list_add(&new_pb->pb_hash_list, &h->pb_hash);
+	} else {
+		PB_STATS_INC(pbstats.pb_miss_locked);
+	}
+
+	spin_unlock(&h->pb_hash_lock);
+	return (new_pb);
+
+found:
+	atomic_inc(&pb->pb_hold);
+	spin_unlock(&h->pb_hash_lock);
+
+	/* Attempt to get the semaphore without sleeping,
+	 * if this does not work then we need to drop the
+	 * spinlock and do a hard attempt on the semaphore.
+	 */
+	not_locked = down_trylock(&PBP(pb)->pb_sema);
+	if (not_locked) {
+		if (!(flags & PBF_TRYLOCK)) {
+			/* wait for buffer ownership */
+			PB_TRACE(pb, PB_TRACE_REC(get_lk), 0);
+			pagebuf_lock(pb);
+			PB_STATS_INC(pbstats.pb_get_locked_waited);
+		} else {
+			/* We asked for a trylock and failed, no need
+			 * to look at file offset and length here, we
+			 * know that this pagebuf at least overlaps our
+			 * pagebuf and is locked, therefore our buffer
+			 * either does not exist, or is this buffer
+			 */
+
+			pagebuf_rele(pb);
+			PB_STATS_INC(pbstats.pb_busy_locked);
+			return (NULL);
+		}
+	} else {
+		/* trylock worked */
+		PB_SET_OWNER(pb);
+	}
+
+	if (pb->pb_flags & PBF_STALE)
+		pb->pb_flags &= PBF_MAPPABLE | \
+				PBF_MAPPED | \
+				_PBF_LOCKABLE | \
+				_PBF_ALL_PAGES_MAPPED | \
+				_PBF_SOME_INVALID_PAGES | \
+				_PBF_ADDR_ALLOCATED | \
+				_PBF_MEM_ALLOCATED;
+	PB_TRACE(pb, PB_TRACE_REC(got_lk), 0);
+	PB_STATS_INC(pbstats.pb_get_locked);
+	return (pb);
+}
+
+
+/*
+ *	pagebuf_find
+ *
+ *	pagebuf_find returns a buffer matching the specified range of
+ *	data for the specified target, if any of the relevant blocks
+ *	are in memory.	The buffer may have unallocated holes, if
+ *	some, but not all, of the blocks are in memory.	 Even where
+ *	pages are present in the buffer, not all of every page may be
+ *	valid.	The file system may use pagebuf_segment to visit the
+ *	various segments of the buffer.
+ */
+page_buf_t *
+pagebuf_find(				/* find buffer for block	*/
+					/* if the block is in memory	*/
+	pb_target_t		*target,/* target for block		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
+{
+	return _pagebuf_find(target, ioff, isize, flags, NULL);
+}
+
+/*
+ *	pagebuf_get
+ *
+ *	pagebuf_get assembles a buffer covering the specified range.
+ *	Some or all of the blocks in the range may be valid.  The file
+ *	system may use pagebuf_segment to visit the various segments
+ *	of the buffer.	Storage in memory for all portions of the
+ *	buffer will be allocated, although backing storage may not be.
+ *	If PBF_READ is set in flags, pagebuf_iostart is also called to
+ *	read in any blocks not already valid in memory.
+ */
+page_buf_t *
+pagebuf_get(				/* allocate a buffer		*/
+	pb_target_t		*target,/* target for buffer 		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
+{
+	page_buf_t		*pb, *new_pb;
+	int			error;
+
+	new_pb = pagebuf_allocate(flags);
+	if (unlikely(!new_pb))
+		return (NULL);
+
+	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
+	if (pb != new_pb) {
+		pagebuf_deallocate(new_pb);
+		if (unlikely(!pb))
+			return (NULL);
+	}
+
+	PB_STATS_INC(pbstats.pb_get);
+
+	/* fill in any missing pages */
+	error = _pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping, flags);
+	if (unlikely(error)) {
+		pagebuf_free(pb);
+		return (NULL);
+	}
+
+	/*
+	 * Always fill in the block number now, the mapped cases can do
+	 * their own overlay of this later.
+	 */
+	pb->pb_bn = ioff;
+	pb->pb_count_desired = pb->pb_buffer_length;
+
+	if (flags & PBF_READ) {
+		if (PBF_NOT_DONE(pb)) {
+			PB_TRACE(pb, PB_TRACE_REC(get_read), flags);
+			PB_STATS_INC(pbstats.pb_get_read);
+			pagebuf_iostart(pb, flags);
+		} else if (flags & PBF_ASYNC) {
+			/*
+			 * Read ahead call which is already satisfied,
+			 * drop the buffer
+			 */
+			if (flags & (PBF_LOCK | PBF_TRYLOCK))
+				pagebuf_unlock(pb);
+			pagebuf_rele(pb);
+			return NULL;
+		} else {
+			/* We do not want read in the flags */
+			pb->pb_flags &= ~PBF_READ;
+		}
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(get_obj), flags);
+	return (pb);
+}
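+
+/*
+ * Typical synchronous metadata read (hypothetical caller; ioff and
+ * isize are in 512-byte sector units, see SECTOR_SHIFT):
+ *
+ *	pb = pagebuf_get(target, blkno, 8,
+ *			PBF_LOCK | PBF_READ | PBF_MAPPED | PBF_MAPPABLE);
+ *	if (pb) {
+ *		... use pb->pb_addr or pagebuf_offset(pb, 0) ...
+ *		pagebuf_unlock(pb);
+ *		pagebuf_rele(pb);
+ *	}
+ */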
+
+/*
+ * Create a pagebuf and populate it with pages from the address
+ * space of the passed in inode.
+ */
+page_buf_t *
+pagebuf_lookup(
+	struct pb_target	*target,
+	struct inode		*inode,
+	loff_t			ioff,
+	size_t			isize,
+	int			flags)
+{
+	page_buf_t		*pb = NULL;
+	int			status;
+
+	flags |= _PBF_PRIVATE_BH;
+	pb = pagebuf_allocate(flags);
+	if (pb) {
+		_pagebuf_initialize(pb, target, ioff, isize, flags);
+		if (flags & PBF_ENTER_PAGES) {
+			status = _pagebuf_lookup_pages(pb, &inode->i_data, 0);
+			if (status != 0) {
+				pagebuf_free(pb);
+				return (NULL);
+			}
+		}
+	}
+	return pb;
+}
+
+/*
+ * If we are not low on memory then do the readahead in a deadlock
+ * safe manner.
+ */
+void
+pagebuf_readahead(
+	pb_target_t		*target,
+	loff_t			ioff,
+	size_t			isize,
+	int			flags)
+{
+	flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_MAPPABLE|PBF_READ_AHEAD);
+	pagebuf_get(target, ioff, isize, flags);
+}
+
+page_buf_t *
+pagebuf_get_empty(
+	pb_target_t		*target)
+{
+	page_buf_t		*pb;
+
+	pb = pagebuf_allocate(_PBF_LOCKABLE);
+	if (pb)
+		_pagebuf_initialize(pb, target, 0, 0, _PBF_LOCKABLE);
+	return pb;
+}
+
+static inline struct page *
+mem_to_page(
+	void			*addr)
+{
+	if (((unsigned long)addr < VMALLOC_START) ||
+            ((unsigned long)addr >= VMALLOC_END)) {
+		return virt_to_page(addr);
+	} else {
+		return vmalloc_to_page(addr);
+	}
+}
+
+int
+pagebuf_associate_memory(
+	page_buf_t		*pb,
+	void			*mem,
+	size_t			len)
+{
+	int			rval;
+	int			i = 0;
+	size_t			ptr;
+	size_t			end, end_cur;
+	off_t			offset;
+	int			page_count;
+
+	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
+	if (offset && (len > PAGE_CACHE_SIZE))
+		page_count++;
+
+	/* Free any previous set of page pointers */
+	if (pb->pb_pages && (pb->pb_pages != pb->pb_page_array)) {
+		kfree(pb->pb_pages);
+	}
+	pb->pb_pages = NULL;
+	pb->pb_addr = mem;
+
+	rval = _pagebuf_get_pages(pb, page_count, 0);
+	if (rval)
+		return rval;
+
+	pb->pb_offset = offset;
+	ptr = (size_t) mem & PAGE_CACHE_MASK;
+	end = PAGE_CACHE_ALIGN((size_t) mem + len);
+	end_cur = end;
+	/* set up first page */
+	pb->pb_pages[0] = mem_to_page(mem);
+
+	ptr += PAGE_CACHE_SIZE;
+	pb->pb_page_count = ++i;
+	while (ptr < end) {
+		pb->pb_pages[i] = mem_to_page((void *)ptr);
+		pb->pb_page_count = ++i;
+		ptr += PAGE_CACHE_SIZE;
+	}
+	pb->pb_locked = 0;
+
+	pb->pb_count_desired = pb->pb_buffer_length = len;
+	pb->pb_flags |= PBF_MAPPED | _PBF_PRIVATE_BH;
+
+	return 0;
+}
+
+page_buf_t *
+pagebuf_get_no_daddr(
+	size_t			len,
+	pb_target_t		*target)
+{
+	int			rval;
+	void			*rmem = NULL;
+	int			flags = _PBF_LOCKABLE | PBF_FORCEIO;
+	page_buf_t		*pb;
+	size_t			tlen = 0;
+
+	if (len > 0x20000)
+		return(NULL);
+
+	pb = pagebuf_allocate(flags);
+	if (!pb)
+		return NULL;
+
+	_pagebuf_initialize(pb, target, 0, len, flags);
+
+	do {
+		if (tlen == 0) {
+			tlen = len; /* first time */
+		} else {
+			kfree(rmem); /* free the mem from the previous try */
+			tlen <<= 1; /* double the size and try again */
+			/*
+			printk(
+			"pb_get_no_daddr NOT block 0x%p mask 0x%p len %d\n",
+				rmem, ((size_t)rmem & (size_t)~SECTOR_MASK),
+				len);
+			*/
+		}
+		if ((rmem = kmalloc(tlen, GFP_KERNEL)) == 0) {
+			pagebuf_free(pb);
+			return NULL;
+		}
+	} while ((size_t)rmem != ((size_t)rmem & (size_t)~SECTOR_MASK));
+
+	if ((rval = pagebuf_associate_memory(pb, rmem, len)) != 0) {
+		kfree(rmem);
+		pagebuf_free(pb);
+		return NULL;
+	}
+	/* otherwise pagebuf_free just ignores it */
+	pb->pb_flags |= _PBF_MEM_ALLOCATED;
+	up(&PBP(pb)->pb_sema);	/* Return unlocked pagebuf */
+
+	PB_TRACE(pb, PB_TRACE_REC(no_daddr), rmem);
+
+	return pb;
+}
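+
+/*
+ * The retry loop above leans on kmalloc alignment: the allocation is
+ * redone with twice the size until the returned address is 512-byte
+ * aligned (the ~SECTOR_MASK test), since this memory is handed to the
+ * block layer for real I/O (PBF_FORCEIO) rather than page cache pages.
+ */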
+
+
+/*
+ *	pagebuf_hold
+ *
+ *	Increment reference count on buffer, to hold the buffer concurrently
+ *	with another thread which may release (free) the buffer asynchronously.
+ *
+ *	Must hold the buffer already to call this function.
+ */
+void
+pagebuf_hold(
+	page_buf_t		*pb)
+{
+	atomic_inc(&pb->pb_hold);
+	PB_TRACE(pb, PB_TRACE_REC(hold), 0);
+}
+
+/*
+ *	pagebuf_free
+ *
+ *	pagebuf_free releases the specified buffer.  The modification
+ *	state of any associated pages is left unchanged.
+ */
+void
+pagebuf_free(
+	page_buf_t		*pb)
+{
+	if (pb->pb_flags & _PBF_LOCKABLE) {
+		pb_hash_t	*h = pb_hash(pb);
+
+		spin_lock(&h->pb_hash_lock);
+		_pagebuf_free_object(h, pb);
+	} else {
+		_pagebuf_free_object(NULL, pb);
+	}
+}
+
+/*
+ *	pagebuf_rele
+ *
+ *	pagebuf_rele releases a hold on the specified buffer.  If the
+ *	hold count is 1, pagebuf_rele calls pagebuf_free.
+ */
+void
+pagebuf_rele(
+	page_buf_t		*pb)
+{
+	pb_hash_t		*h;
+
+	PB_TRACE(pb, PB_TRACE_REC(rele), pb->pb_relse);
+	if (pb->pb_flags & _PBF_LOCKABLE) {
+		h = pb_hash(pb);
+		spin_lock(&h->pb_hash_lock);
+	} else {
+		h = NULL;
+	}
+
+	if (atomic_dec_and_test(&pb->pb_hold)) {
+		int		do_free = 1;
+
+		if (pb->pb_relse) {
+			atomic_inc(&pb->pb_hold);
+			if (h)
+				spin_unlock(&h->pb_hash_lock);
+			(*(pb->pb_relse)) (pb);
+			do_free = 0;
+		}
+		if (pb->pb_flags & PBF_DELWRI) {
+			pb->pb_flags |= PBF_ASYNC;
+			atomic_inc(&pb->pb_hold);
+			if (h && do_free)
+				spin_unlock(&h->pb_hash_lock);
+			pagebuf_delwri_queue(pb, 0);
+			do_free = 0;
+		} else if (pb->pb_flags & PBF_FS_MANAGED) {
+			if (h)
+				spin_unlock(&h->pb_hash_lock);
+			do_free = 0;
+		}
+
+		if (do_free) {
+			_pagebuf_free_object(h, pb);
+		}
+	} else if (h) {
+		spin_unlock(&h->pb_hash_lock);
+	}
+}
+
+
+/*
+ *	Pinning Buffer Storage in Memory
+ */
+
+/*
+ *	pagebuf_pin
+ *
+ *	pagebuf_pin locks all of the memory represented by a buffer in
+ *	memory.	 Multiple calls to pagebuf_pin and pagebuf_unpin, for
+ *	the same or different buffers affecting a given page, will
+ *	properly count the number of outstanding "pin" requests.  The
+ *	buffer may be released after the pagebuf_pin and a different
+ *	buffer used when calling pagebuf_unpin, if desired.
+ *	pagebuf_pin should be used by the file system when it wants to be
+ *	assured that no attempt will be made to force the affected
+ *	memory to disk.	 It does not assure that a given logical page
+ *	will not be moved to a different physical page.
+ */
+void
+pagebuf_pin(
+	page_buf_t		*pb)
+{
+	atomic_inc(&PBP(pb)->pb_pin_count);
+	PB_TRACE(pb, PB_TRACE_REC(pin), PBP(pb)->pb_pin_count.counter);
+}
+
+/*
+ *	pagebuf_unpin
+ *
+ *	pagebuf_unpin reverses the locking of memory performed by
+ *	pagebuf_pin.  Note that both functions affect the logical
+ *	pages associated with the buffer, not the buffer itself.
+ */
+void
+pagebuf_unpin(
+	page_buf_t		*pb)
+{
+	if (atomic_dec_and_test(&PBP(pb)->pb_pin_count)) {
+		wake_up_all(&PBP(pb)->pb_waiters);
+	}
+	PB_TRACE(pb, PB_TRACE_REC(unpin), PBP(pb)->pb_pin_count.counter);
+}
+
+int
+pagebuf_ispin(
+	page_buf_t		*pb)
+{
+	return atomic_read(&PBP(pb)->pb_pin_count);
+}
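+
+/*
+ * Pin/unpin sketch (hypothetical caller): keep the buffer's pages from
+ * being forced to disk while they are still being modified.
+ *
+ *	pagebuf_pin(pb);
+ *	... modify the in-memory copy ...
+ *	pagebuf_unpin(pb);	wakes any _pagebuf_wait_unpin() waiters
+ */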
+
+/*
+ *	pagebuf_wait_unpin
+ *
+ *	pagebuf_wait_unpin waits until all of the memory associated
+ *	with the buffer is no longer locked in memory.  It returns
+ *	immediately if none of the affected pages are locked.
+ */
+static inline void
+_pagebuf_wait_unpin(
+	page_buf_t		*pb)
+{
+	DECLARE_WAITQUEUE	(wait, current);
+
+	if (atomic_read(&PBP(pb)->pb_pin_count) == 0)
+		return;
+
+	add_wait_queue(&PBP(pb)->pb_waiters, &wait);
+	for (;;) {
+		current->state = TASK_UNINTERRUPTIBLE;
+		if (atomic_read(&PBP(pb)->pb_pin_count) == 0) {
+			break;
+		}
+		run_task_queue(&tq_disk);
+		schedule();
+	}
+	remove_wait_queue(&PBP(pb)->pb_waiters, &wait);
+	current->state = TASK_RUNNING;
+}
+
+void
+pagebuf_queue_task(
+	struct tq_struct	*task)
+{
+	queue_task(task, &pagebuf_iodone_tq[smp_processor_id()]);
+	wake_up(&pagebuf_iodone_wait[smp_processor_id()]);
+}
+
+
+/*
+ *	Buffer Utility Routines
+ */
+
+/*
+ *	pagebuf_iodone
+ *
+ *	pagebuf_iodone marks a buffer for which I/O is in progress
+ *	done with respect to that I/O.	The pb_done routine, if
+ *	done with respect to that I/O.	The pb_iodone routine, if
+ */
+void
+pagebuf_iodone_sched(
+	void			*v)
+{
+	page_buf_t		*pb = (page_buf_t *)v;
+
+	if (pb->pb_iodone) {
+		(*(pb->pb_iodone)) (pb);
+		return;
+	}
+
+	if (pb->pb_flags & PBF_ASYNC) {
+		if ((pb->pb_flags & _PBF_LOCKABLE) && !pb->pb_relse)
+			pagebuf_unlock(pb);
+		pagebuf_rele(pb);
+	}
+}
+
+void
+pagebuf_iodone(
+	page_buf_t		*pb)
+{
+	pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
+	if (pb->pb_error == 0) {
+		pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(done), pb->pb_iodone);
+
+	if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
+		INIT_TQUEUE(&pb->pb_iodone_sched,
+			pagebuf_iodone_sched, (void *)pb);
+
+		queue_task(&pb->pb_iodone_sched,
+				&pagebuf_iodone_tq[smp_processor_id()]);
+		wake_up(&pagebuf_iodone_wait[smp_processor_id()]);
+	} else {
+		up(&pb->pb_iodonesema);
+	}
+}
+
+/*
+ *	pagebuf_ioerror
+ *
+ *	pagebuf_ioerror sets the error code for a buffer.
+ */
+void
+pagebuf_ioerror(			/* mark/clear buffer error flag */
+	page_buf_t		*pb,	/* buffer to mark		*/
+	unsigned int		error)	/* error to store (0 if none)	*/
+{
+	pb->pb_error = error;
+	PB_TRACE(pb, PB_TRACE_REC(ioerror), error);
+}
+
+/*
+ *	pagebuf_iostart
+ *
+ *	pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
+ *	If necessary, it will arrange for any disk space allocation required,
+ *	and it will break up the request if the block mappings require it.
+ *	An pb_iodone routine in the buffer supplied will only be called
+ *	when all of the subsidiary I/O requests, if any, have been completed.
+ *	pagebuf_iostart calls the pagebuf_ioinitiate routine or
+ *	pagebuf_iorequest, if the former routine is not defined, to start
+ *	the I/O on a given low-level request.
+ */
+int
+pagebuf_iostart(			/* start I/O on a buffer	  */
+	page_buf_t		*pb,	/* buffer to start		  */
+	page_buf_flags_t	flags)	/* PBF_LOCK, PBF_ASYNC, PBF_READ, */
+					/* PBF_WRITE, PBF_ALLOCATE,	  */
+					/* PBF_DELWRI, 			  */
+					/* PBF_SYNC, PBF_DONT_BLOCK	  */
+					/* PBF_RELEASE			  */
+{
+	int			status = 0;
+
+	PB_TRACE(pb, PB_TRACE_REC(iostart), flags);
+
+	if (flags & PBF_DELWRI) {
+		pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
+		pb->pb_flags |= flags &
+				(PBF_DELWRI | PBF_ASYNC | PBF_SYNC);
+		pagebuf_delwri_queue(pb, 1);
+		return status;
+	}
+
+	pb->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI|PBF_READ_AHEAD);
+	pb->pb_flags |= flags & (PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_READ_AHEAD);
+
+	if (pb->pb_bn == PAGE_BUF_DADDR_NULL) {
+		BUG();
+	}
+
+	/* For writes call internal function which checks for
+	 * filesystem specific callout function and execute it.
+	 */
+	if (flags & PBF_WRITE) {
+		status = __pagebuf_iorequest(pb);
+	} else {
+		status = pagebuf_iorequest(pb);
+	}
+
+	/* Wait for I/O if we are not an async request */
+	if ((status == 0) && (flags & PBF_ASYNC) == 0) {
+		status = pagebuf_iowait(pb);
+	}
+
+	return status;
+}
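+
+/*
+ * Example: a blocking write of a previously filled buffer,
+ *
+ *	status = pagebuf_iostart(pb, PBF_WRITE | PBF_SYNC);
+ *
+ * returns only after pagebuf_iowait() has seen the completion, so
+ * status holds the final pb_error value (0 on success).
+ */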
+
+
+/*
+ * Helper routines for pagebuf_iorequest (pagebuf I/O completion)
+ * 
+ * (different routines for locked/unlocked, and single/multi-bh pagebufs)
+ */
+
+STATIC inline void
+_pb_io_done(
+	page_buf_t		*pb)
+{
+	if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) == 1) {
+		pb->pb_locked = 0;
+		pagebuf_iodone(pb);
+	}
+}
+
+STATIC void
+_end_pagebuf_page_io(
+	struct buffer_head	*bh,
+	int			uptodate,
+	int			locked)
+{
+	struct page		*page;
+	page_buf_t		*pb = (page_buf_t *) bh->b_private;
+
+	mark_buffer_uptodate(bh, uptodate);
+	atomic_dec(&bh->b_count);
+
+	page = bh->b_page;
+	if (!test_bit(BH_Uptodate, &bh->b_state)) {
+		set_bit(PG_error, &page->flags);
+		pb->pb_error = EIO;
+	}
+
+	unlock_buffer(bh);
+	_pagebuf_free_bh(bh);
+
+	SetPageUptodate(page);
+	if (locked)
+		unlock_page(page);
+	_pb_io_done(pb);
+}
+
+STATIC void
+_end_io_locked(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	_end_pagebuf_page_io(bh, uptodate, 1);
+}
+
+STATIC void
+_end_io_nolock(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	_end_pagebuf_page_io(bh, uptodate, 0);
+}
+
+typedef struct {
+	page_buf_t	*pb;		/* pointer to pagebuf page is within */
+	int		locking;	/* are pages locked? */
+	atomic_t	remain;		/* count of remaining I/O requests */
+} pagesync_t;
+
+STATIC void
+_end_pagebuf_page_io_multi(
+	struct buffer_head	*bh,
+	int			uptodate,
+	int			fullpage)
+{
+	pagesync_t		*psync = (pagesync_t *) bh->b_private;
+	page_buf_t		*pb = psync->pb;
+	struct page		*page;
+
+	mark_buffer_uptodate(bh, uptodate);
+	put_bh(bh);
+
+	page = bh->b_page;
+	if (!test_bit(BH_Uptodate, &bh->b_state)) {
+		set_bit(PG_error, &page->flags);
+		pb->pb_error = EIO;
+	}
+
+	unlock_buffer(bh);
+	if (fullpage)
+		_pagebuf_free_bh(bh);
+
+	if (atomic_dec_and_test(&psync->remain) == 1) {
+		if (fullpage)
+			SetPageUptodate(page);
+		if (psync->locking)
+			unlock_page(page);
+		kfree(psync);
+		_pb_io_done(pb);
+	}
+}
+
+STATIC void
+_end_io_multi_full(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	_end_pagebuf_page_io_multi(bh, uptodate, 1);
+}
+
+STATIC void
+_end_io_multi_part(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	_end_pagebuf_page_io_multi(bh, uptodate, 0);
+}
+
+
+/*
+ * Initiate I/O on part of a page we are interested in
+ */
+STATIC int
+_pagebuf_page_io(
+	struct page		*page,	/* Page structure we are dealing with */
+	page_buf_t		*pb,	/* pagebuf holding it, can be NULL */
+	page_buf_daddr_t	bn,	/* starting block number */
+	kdev_t			dev,	/* device for I/O */
+	size_t			blocksize,	/* filesystem block size */
+	off_t			pg_offset,	/* starting offset in page */
+	size_t			pg_length,	/* count of data to process */
+	int			locking,	/* page locking in use */
+	int			rw,	/* read/write operation */
+	int			flush)
+{
+	size_t			sector;
+	size_t			blk_length = 0;
+	struct buffer_head	*bh, *head, *bufferlist[MAX_BUF_PER_PAGE];
+	int			multi_ok;
+	int			i = 0, cnt = 0, err = 0;
+	int			public_bh = 0;
+
+	if ((blocksize < PAGE_CACHE_SIZE) &&
+	    !(pb->pb_flags & _PBF_PRIVATE_BH)) {
+		int		cache_ok;
+
+		cache_ok = !((pb->pb_flags & PBF_FORCEIO) || (rw == WRITE));
+		public_bh = multi_ok = 1;
+
+		if (!page_has_buffers(page)) {
+			if (!locking) {
+				lock_page(page);
+				if (!page_has_buffers(page)) {
+					create_empty_buffers(page, dev,
+							SECTOR_SIZE);
+				}
+				unlock_page(page);
+			} else {
+				create_empty_buffers(page, dev, SECTOR_SIZE);
+			}
+		}
+
+		/* Find buffer_heads belonging to just this pagebuf */
+		bh = head = page_buffers(page);
+		do {
+			if (buffer_uptodate(bh) && cache_ok)
+				continue;
+			blk_length = i << SECTOR_SHIFT;
+			if (blk_length < pg_offset)
+				continue;
+			if (blk_length >= pg_offset + pg_length)
+				break;
+
+			lock_buffer(bh);
+			get_bh(bh);
+			assert(!waitqueue_active(&bh->b_wait));
+
+			bh->b_size = SECTOR_SIZE;
+			bh->b_blocknr = bn + (i - (pg_offset >> SECTOR_SHIFT));
+			bufferlist[cnt++] = bh;
+		} while (i++, (bh = bh->b_this_page) != head);
+
+		goto request;
+	}
+
+	/* Calculate the block offsets and length we will be using */
+	if (pg_offset) {
+		size_t		block_offset;
+
+		block_offset = pg_offset >> SECTOR_SHIFT;
+		block_offset = pg_offset - (block_offset << SECTOR_SHIFT);
+		blk_length = (pg_length + block_offset + SECTOR_MASK) >>
+								SECTOR_SHIFT;
+	} else {
+		blk_length = (pg_length + SECTOR_MASK) >> SECTOR_SHIFT;
+	}
+
+	/* This will attempt to make a request bigger than the sector
+	 * size if we are well aligned.
+	 */
+	switch (pb->pb_target->pbr_flags) {
+	case 0:
+		sector = blk_length << SECTOR_SHIFT;
+		blk_length = 1;
+		break;
+	case PBR_ALIGNED_ONLY:
+		if ((pg_offset == 0) && (pg_length == PAGE_CACHE_SIZE) &&
+		    (((unsigned int) bn) & BN_ALIGN_MASK) == 0) {
+			sector = blk_length << SECTOR_SHIFT;
+			blk_length = 1;
+			break;
+		}
+	case PBR_SECTOR_ONLY:
+		/* Fallthrough, same as default */
+	default:
+		sector = SECTOR_SIZE;
+	}
+
+	/* The b_size field of struct buffer_head is an unsigned short
+	 * ... we may need to split this request up.  [64K is too big]
+	 */
+	assert(sizeof(bh->b_size) == 2);
+	while (sector > 0xffff) {
+		sector >>= 1;
+		blk_length++;
+	}
+
+	multi_ok = (blk_length != 1);
+
+	for (; blk_length > 0; blk_length--, pg_offset += sector) {
+		bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS);
+		if (!bh) {
+			bh = _pagebuf_get_prealloc_bh();
+			if (!bh) {
+				/* This should never happen */
+				err = -ENOMEM;
+				goto error;
+			}
+		}
+		memset(bh, 0, sizeof(*bh));
+		bh->b_size = sector;
+		bh->b_blocknr = bn++;
+		bh->b_dev = dev;
+		set_bit(BH_Lock, &bh->b_state);
+		set_bh_page(bh, page, pg_offset);
+		init_waitqueue_head(&bh->b_wait);
+		atomic_set(&bh->b_count, 1);
+		bufferlist[cnt++] = bh;
+	}
+
+request:
+	if (cnt) {
+		pagesync_t	*psync = NULL;
+		void		(*callback)(struct buffer_head *, int);
+
+		if (multi_ok) {
+			size_t	size = sizeof(pagesync_t);
+
+			psync = (pagesync_t *) kmalloc(size, GFP_NOFS);
+			if (!psync)
+				BUG();	/* Ugh - out of memory condition here */
+			psync->pb = pb;
+			psync->locking = locking;
+			atomic_set(&psync->remain, 0);
+
+			callback = public_bh ?
+				   _end_io_multi_part : _end_io_multi_full;
+		} else {
+			callback = locking ? _end_io_locked : _end_io_nolock;
+		}
+
+		/* Indicate that there is another page in progress */
+		atomic_inc(&PBP(pb)->pb_io_remaining);
+
+#ifdef RQ_WRITE_ORDERED
+		if (flush)
+			set_bit(BH_Ordered_Flush, &bufferlist[cnt-1]->b_state);
+#endif
+
+		for (i = 0; i < cnt; i++) {
+			bh = bufferlist[i];
+
+			/* Complete the buffer_head, then submit the IO */
+			if (psync) {
+				init_buffer(bh, callback, psync);
+				atomic_inc(&psync->remain);
+			} else {
+				init_buffer(bh, callback, pb);
+			}
+
+			bh->b_rdev = bh->b_dev;
+			bh->b_rsector = bh->b_blocknr;
+			set_bit(BH_Mapped, &bh->b_state);
+			set_bit(BH_Req, &bh->b_state);
+
+			if (rw == WRITE) {
+				set_bit(BH_Uptodate, &bh->b_state);
+			}
+			generic_make_request(rw, bh);
+		}
+	} else {
+		if (locking)
+			unlock_page(page);
+	}
+
+	return err;
+error:
+	/* If we ever do get here then clean up what we already did */
+	for (i = 0; i < cnt; i++) {
+		atomic_set_buffer_clean(bufferlist[i]);
+		bufferlist[i]->b_end_io(bufferlist[i], 0);
+	}
+	return err;
+}
+
+STATIC int
+_page_buf_page_apply(
+	page_buf_t		*pb,
+	loff_t			offset,
+	struct page		*page,
+	size_t			pg_offset,
+	size_t			pg_length,
+	int			last)
+{
+	page_buf_daddr_t	bn = pb->pb_bn;
+	kdev_t			dev = pb->pb_target->pbr_kdev;
+	size_t			blocksize = pb->pb_target->pbr_blocksize;
+	loff_t			pb_offset;
+	size_t			ret_len = pg_length;
+
+	assert(page);
+
+	if ((blocksize == PAGE_CACHE_SIZE) &&
+	    (pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
+	    (pb->pb_flags & PBF_READ) && pb->pb_locked) {
+		bn -= (pb->pb_offset >> SECTOR_SHIFT);
+		pg_offset = 0;
+		pg_length = PAGE_CACHE_SIZE;
+	} else {
+		pb_offset = offset - pb->pb_file_offset;
+		if (pb_offset) {
+			bn += (pb_offset + SECTOR_MASK) >> SECTOR_SHIFT;
+		}
+	}
+
+	if (pb->pb_flags & PBF_READ) {
+		_pagebuf_page_io(page, pb, bn, dev, blocksize,
+			(off_t)pg_offset, pg_length, pb->pb_locked, READ, 0);
+	} else if (pb->pb_flags & PBF_WRITE) {
+		int locking = (pb->pb_flags & _PBF_LOCKABLE) == 0;
+
+		/* Check we need to lock pages */
+		if (locking && (pb->pb_locked == 0))
+			lock_page(page);
+		_pagebuf_page_io(page, pb, bn, dev, blocksize,
+			(off_t)pg_offset, pg_length, locking, WRITE,
+			last && (pb->pb_flags & PBF_FLUSH));
+	}
+
+	return ret_len;
+}
+
+/*
+ *	pagebuf_iorequest
+ *
+ *	pagebuf_iorequest is the core I/O request routine.
+ *	It assumes that the buffer is well-formed and
+ *	mapped and ready for physical I/O, unlike
+ *	pagebuf_iostart() and pagebuf_iophysio().  Those
+ *	routines call the pagebuf_ioinitiate routine to start I/O,
+ *	if it is present, or else call pagebuf_iorequest()
+ *	directly if the pagebuf_ioinitiate routine is not present.
+ *
+ *	This function will be responsible for ensuring access to the
+ *	pages is restricted whilst I/O is in progress - for locking
+ *	pagebufs the pagebuf lock is the mediator, for non-locking
+ *	pagebufs the pages will be locked. In the locking case we
+ *	need to use the pagebuf lock as multiple meta-data buffers
+ *	will reference the same page.
+ */
+int
+pagebuf_iorequest(			/* start real I/O		*/
+	page_buf_t		*pb)	/* buffer to convey to device	*/
+{
+	int			status = 0;
+
+	PB_TRACE(pb, PB_TRACE_REC(ioreq), 0);
+
+	if (pb->pb_flags & PBF_DELWRI) {
+		pagebuf_delwri_queue(pb, 1);
+		return status;
+	}
+
+	if (pb->pb_flags & PBF_WRITE) {
+		_pagebuf_wait_unpin(pb);
+	}
+
+	/* Set the count to 1 initially; this stops an I/O completion
+	 * callout that arrives before all of the I/O has been issued
+	 * from calling iodone too early.
+	 */
+	atomic_set(&PBP(pb)->pb_io_remaining, 1);
+	status = _pagebuf_segment_apply(pb);
+
+	/* Drop our count and if everything worked we are done */
+	if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) == 1) {
+		pagebuf_iodone(pb);
+	} else if ((pb->pb_flags & (PBF_SYNC|PBF_ASYNC)) == PBF_SYNC)  {
+		run_task_queue(&tq_disk);
+	}
+
+	return status < 0 ? status : 0;
+}
+
+/*
+ *	pagebuf_iowait
+ *
+ *	pagebuf_iowait waits for I/O to complete on the buffer supplied.
+ *	It returns immediately if no I/O is pending.  In any case, it returns
+ *	the error code, if any, or 0 if there is no error.
+ */
+int
+pagebuf_iowait(
+	page_buf_t		*pb)
+{
+	PB_TRACE(pb, PB_TRACE_REC(iowait), 0);
+	run_task_queue(&tq_disk);
+	down(&pb->pb_iodonesema);
+	PB_TRACE(pb, PB_TRACE_REC(iowaited), (int)pb->pb_error);
+	return pb->pb_error;
+}
+
+STATIC void *
+pagebuf_mapout_locked(
+	page_buf_t		*pb)
+{
+	void			*old_addr = NULL;
+
+	if (pb->pb_flags & PBF_MAPPED) {
+		if (pb->pb_flags & _PBF_ADDR_ALLOCATED)
+			old_addr = pb->pb_addr - pb->pb_offset;
+		pb->pb_addr = NULL;
+		pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED);
+	}
+
+	return old_addr;	/* Caller must free the address space,
+				 * we are under a spin lock, probably
+				 * not safe to do vfree here
+				 */
+}
+
+caddr_t
+pagebuf_offset(
+	page_buf_t		*pb,
+	off_t			offset)
+{
+	struct page		*page;
+
+	offset += pb->pb_offset;
+
+	page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
+	return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
+}
+
+/*
+ *	pagebuf_segment
+ *
+ *	pagebuf_segment is used to retrieve the various contiguous
+ *	segments of a buffer.  The variable addressed by the
+ *	loff_t * should be initialized to 0, and successive
+ *	calls will update to point to the segment following the one
+ *	returned.
+ */
+STATIC void
+pagebuf_segment(
+	page_buf_t		*pb,	/* buffer to examine		*/
+	loff_t			*boff_p,/* offset in buffer of next	*/
+					/* next segment (updated)	*/
+	struct page		**spage_p, /* page (updated)		*/
+					/* (NULL if not in page array)	*/
+	size_t			*soff_p,/* offset in page (updated)	*/
+	size_t			*ssize_p) /* segment length (updated)	*/
+{
+	loff_t			kpboff;	/* offset in pagebuf		*/
+	int			kpi;	/* page index in pagebuf	*/
+	size_t			slen;	/* segment length		*/
+
+	kpboff = *boff_p;
+
+	kpi = page_buf_btoct(kpboff + pb->pb_offset);
+
+	*spage_p = pb->pb_pages[kpi];
+
+	*soff_p = page_buf_poff(kpboff + pb->pb_offset);
+	slen = PAGE_CACHE_SIZE - *soff_p;
+	if (slen > (pb->pb_count_desired - kpboff))
+		slen = (pb->pb_count_desired - kpboff);
+	*ssize_p = slen;
+
+	*boff_p = *boff_p + slen;
+}
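+
+/*
+ * Iteration sketch, the same pattern pagebuf_iomove() uses below:
+ *
+ *	loff_t boff = 0;
+ *	while (boff < pb->pb_count_desired) {
+ *		pagebuf_segment(pb, &boff, &page, &poff, &psize);
+ *		... operate on psize bytes at offset poff of page ...
+ *	}
+ */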
+
+/*
+ *	pagebuf_iomove
+ *
+ *	Move data into or out of a buffer.
+ */
+void
+pagebuf_iomove(
+	page_buf_t		*pb,	/* buffer to process		*/
+	off_t			boff,	/* starting buffer offset	*/
+	size_t			bsize,	/* length to copy		*/
+	caddr_t			data,	/* data address			*/
+	page_buf_rw_t		mode)	/* read/write flag		*/
+{
+	loff_t			cboff;
+	size_t			cpoff;
+	size_t			csize;
+	struct page		*page;
+
+	cboff = boff;
+	boff += bsize; /* last */
+
+	while (cboff < boff) {
+		pagebuf_segment(pb, &cboff, &page, &cpoff, &csize);
+		assert(((csize + cpoff) <= PAGE_CACHE_SIZE));
+
+		switch (mode) {
+		case PBRW_ZERO:
+			memset(page_address(page) + cpoff, 0, csize);
+			break;
+		case PBRW_READ:
+			memcpy(data, page_address(page) + cpoff, csize);
+			break;
+		case PBRW_WRITE:
+			memcpy(page_address(page) + cpoff, data, csize);
+		}
+
+		data += csize;
+	}
+}
+
+/*
+ *	_pagebuf_segment_apply
+ *
+ *	Applies _page_buf_page_apply to each segment of the page_buf_t.
+ */
+STATIC int
+_pagebuf_segment_apply(			/* apply function to segments	*/
+	page_buf_t		*pb)	/* buffer to examine		*/
+{
+	int			buf_index, sval, status = 0;
+	loff_t			buffer_offset = pb->pb_file_offset;
+	size_t			buffer_len = pb->pb_count_desired;
+	size_t			page_offset, len, total = 0;
+	size_t			cur_offset, cur_len;
+
+	pagebuf_hold(pb);
+
+	cur_offset = pb->pb_offset;
+	cur_len = buffer_len;
+
+	for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) {
+		if (cur_len == 0)
+			break;
+		if (cur_offset >= PAGE_CACHE_SIZE) {
+			cur_offset -= PAGE_CACHE_SIZE;
+			continue;
+		}
+
+		page_offset = cur_offset;
+		cur_offset = 0;
+
+		len = PAGE_CACHE_SIZE - page_offset;
+		if (len > cur_len)
+			len = cur_len;
+		cur_len -= len;
+
+		sval = _page_buf_page_apply(pb, buffer_offset,
+				pb->pb_pages[buf_index], page_offset, len,
+				buf_index+1 == pb->pb_page_count);
+		if (sval <= 0) {
+			status = sval;
+			break;
+		} else {
+			len = sval;
+			total += len;
+		}
+
+		buffer_offset += len;
+		buffer_len -= len;
+	}
+
+	pagebuf_rele(pb);
+
+	if (!status)
+		status = total;
+
+	return (status);
+}
+
+
+/*
+ * Pagebuf delayed write buffer handling
+ */
+
+void
+pagebuf_delwri_queue(
+	page_buf_t		*pb,
+	int			unlock)
+{
+	PB_TRACE(pb, PB_TRACE_REC(delwri_q), unlock);
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	/* If already in the queue, dequeue and place at tail */
+	if (!list_empty(&pb->pb_list)) {
+		if (unlock) {
+			atomic_dec(&pb->pb_hold);
+		}
+		list_del(&pb->pb_list);
+	} else {
+		pb_daemon->pb_delwri_cnt++;
+	}
+	list_add_tail(&pb->pb_list, &pb_daemon->pb_delwrite_l);
+	PBP(pb)->pb_flushtime = jiffies + pb_params.p_un.age_buffer;
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+	if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) {
+		pagebuf_unlock(pb);
+	}
+}
+
+void
+pagebuf_delwri_dequeue(
+	page_buf_t		*pb)
+{
+	PB_TRACE(pb, PB_TRACE_REC(delwri_uq), 0);
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	list_del_init(&pb->pb_list);
+	pb->pb_flags &= ~PBF_DELWRI;
+	pb_daemon->pb_delwri_cnt--;
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+}
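+
+/*
+ * Delayed-write lifecycle (sketch): a buffer enters this queue via
+ * pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC) or pagebuf_rele() on a
+ * PBF_DELWRI buffer; pagebufd later writes it back once pb_flushtime
+ * (jiffies + age_buffer at queueing time) has passed, or sooner when
+ * a forced flush or pagebuf_delwri_flush() intervenes.
+ */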
+
+
+/*
+ * The pagebuf iodone daemon
+ */
+
+STATIC int pb_daemons[NR_CPUS];
+
+STATIC int
+pagebuf_iodone_daemon(
+	void			*__bind_cpu)
+{
+	int			bind_cpu = (int) (long) __bind_cpu;
+	int			cpu = cpu_logical_map(bind_cpu);
+	DECLARE_WAITQUEUE	(wait, current);
+
+	/*  Set up the thread  */
+	daemonize();
+
+	/* Avoid signals */
+	spin_lock_irq(&current->sigmask_lock);
+	sigfillset(&current->blocked);
+	recalc_sigpending(current);
+	spin_unlock_irq(&current->sigmask_lock);
+
+	/* Migrate to the right CPU */
+	current->cpus_allowed = 1UL << cpu;
+	while (smp_processor_id() != cpu)
+		schedule();
+
+	sprintf(current->comm, "pagebuf_io_CPU%d", bind_cpu);
+	INIT_LIST_HEAD(&pagebuf_iodone_tq[cpu]);
+	init_waitqueue_head(&pagebuf_iodone_wait[cpu]);
+	__set_current_state(TASK_INTERRUPTIBLE);
+	mb();
+
+	pb_daemons[cpu] = 1;
+
+	for (;;) {
+		add_wait_queue(&pagebuf_iodone_wait[cpu],
+				&wait);
+
+		if (TQ_ACTIVE(pagebuf_iodone_tq[cpu]))
+			__set_task_state(current, TASK_RUNNING);
+		schedule();
+		remove_wait_queue(&pagebuf_iodone_wait[cpu],
+				&wait);
+		run_task_queue(&pagebuf_iodone_tq[cpu]);
+		if (pb_daemons[cpu] == 0)
+			break;
+		__set_current_state(TASK_INTERRUPTIBLE);
+	}
+
+	pb_daemons[cpu] = -1;
+	wake_up_interruptible(&pagebuf_iodone_wait[cpu]);
+	return 0;
+}
+
+/* Defines for pagebuf daemon */
+DECLARE_WAIT_QUEUE_HEAD(pbd_waitq);
+STATIC int force_flush;
+
+STATIC void
+pagebuf_daemon_wakeup(
+	int			flag)
+{
+	force_flush = flag;
+	if (waitqueue_active(&pbd_waitq)) {
+		wake_up_interruptible(&pbd_waitq);
+	}
+}
+
+typedef void (*timeout_fn)(unsigned long);
+
+STATIC int
+pagebuf_daemon(
+	void			*data)
+{
+	int			count;
+	page_buf_t		*pb;
+	struct list_head	*curr, *next, tmp;
+	struct timer_list	pb_daemon_timer =
+		{ {NULL, NULL}, 0, 0, (timeout_fn)pagebuf_daemon_wakeup };
+
+	/*  Set up the thread  */
+	daemonize();
+
+	/* Avoid signals */
+	spin_lock_irq(&current->sigmask_lock);
+	sigfillset(&current->blocked);
+	recalc_sigpending(current);
+	spin_unlock_irq(&current->sigmask_lock);
+
+	strcpy(current->comm, "pagebufd");
+	current->flags |= PF_MEMALLOC;
+
+	INIT_LIST_HEAD(&tmp);
+	do {
+		if (pb_daemon->active == 1) {
+			del_timer(&pb_daemon_timer);
+			pb_daemon_timer.expires = jiffies +
+					pb_params.p_un.flush_interval;
+			add_timer(&pb_daemon_timer);
+			interruptible_sleep_on(&pbd_waitq);
+		}
+
+		if (pb_daemon->active == 0) {
+			del_timer(&pb_daemon_timer);
+		}
+
+		spin_lock(&pb_daemon->pb_delwrite_lock);
+
+		count = 0;
+		list_for_each_safe(curr, next, &pb_daemon->pb_delwrite_l) {
+			pb = list_entry(curr, page_buf_t, pb_list);
+
+			PB_TRACE(pb, PB_TRACE_REC(walkq1), pagebuf_ispin(pb));
+
+			if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) &&
+			    (((pb->pb_flags & _PBF_LOCKABLE) == 0) ||
+			     !pagebuf_cond_lock(pb))) {
+
+				if (!force_flush && time_before(jiffies,
+						PBP(pb)->pb_flushtime)) {
+					pagebuf_unlock(pb);
+					break;
+				}
+
+				list_del(&pb->pb_list);
+				list_add(&pb->pb_list, &tmp);
+
+				count++;
+			}
+		}
+
+		spin_unlock(&pb_daemon->pb_delwrite_lock);
+		while (!list_empty(&tmp)) {
+			pb = list_entry(tmp.next,
+							page_buf_t, pb_list);
+			list_del_init(&pb->pb_list);
+			pb->pb_flags &= ~PBF_DELWRI;
+			pb->pb_flags |= PBF_WRITE;
+
+			__pagebuf_iorequest(pb);
+		}
+
+		if (count)
+			run_task_queue(&tq_disk);
+		if (as_list_len > 0)
+			purge_addresses();
+
+		force_flush = 0;
+	} while (pb_daemon->active == 1);
+
+	pb_daemon->active = -1;
+	wake_up_interruptible(&pbd_waitq);
+
+	return 0;
+}
+
+void
+pagebuf_delwri_flush(
+	pb_target_t		*target,
+	u_long			flags,
+	int			*pinptr)
+{
+	page_buf_t		*pb;
+	struct list_head	*curr, *next, tmp;
+	int			pincount = 0;
+
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	INIT_LIST_HEAD(&tmp);
+
+	list_for_each_safe(curr, next, &pb_daemon->pb_delwrite_l) {
+		pb = list_entry(curr, page_buf_t, pb_list);
+
+		/*
+		 * Skip other targets, markers and in progress buffers
+		 */
+
+		if ((pb->pb_flags == 0) || (pb->pb_target != target) ||
+		    !(pb->pb_flags & PBF_DELWRI)) {
+			continue;
+		}
+
+		PB_TRACE(pb, PB_TRACE_REC(walkq2), pagebuf_ispin(pb));
+		if (pagebuf_ispin(pb)) {
+			pincount++;
+			continue;
+		}
+
+		if (flags & PBDF_TRYLOCK) {
+			if (!pagebuf_cond_lock(pb)) {
+				pincount++;
+				continue;
+			}
+		}
+
+		list_del_init(&pb->pb_list);
+		if (flags & PBDF_WAIT) {
+			list_add(&pb->pb_list, &tmp);
+			pb->pb_flags &= ~PBF_ASYNC;
+		}
+
+		spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+		if ((flags & PBDF_TRYLOCK) == 0) {
+			pagebuf_lock(pb);
+		}
+
+		pb->pb_flags &= ~PBF_DELWRI;
+		pb->pb_flags |= PBF_WRITE;
+
+		__pagebuf_iorequest(pb);
+
+		spin_lock(&pb_daemon->pb_delwrite_lock);
+	}
+
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+	run_task_queue(&tq_disk);
+
+	if (pinptr)
+		*pinptr = pincount;
+
+	if ((flags & PBDF_WAIT) == 0)
+		return;
+
+	while (!list_empty(&tmp)) {
+		pb = list_entry(tmp.next, page_buf_t, pb_list);
+
+		list_del_init(&pb->pb_list);
+		pagebuf_iowait(pb);
+		if (!pb->pb_relse)
+			pagebuf_unlock(pb);
+		pagebuf_rele(pb);
+	}
+}
+
+STATIC int
+pagebuf_daemon_start(void)
+{
+	if (!pb_daemon) {
+		int		cpu;
+
+		pb_daemon = (pagebuf_daemon_t *)
+				kmalloc(sizeof(pagebuf_daemon_t), GFP_KERNEL);
+		if (!pb_daemon) {
+			return -ENOMEM;
+		}
+
+		pb_daemon->active = 1;
+		pb_daemon->io_active = 1;
+		pb_daemon->pb_delwri_cnt = 0;
+		pb_daemon->pb_delwrite_lock = SPIN_LOCK_UNLOCKED;
+
+		INIT_LIST_HEAD(&pb_daemon->pb_delwrite_l);
+
+		kernel_thread(pagebuf_daemon, (void *)pb_daemon,
+				CLONE_FS|CLONE_FILES|CLONE_VM);
+		for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+			if (kernel_thread(pagebuf_iodone_daemon,
+					(void *)(long) cpu,
+					CLONE_FS|CLONE_FILES|CLONE_VM) < 0) {
+				printk("pagebuf_daemon_start failed\n");
+			} else {
+				while (!pb_daemons[cpu_logical_map(cpu)]) {
+					current->policy |= SCHED_YIELD;
+					schedule();
+				}
+			}
+		}
+	}
+	return 0;
+}
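+
+/*
+ * Startup sketch: a single "pagebufd" flush daemon is forked, plus
+ * one pagebuf_iodone_daemon per CPU; the SCHED_YIELD/schedule() loop
+ * above just spins until each iodone daemon has registered itself in
+ * pb_daemons[] before the next CPU's thread is started.
+ */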
+
+/*
+ * pagebuf_daemon_stop
+ *
+ * Note: do not mark as __exit, it is called from pagebuf_terminate.
+ */
+STATIC void
+pagebuf_daemon_stop(void)
+{
+	if (pb_daemon) {
+		int		cpu;
+
+		pb_daemon->active = 0;
+		pb_daemon->io_active = 0;
+
+		wake_up_interruptible(&pbd_waitq);
+		while (pb_daemon->active == 0) {
+			interruptible_sleep_on(&pbd_waitq);
+		}
+		for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+			pb_daemons[cpu_logical_map(cpu)] = 0;
+			wake_up(&pagebuf_iodone_wait[cpu_logical_map(cpu)]);
+			while (pb_daemons[cpu_logical_map(cpu)] != -1) {
+				interruptible_sleep_on(
+				    &pagebuf_iodone_wait[cpu_logical_map(cpu)]);
+			}
+		}
+
+		kfree(pb_daemon);
+		pb_daemon = NULL;
+	}
+}
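+
+/*
+ * Shutdown handshake (a sketch of the presumed protocol): setting
+ * active to 0 asks the flush daemon to exit; it acknowledges by
+ * setting active to -1 and waking pbd_waitq.  Likewise each iodone
+ * daemon is asked to exit by zeroing its pb_daemons[] slot, and is
+ * presumed to acknowledge by setting the slot to -1 and waking its
+ * wait queue.
+ */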
+
+
+/*
+ * Pagebuf sysctl interface
+ */
+
+STATIC int
+pb_stats_clear_handler(
+	ctl_table		*ctl,
+	int			write,
+	struct file		*filp,
+	void			*buffer,
+	size_t			*lenp)
+{
+	int			ret;
+	unsigned long		*valp = ctl->data;
+
+	ret = proc_doulongvec_minmax(ctl, write, filp, buffer, lenp);
+
+	if (!ret && write && *valp) {
+		printk("XFS Clearing pbstats\n");
+		memset(&pbstats, 0, sizeof(pbstats));
+		pb_params.p_un.stats_clear = 0;
+	}
+
+	return ret;
+}
+
+STATIC struct ctl_table_header *pagebuf_table_header;
+
+STATIC ctl_table pagebuf_table[] = {
+	{PB_FLUSH_INT, "flush_int", &pb_params.data[0],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[0], &pagebuf_max[0]},
+
+	{PB_FLUSH_AGE, "flush_age", &pb_params.data[1],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[1], &pagebuf_max[1]},
+
+	{PB_STATS_CLEAR, "stats_clear", &pb_params.data[3],
+	sizeof(ulong), 0644, NULL, &pb_stats_clear_handler,
+	&sysctl_intvec, NULL, &pagebuf_min[3], &pagebuf_max[3]},
+
+#ifdef PAGEBUF_TRACE
+	{PB_DEBUG, "debug", &pb_params.data[4],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[4], &pagebuf_max[4]},
+#endif
+	{0}
+};
+
+STATIC ctl_table pagebuf_dir_table[] = {
+	{VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table},
+	{0}
+};
+
+STATIC ctl_table pagebuf_root_table[] = {
+	{CTL_VM, "vm",	NULL, 0, 0555, pagebuf_dir_table},
+	{0}
+};
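+
+/*
+ * The tables above surface as files under /proc/sys/vm/pagebuf.  A
+ * usage sketch (assuming procfs is mounted; flush_int and flush_age
+ * are written in milliseconds and converted to jiffies by
+ * proc_doulongvec_ms_jiffies_minmax):
+ *
+ *	echo 1000 > /proc/sys/vm/pagebuf/flush_int
+ *	echo 1 > /proc/sys/vm/pagebuf/stats_clear
+ */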
+
+#ifdef CONFIG_PROC_FS
+STATIC int
+pagebuf_readstats(
+	char			*buffer,
+	char			**start,
+	off_t			offset,
+	int			count,
+	int			*eof,
+	void			*data)
+{
+	int			i, len;
+
+	len = 0;
+	len += sprintf(buffer + len, "pagebuf");
+	for (i = 0; i < sizeof(pbstats) / sizeof(u_int32_t); i++) {
+		len += sprintf(buffer + len, " %u",
+			*(((u_int32_t*)&pbstats) + i));
+	}
+	buffer[len++] = '\n';
+
+	if (offset >= len) {
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)
+		return count;
+	*eof = 1;
+
+	return len;
+}
+#endif	/* CONFIG_PROC_FS */
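+
+/*
+ * Reading /proc/fs/pagebuf/stat (created in pagebuf_init below)
+ * yields a single line: the word "pagebuf" followed by every
+ * u_int32_t counter of pbstats in declaration order, e.g. with
+ * illustrative values only:
+ *
+ *	pagebuf 42 0 3 17 ...
+ */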
+
+STATIC void
+pagebuf_shaker(void)
+{
+	pagebuf_daemon_wakeup(1);
+}
+
+
+/*
+ *	Initialization and Termination
+ */
+
+int __init
+pagebuf_init(void)
+{
+	int			i;
+
+	pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1);
+
+#ifdef CONFIG_PROC_FS
+	if (proc_mkdir("fs/pagebuf", 0))
+		create_proc_read_entry(
+			"fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL);
+#endif
+
+	pagebuf_cache = kmem_cache_create("page_buf_t",
+			sizeof(page_buf_private_t), 0,
+			SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (pagebuf_cache == NULL) {
+		printk("pagebuf: couldn't init pagebuf cache\n");
+		pagebuf_terminate();
+		return -ENOMEM;
+	}
+
+	if (_pagebuf_prealloc_bh(NR_RESERVED_BH) < NR_RESERVED_BH) {
+		printk("pagebuf: couldn't pre-allocate %d buffer heads\n",
+			NR_RESERVED_BH);
+		pagebuf_terminate();
+		return -ENOMEM;
+	}
+
+	init_waitqueue_head(&pb_resv_bh_wait);
+
+	for (i = 0; i < NHASH; i++) {
+		spin_lock_init(&pbhash[i].pb_hash_lock);
+		INIT_LIST_HEAD(&pbhash[i].pb_hash);
+	}
+
+#ifdef PAGEBUF_TRACE
+# if 1
+	pb_trace.buf = (pagebuf_trace_t *)kmalloc(
+			PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t), GFP_KERNEL);
+# else
+	/* Alternatively, for really really long trace bufs */
+	pb_trace.buf = (pagebuf_trace_t *)vmalloc(
+			PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t));
+# endif
+	memset(pb_trace.buf, 0, PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t));
+	pb_trace.start = 0;
+	pb_trace.end = PB_TRACE_BUFSIZE - 1;
+#endif
+
+	pagebuf_daemon_start();
+	kmem_shake_register(pagebuf_shaker);
+	return 0;
+}
+
+/*
+ *	pagebuf_terminate.
+ *
+ *	Note: do not mark as __exit, this is also called from the __init code.
+ */
+void
+pagebuf_terminate(void)
+{
+	pagebuf_daemon_stop();
+
+	/* pagebuf_terminate may run from a failed pagebuf_init,
+	 * before the cache exists */
+	if (pagebuf_cache)
+		kmem_cache_destroy(pagebuf_cache);
+	kmem_shake_deregister(pagebuf_shaker);
+
+	unregister_sysctl_table(pagebuf_table_header);
+#ifdef	CONFIG_PROC_FS
+	remove_proc_entry("fs/pagebuf/stat", NULL);
+	remove_proc_entry("fs/pagebuf", NULL);
+#endif
+}
+
+
+/*
+ *	Module management (for kernel debugger module)
+ */
+EXPORT_SYMBOL(pagebuf_offset);
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf.h linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI
+ */
+
+#ifndef __PAGE_BUF_H__
+#define __PAGE_BUF_H__
+
+#include <linux/version.h>
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <asm/system.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/uio.h>
+
+
+/*
+ * Turn this on to get pagebuf lock ownership tracking:
+ * #define PAGEBUF_LOCK_TRACKING
+ */
+
+/*
+ *	Base types
+ */
+
+/* daddr must be signed since -1 is used for bmaps that are not yet allocated */
+typedef loff_t page_buf_daddr_t;
+
+#define PAGE_BUF_DADDR_NULL ((page_buf_daddr_t) (-1LL))
+
+typedef size_t page_buf_dsize_t;		/* size of buffer in blocks */
+
+#define page_buf_ctob(pp)	((pp) * PAGE_CACHE_SIZE)
+#define page_buf_btoc(dd)	(((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT)
+#define page_buf_btoct(dd)	((dd) >> PAGE_CACHE_SHIFT)
+#define page_buf_poff(aa)	((aa) & ~PAGE_CACHE_MASK)
+
+typedef enum page_buf_rw_e {
+	PBRW_READ = 1,			/* transfer into target memory */
+	PBRW_WRITE = 2,			/* transfer from target memory */
+	PBRW_ZERO = 3			/* Zero target memory */
+} page_buf_rw_t;
+
+typedef enum {				/* pbm_flags values */
+	PBMF_EOF =		0x01,	/* mapping contains EOF		*/
+	PBMF_HOLE =		0x02,	/* mapping covers a hole	*/
+	PBMF_DELAY =		0x04,	/* mapping covers delalloc region  */
+	PBMF_UNWRITTEN =	0x20	/* mapping covers allocated	*/
+					/* but uninitialized XFS data	*/
+} bmap_flags_t;
+
+typedef enum page_buf_flags_e {		/* pb_flags values */
+	PBF_READ = (1 << 0),	/* buffer intended for reading from device */
+	PBF_WRITE = (1 << 1),	/* buffer intended for writing to device   */
+	PBF_MAPPED = (1 << 2),	/* buffer mapped (pb_addr valid)	   */
+	PBF_PARTIAL = (1 << 3), /* buffer partially read		   */
+	PBF_ASYNC = (1 << 4),	/* initiator will not wait for completion  */
+	PBF_NONE = (1 << 5),	/* buffer not read at all		   */
+	PBF_DELWRI = (1 << 6),	/* buffer has dirty pages		   */
+	PBF_FREED = (1 << 7),	/* buffer has been freed and is invalid	   */
+	PBF_SYNC = (1 << 8),	/* force updates to disk		   */
+	PBF_MAPPABLE = (1 << 9),/* use directly-addressable pages	   */
+	PBF_STALE = (1 << 10),	/* buffer has been staled, do not find it  */
+	PBF_FS_MANAGED = (1 << 11), /* filesystem controls freeing memory  */
+	PBF_RELEASE = (1 << 12),/* buffer to be released after I/O is done */
+
+	/* flags used only as arguments to access routines */
+	PBF_LOCK = (1 << 13),	/* lock requested			   */
+	PBF_TRYLOCK = (1 << 14), /* lock requested, but do not wait	   */
+	PBF_ALLOCATE = (1 << 15), /* allocate all pages		  (UNUSED) */
+	PBF_FILE_ALLOCATE = (1 << 16), /* allocate all file space	   */
+	PBF_DONT_BLOCK = (1 << 17), /* do not block in current thread	   */
+	PBF_DIRECT = (1 << 18),	  /* direct I/O desired			   */
+	PBF_ENTER_PAGES = (1 << 21), /* create invalid pages for all	   */
+				/* pages in the range of the buffer	   */
+				/* not already associated with buffer	   */
+
+	/* flags used only internally */
+	_PBF_LOCKABLE = (1 << 19), /* page_buf_t may be locked		   */
+	_PBF_PRIVATE_BH = (1 << 20), /* do not use public buffer heads	   */
+	_PBF_ALL_PAGES_MAPPED = (1 << 22),
+				/* all pages in range are mapped	   */
+	_PBF_SOME_INVALID_PAGES = (1 << 23),
+				/* some mapped pages are not valid	   */
+	_PBF_ADDR_ALLOCATED = (1 << 24),
+				/* pb_addr space was allocated		   */
+	_PBF_MEM_ALLOCATED = (1 << 25),
+				/* pb_mem and underlying pages allocated   */
+
+	PBF_FORCEIO = (1 << 27),
+	PBF_FLUSH = (1 << 28),	/* flush disk write cache */
+	PBF_READ_AHEAD = (1 << 29),
+	PBF_FS_RESERVED_3 = (1 << 31)	/* reserved (XFS use: XFS_B_STALE) */
+
+} page_buf_flags_t;
+
+#define PBF_UPDATE (PBF_READ | PBF_WRITE)
+#define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0)
+#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0)
+
+#define PBR_SECTOR_ONLY	1	/* only use sector size buffer heads */
+#define PBR_ALIGNED_ONLY 2	/* only use aligned I/O */
+
+typedef struct pb_target {
+	int			pbr_flags;
+	dev_t			pbr_dev;
+	kdev_t			pbr_kdev;
+	struct block_device	*pbr_bdev;
+	struct address_space	*pbr_mapping;
+	unsigned int		pbr_blocksize;
+	unsigned int		pbr_blocksize_bits;
+} pb_target_t;
+
+/*
+ *	page_buf_bmap_t:  File system I/O map
+ *
+ * The pbm_bn field is expressed in disk blocks and locates the start
+ * of the underlying backing store for the mapping.
+ *
+ * The pbm_offset, pbm_bsize and pbm_delta fields are in bytes:
+ * pbm_offset is the byte offset of the mapping within the file,
+ * pbm_bsize is the size of the mapping, and pbm_delta is the offset
+ * of the desired data from the beginning of the mapping, given the
+ * offset supplied to the file I/O map routine.
+ *
+ * When a request is made to read beyond the logical end of the object,
+ * pbm_bsize may be set to 0, but pbm_bn and pbm_offset should still
+ * describe whatever underlying storage has been allocated, if any.
+ */
+
+typedef struct page_buf_bmap_s {
+	page_buf_daddr_t pbm_bn;	/* block number in file system	    */
+	pb_target_t	*pbm_target;	/* device to do I/O to		    */
+	loff_t		pbm_offset;	/* byte offset of mapping in file   */
+	size_t		pbm_delta;	/* offset of request into bmap	    */
+	size_t		pbm_bsize;	/* size of this mapping in bytes    */
+	bmap_flags_t	pbm_flags;	/* options flags for mapping	    */
+} page_buf_bmap_t;
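+
+/*
+ * Worked example (made-up numbers): a request for data at byte offset
+ * 10240 of a file might fall in a mapping with pbm_offset = 8192 and
+ * pbm_bsize = 8192; the map routine then returns pbm_delta = 2048,
+ * and the caller finds its data that many bytes into the mapping.
+ */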
+
+typedef page_buf_bmap_t pb_bmap_t;
+
+
+/*
+ *	page_buf_t:  Buffer structure for page cache-based buffers
+ *
+ * This buffer structure is used by the page cache buffer management routines
+ * to refer to an assembly of pages forming a logical buffer.  The actual
+ * I/O is performed with buffer_head or bio structures, since drivers
+ * do not understand this structure themselves.  The buffer structure
+ * is used on a temporary basis only, and discarded when released.
+ *
+ * The real data storage is recorded in the page cache.	 Metadata is
+ * hashed to the inode for the block device on which the file system resides.
+ * File data is hashed to the inode for the file.  Pages which are only
+ * partially filled with data have bits set in their block_map entry
+ * to indicate which disk blocks in the page are not valid.
+ */
+
+struct page_buf_s;
+typedef void (*page_buf_iodone_t)(struct page_buf_s *);
+			/* call-back function on I/O completion */
+typedef void (*page_buf_relse_t)(struct page_buf_s *);
+			/* call-back function on final buffer release */
+typedef int (*page_buf_bdstrat_t)(struct page_buf_s *);
+
+#define PB_PAGES	4
+
+typedef struct page_buf_s {
+	struct list_head	pb_list;
+	page_buf_flags_t	pb_flags;	/* status flags */
+	struct list_head	pb_hash_list;
+	struct pb_target	*pb_target;	/* logical object */
+	atomic_t		pb_hold;	/* reference count */
+	page_buf_daddr_t	pb_bn;		/* block number for I/O */
+	loff_t			pb_file_offset; /* offset in file */
+	size_t			pb_buffer_length; /* size of buffer in bytes */
+	size_t			pb_count_desired; /* desired transfer size */
+	void			*pb_addr;	/* virtual address of buffer */
+	struct tq_struct	pb_iodone_sched;
+	page_buf_iodone_t	pb_iodone;	/* I/O completion function */
+	page_buf_relse_t	pb_relse;	/* releasing function */
+	page_buf_bdstrat_t	pb_strat;	/* pre-write function */
+	struct semaphore	pb_iodonesema;	/* Semaphore for I/O waiters */
+	void			*pb_fspriv;
+	void			*pb_fspriv2;
+	void			*pb_fspriv3;
+	unsigned short		pb_error;	/* error code on I/O */
+	unsigned short		pb_page_count;	/* size of page array */
+	unsigned short		pb_offset;	/* page offset in first page */
+	unsigned char		pb_locked;	/* page array is locked */
+	unsigned char		pb_hash_index;	/* hash table index	*/
+	struct page		**pb_pages;	/* array of page pointers */
+	struct page		*pb_page_array[PB_PAGES]; /* inline pages */
+} page_buf_t;
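+
+/*
+ * Note: for buffers spanning up to PB_PAGES pages, pb_pages simply
+ * points at the inline pb_page_array[]; larger buffers get a
+ * kmalloc()ed page array instead (see _pagebuf_get_pages).
+ */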
+
+
+/*
+ * page_buf module entry points
+ */
+
+/* Finding and Reading Buffers */
+
+extern page_buf_t *pagebuf_find(	/* find buffer for block if	*/
+					/* the block is in memory	*/
+		struct pb_target *,	/* inode for block		*/
+		loff_t,			/* starting offset of range	*/
+		size_t,			/* length of range		*/
+		page_buf_flags_t);	/* PBF_LOCK			*/
+
+extern page_buf_t *pagebuf_get(		/* allocate a buffer		*/
+		struct pb_target *,	/* inode for buffer		*/
+		loff_t,			/* starting offset of range	*/
+		size_t,			/* length of range		*/
+		page_buf_flags_t);	/* PBF_LOCK, PBF_READ, PBF_ALLOCATE, */
+					/* PBF_ASYNC			*/
+
+extern page_buf_t *pagebuf_lookup(
+		struct pb_target *,
+		struct inode *,
+		loff_t,			/* starting offset of range	*/
+		size_t,			/* length of range		*/
+		int);			/* PBF_ENTER_PAGES		*/
+
+extern page_buf_t *pagebuf_get_empty(	/* allocate pagebuf struct with */
+					/*  no memory or disk address	*/
+		struct pb_target *);	/* mount point "fake" inode	*/
+
+extern page_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct	*/
+					/* without disk address		*/
+		size_t len,
+		struct pb_target *);	/* mount point "fake" inode	*/
+
+extern int	pagebuf_associate_memory(
+		page_buf_t *,
+		void *,
+		size_t);
+
+
+extern void pagebuf_hold(		/* increment reference count	*/
+		page_buf_t *);		/* buffer to hold		*/
+
+extern void pagebuf_readahead(		/* read ahead into cache	*/
+		struct pb_target  *,	/* target for buffer (or NULL)	*/
+		loff_t,			/* starting offset of range	*/
+		size_t,			/* length of range		*/
+		int);			/* additional read flags	*/
+
+/* Writing and Releasing Buffers */
+
+extern void pagebuf_free(		/* deallocate a buffer		*/
+		page_buf_t *);		/* buffer to deallocate		*/
+
+extern void pagebuf_rele(		/* release hold on a buffer	*/
+		page_buf_t *);		/* buffer to release		*/
+
+/* Locking and Unlocking Buffers */
+
+extern int pagebuf_cond_lock(		/* lock buffer, if not locked	*/
+					/* (returns -EBUSY if locked)	*/
+		page_buf_t *);		/* buffer to lock		*/
+
+extern int pagebuf_lock_value(		/* return count on lock		*/
+		page_buf_t *);		/* buffer to check		*/
+
+extern int pagebuf_lock(		/* lock buffer			*/
+		page_buf_t *);		/* buffer to lock		*/
+
+extern void pagebuf_lock_disable(	/* disable buffer locking	*/
+		struct pb_target *,	/* inode for buffers		*/
+		int);			/* do blkdev_put?		*/
+
+extern struct pb_target *pagebuf_lock_enable(
+		dev_t,
+		int);			/* do blkdev_get?		*/
+
+extern void pagebuf_target_blocksize(
+		pb_target_t *,
+		unsigned int);		/* block size			*/
+
+extern void pagebuf_target_clear(struct pb_target *);
+
+extern void pagebuf_unlock(		/* unlock buffer		*/
+		page_buf_t *);		/* buffer to unlock		*/
+
+/* Buffer Utility Routines */
+
+#define pagebuf_geterror(pb)	((pb)->pb_error)
+
+extern void pagebuf_queue_task(
+		struct tq_struct *);
+
+extern void pagebuf_iodone(		/* mark buffer I/O complete	*/
+		page_buf_t *);		/* buffer to mark		*/
+
+extern void pagebuf_ioerror(		/* mark buffer in error (or not) */
+		page_buf_t *,		/* buffer to mark		*/
+		unsigned int);		/* error to store (0 if none)	*/
+
+extern int pagebuf_iostart(		/* start I/O on a buffer	*/
+		page_buf_t *,		/* buffer to start		*/
+		page_buf_flags_t);	/* PBF_LOCK, PBF_ASYNC, PBF_READ, */
+					/* PBF_WRITE, PBF_ALLOCATE,	*/
+					/* PBF_DELWRI, 			*/
+					/* PBF_SYNC			*/
+
+extern int pagebuf_iorequest(		/* start real I/O		*/
+		page_buf_t *);		/* buffer to convey to device	*/
+
+	/*
+	 * pagebuf_iorequest is the core I/O request routine.
+	 * It assumes that the buffer is well-formed and
+	 * mapped and ready for physical I/O, unlike
+	 * pagebuf_iostart().  Higher-level callers should go
+	 * through __pagebuf_iorequest() below, which calls the
+	 * buffer's pb_strat routine to start I/O, if one is set,
+	 * and otherwise calls pagebuf_iorequest() directly.
+	 */
+
+extern int pagebuf_iowait(		/* wait for buffer I/O done	*/
+		page_buf_t *);		/* buffer to wait on		*/
+
+extern caddr_t	pagebuf_offset(page_buf_t *, off_t);
+
+extern void pagebuf_iomove(		/* move data in/out of pagebuf	*/
+		page_buf_t *,		/* buffer to manipulate		*/
+		off_t,			/* starting buffer offset	*/
+		size_t,			/* length in buffer		*/
+		caddr_t,		/* data pointer			*/
+		page_buf_rw_t);		/* direction			*/
+
+/* Pinning Buffer Storage in Memory */
+
+extern void pagebuf_pin(		/* pin buffer in memory		*/
+		page_buf_t *);		/* buffer to pin		*/
+
+extern void pagebuf_unpin(		/* unpin buffered data		*/
+		page_buf_t *);		/* buffer to unpin		*/
+
+extern int pagebuf_ispin( page_buf_t *); /* check if pagebuf is pinned	*/
+
+/* Reading and writing pages */
+
+extern int pagebuf_write_full_page(	/* write a page via pagebuf	*/
+		struct page *,		/* page to write		*/
+		int delalloc);		/* delalloc bh present		*/
+
+extern int pagebuf_release_page(	/* Attempt to convert a delalloc page */
+		struct page *);		/* page to release		*/
+
+extern void pagebuf_delwri_queue(page_buf_t *, int);
+extern void pagebuf_delwri_dequeue(page_buf_t *);
+
+#define PBDF_WAIT    0x01
+#define PBDF_TRYLOCK 0x02
+extern void pagebuf_delwri_flush(
+		struct pb_target *,
+		unsigned long,
+		int *);
+
+extern int pagebuf_init(void);
+extern void pagebuf_terminate(void);
+
+static __inline__ int __pagebuf_iorequest(page_buf_t *pb)
+{
+	if (pb->pb_strat)
+		return pb->pb_strat(pb);
+	return pagebuf_iorequest(pb);
+}
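+
+/*
+ * Typical call sequence (a sketch, not lifted from the XFS sources):
+ * a synchronous, locked metadata read might look like
+ *
+ *	pb = pagebuf_get(target, offset, len, PBF_LOCK | PBF_READ);
+ *	if (pb) {
+ *		if (!pagebuf_geterror(pb))
+ *			use_data(pagebuf_offset(pb, 0));
+ *		pagebuf_unlock(pb);
+ *		pagebuf_rele(pb);
+ *	}
+ *
+ * where use_data() stands in for the caller's own processing.
+ */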
+
+#endif /* __PAGE_BUF_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf.lst linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf.lst
--- linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf.lst	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf.lst	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,7580 @@
+
+/repo/linux-2.4-xfs/fs/xfs/pagebuf/page_buf.o:     file format elf32-i386
+
+Disassembly of section .text:
+
+0000000000000000 <_bhash>:
+       0:	56                   	push   %esi
+       1:	53                   	push   %ebx
+       2:	8b 4c 24 10          	mov    0x10(%esp,1),%ecx
+       6:	8b 5c 24 14          	mov    0x14(%esp,1),%ebx
+       a:	0f ac d9 09          	shrd   $0x9,%ebx,%ecx
+       e:	c1 fb 09             	sar    $0x9,%ebx
+      11:	0f b7 44 24 0c       	movzwl 0xc(%esp,1),%eax
+      16:	31 d2                	xor    %edx,%edx
+      18:	31 c1                	xor    %eax,%ecx
+      1a:	31 d3                	xor    %edx,%ebx
+      1c:	31 f6                	xor    %esi,%esi
+      1e:	89 c8                	mov    %ecx,%eax
+      20:	09 d8                	or     %ebx,%eax
+      22:	74 1c                	je     40 <_bhash+0x40>
+      24:	89 c8                	mov    %ecx,%eax
+      26:	83 e0 1f             	and    $0x1f,%eax
+      29:	31 c6                	xor    %eax,%esi
+      2b:	0f ac d9 05          	shrd   $0x5,%ebx,%ecx
+      2f:	c1 fb 05             	sar    $0x5,%ebx
+      32:	83 c2 05             	add    $0x5,%edx
+      35:	89 c8                	mov    %ecx,%eax
+      37:	09 d8                	or     %ebx,%eax
+      39:	74 05                	je     40 <_bhash+0x40>
+      3b:	83 fa 3f             	cmp    $0x3f,%edx
+      3e:	76 e4                	jbe    24 <_bhash+0x24>
+      40:	89 f0                	mov    %esi,%eax
+      42:	5b                   	pop    %ebx
+      43:	5e                   	pop    %esi
+      44:	c3                   	ret    
+      45:	8d 76 00             	lea    0x0(%esi),%esi
+
+0000000000000048 <free_address>:
+{
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+      48:	81 3d 44 00 00 00 ad 	cmpl   $0xdead4ead,0x44
+      4f:	4e ad de 
+      52:	74 1a                	je     6e <free_address+0x26>
+printk("eip: %p\n", &&here);
+      54:	68 48 00 00 00       	push   $0x48
+      59:	68 2b 00 00 00       	push   $0x2b
+      5e:	e8 fc ff ff ff       	call   5f <free_address+0x17>
+		BUG();
+      63:	0f 0b                	ud2a   
+      65:	85 00                	test   %eax,(%eax)
+      67:	00 00                	add    %al,(%eax)
+      69:	00 00                	add    %al,(%eax)
+	}
+      6b:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+      6e:	f0 fe 0d 40 00 00 00 	lock decb 0x40
+      75:	0f 88 4e 2d 00 00    	js     2dc9 <Letext>
+{
+	a_list_t	*aentry;
+
+	spin_lock(&as_lock);
+	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
+      7b:	6a 20                	push   $0x20
+      7d:	6a 08                	push   $0x8
+      7f:	e8 fc ff ff ff       	call   80 <free_address+0x38>
+      84:	89 c2                	mov    %eax,%edx
+	aentry->next = as_free_head;
+      86:	a1 e0 05 00 00       	mov    0x5e0,%eax
+      8b:	89 42 04             	mov    %eax,0x4(%edx)
+	aentry->vm_addr = addr;
+      8e:	8b 44 24 0c          	mov    0xc(%esp,1),%eax
+      92:	89 02                	mov    %eax,(%edx)
+	as_free_head = aentry;
+      94:	89 15 e0 05 00 00    	mov    %edx,0x5e0
+	as_list_len++;
+      9a:	ff 05 e4 05 00 00    	incl   0x5e4
+		:"=q" (oldval), "=m" (lock->lock) \
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+      a0:	83 c4 08             	add    $0x8,%esp
+	char oldval = 1;
+      a3:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+      a5:	81 3d 44 00 00 00 ad 	cmpl   $0xdead4ead,0x44
+      ac:	4e ad de 
+      af:	74 08                	je     b9 <free_address+0x71>
+		BUG();
+      b1:	0f 0b                	ud2a   
+      b3:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+      b9:	a0 40 00 00 00       	mov    0x40,%al
+      be:	84 c0                	test   %al,%al
+      c0:	7e 08                	jle    ca <free_address+0x82>
+		BUG();
+      c2:	0f 0b                	ud2a   
+      c4:	6b 00 00             	imul   $0x0,(%eax),%eax
+      c7:	00 00                	add    %al,(%eax)
+      c9:	00 86 15 40 00 00    	add    %al,0x4015(%esi)
+#endif
+	__asm__ __volatile__(
+      cf:	00 c3                	add    %al,%bl
+	spin_unlock(&as_lock);
+}
+      d1:	8d 76 00             	lea    0x0(%esi),%esi
+
+00000000000000d4 <purge_addresses>:
+
+STATIC void
+purge_addresses(void)
+{
+	a_list_t	*aentry, *old;
+
+	if (as_free_head == NULL) return;
+      d4:	56                   	push   %esi
+      d5:	53                   	push   %ebx
+      d6:	83 3d e0 05 00 00 00 	cmpl   $0x0,0x5e0
+      dd:	0f 84 97 00 00 00    	je     17a <purge_addresses+0xa6>
+{
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+      e3:	81 3d 44 00 00 00 ad 	cmpl   $0xdead4ead,0x44
+      ea:	4e ad de 
+      ed:	74 1a                	je     109 <purge_addresses+0x35>
+printk("eip: %p\n", &&here);
+      ef:	68 e3 00 00 00       	push   $0xe3
+      f4:	68 2b 00 00 00       	push   $0x2b
+      f9:	e8 fc ff ff ff       	call   fa <purge_addresses+0x26>
+		BUG();
+      fe:	0f 0b                	ud2a   
+     100:	85 00                	test   %eax,(%eax)
+     102:	00 00                	add    %al,(%eax)
+     104:	00 00                	add    %al,(%eax)
+	}
+     106:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+     109:	f0 fe 0d 40 00 00 00 	lock decb 0x40
+     110:	0f 88 c3 2c 00 00    	js     2dd9 <Letext+0x10>
+
+	spin_lock(&as_lock);
+	aentry = as_free_head;
+     116:	8b 1d e0 05 00 00    	mov    0x5e0,%ebx
+	as_free_head = NULL;
+     11c:	c7 05 e0 05 00 00 00 	movl   $0x0,0x5e0
+     123:	00 00 00 
+	as_list_len = 0;
+     126:	c7 05 e4 05 00 00 00 	movl   $0x0,0x5e4
+     12d:	00 00 00 
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+     130:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+     132:	81 3d 44 00 00 00 ad 	cmpl   $0xdead4ead,0x44
+     139:	4e ad de 
+     13c:	74 08                	je     146 <purge_addresses+0x72>
+		BUG();
+     13e:	0f 0b                	ud2a   
+     140:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+     146:	a0 40 00 00 00       	mov    0x40,%al
+     14b:	84 c0                	test   %al,%al
+     14d:	7e 08                	jle    157 <purge_addresses+0x83>
+		BUG();
+     14f:	0f 0b                	ud2a   
+     151:	6b 00 00             	imul   $0x0,(%eax),%eax
+     154:	00 00                	add    %al,(%eax)
+     156:	00 86 15 40 00 00    	add    %al,0x4015(%esi)
+#endif
+	__asm__ __volatile__(
+     15c:	00 eb                	add    %ch,%bl
+	spin_unlock(&as_lock);
+
+	while ((old = aentry) != NULL) {
+     15e:	15 90 8b 03 50       	adc    $0x50038b90,%eax
+		vfree(aentry->vm_addr);
+     163:	e8 fc ff ff ff       	call   164 <purge_addresses+0x90>
+		aentry = aentry->next;
+     168:	8b 5b 04             	mov    0x4(%ebx),%ebx
+		kfree(old);
+     16b:	56                   	push   %esi
+     16c:	e8 fc ff ff ff       	call   16d <purge_addresses+0x99>
+	}
+     171:	83 c4 08             	add    $0x8,%esp
+     174:	89 de                	mov    %ebx,%esi
+     176:	85 db                	test   %ebx,%ebx
+     178:	75 e6                	jne    160 <purge_addresses+0x8c>
+}
+     17a:	5b                   	pop    %ebx
+     17b:	5e                   	pop    %esi
+     17c:	c3                   	ret    
+     17d:	8d 76 00             	lea    0x0(%esi),%esi
+
+0000000000000180 <_pagebuf_initialize>:
+
+/*
+ *	Locking model:
+ *
+ *	Buffers associated with inodes for which buffer locking
+ *	is not enabled are not protected by semaphores, and are
+ *	assumed to be exclusively owned by the caller.	There is
+ *	spinlock in the buffer, for use by the caller when concurrent
+ *	access is possible.
+ */
+
+/*
+ *	Internal pagebuf object manipulation
+ */
+
+STATIC void
+_pagebuf_initialize(
+	page_buf_t		*pb,
+	pb_target_t		*target,
+	loff_t			range_base,
+	size_t			range_length,
+	page_buf_flags_t	flags)
+{
+     180:	55                   	push   %ebp
+     181:	57                   	push   %edi
+     182:	56                   	push   %esi
+     183:	53                   	push   %ebx
+     184:	8b 74 24 24          	mov    0x24(%esp,1),%esi
+	/*
+	 * We don't want certain flags to appear in pb->pb_flags.
+	 */
+	flags &= ~(PBF_LOCK|PBF_ENTER_PAGES|PBF_MAPPED);
+	flags &= ~(PBF_DONT_BLOCK|PBF_READ_AHEAD);
+     188:	8b 6c 24 28          	mov    0x28(%esp,1),%ebp
+     18c:	81 e5 fb df dd df    	and    $0xdfdddffb,%ebp
+ * This looks horribly ugly, but the compiler can optimize it totally,
+ * as we by now know that both pattern and count is constant..
+ */
+static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
+{
+     192:	31 c0                	xor    %eax,%eax
+	switch (count) {
+		case 0:
+			return s;
+		case 1:
+			*(unsigned char *)s = pattern;
+			return s;
+		case 2:
+			*(unsigned short *)s = pattern;
+			return s;
+		case 3:
+			*(unsigned short *)s = pattern;
+			*(2+(unsigned char *)s) = pattern;
+			return s;
+		case 4:
+			*(unsigned long *)s = pattern;
+			return s;
+	}
+#define COMMON(x) \
+__asm__  __volatile__( \
+	"rep ; stosl" \
+	x \
+	: "=&c" (d0), "=&D" (d1) \
+	: "a" (pattern),"0" (count/4),"1" ((long) s) \
+	: "memory")
+{
+	int d0, d1;
+	switch (count % 4) {
+		case 0: COMMON(""); return s;
+     194:	8b 7c 24 14          	mov    0x14(%esp,1),%edi
+     198:	b9 33 00 00 00       	mov    $0x33,%ecx
+     19d:	f3 ab                	repz stos %eax,%es:(%edi)
+
+	pb_tracking_get(pb);
+
+	memset(pb, 0, sizeof(page_buf_private_t));
+	atomic_set(&pb->pb_hold, 1);
+     19f:	8b 44 24 14          	mov    0x14(%esp,1),%eax
+     1a3:	c7 40 18 01 00 00 00 	movl   $0x1,0x18(%eax)
+ *
+ * i'd rather use the more flexible initialization above, but sadly
+ * GCC 2.7.2.3 emits a bogus warning. EGCS doesnt. Oh well.
+ */
+	atomic_set(&sem->count, val);
+     1aa:	c7 40 58 00 00 00 00 	movl   $0x0,0x58(%eax)
+	sem->sleepers = 0;
+     1b1:	c7 40 5c 00 00 00 00 	movl   $0x0,0x5c(%eax)
+#if WAITQUEUE_DEBUG
+	if (!q)
+		WQ_BUG();
+#endif
+	q->lock = WAITQUEUE_RW_LOCK_UNLOCKED;
+     1b8:	8b 5c 24 14          	mov    0x14(%esp,1),%ebx
+     1bc:	89 c1                	mov    %eax,%ecx
+     1be:	83 c1 60             	add    $0x60,%ecx
+     1c1:	b8 01 00 00 00       	mov    $0x1,%eax
+     1c6:	ba ad 4e ad de       	mov    $0xdead4ead,%edx
+     1cb:	89 43 60             	mov    %eax,0x60(%ebx)
+     1ce:	89 53 64             	mov    %edx,0x64(%ebx)
+	INIT_LIST_HEAD(&q->task_list);
+     1d1:	89 d8                	mov    %ebx,%eax
+     1d3:	83 c0 68             	add    $0x68,%eax
+     1d6:	89 41 08             	mov    %eax,0x8(%ecx)
+     1d9:	89 41 0c             	mov    %eax,0xc(%ecx)
+	init_MUTEX_LOCKED(&pb->pb_iodonesema);
+	INIT_LIST_HEAD(&pb->pb_list);
+     1dc:	89 1b                	mov    %ebx,(%ebx)
+     1de:	89 5b 04             	mov    %ebx,0x4(%ebx)
+	INIT_LIST_HEAD(&pb->pb_hash_list);
+     1e1:	83 c0 a4             	add    $0xffffffa4,%eax
+     1e4:	89 43 0c             	mov    %eax,0xc(%ebx)
+     1e7:	89 43 10             	mov    %eax,0x10(%ebx)
+ *
+ * i'd rather use the more flexible initialization above, but sadly
+ * GCC 2.7.2.3 emits a bogus warning. EGCS doesnt. Oh well.
+ */
+	atomic_set(&sem->count, val);
+     1ea:	c7 83 98 00 00 00 00 	movl   $0x0,0x98(%ebx)
+     1f1:	00 00 00 
+	sem->sleepers = 0;
+     1f4:	c7 83 9c 00 00 00 00 	movl   $0x0,0x9c(%ebx)
+     1fb:	00 00 00 
+#if WAITQUEUE_DEBUG
+	if (!q)
+		WQ_BUG();
+#endif
+	q->lock = WAITQUEUE_RW_LOCK_UNLOCKED;
+     1fe:	8b 7c 24 14          	mov    0x14(%esp,1),%edi
+     202:	89 d9                	mov    %ebx,%ecx
+     204:	81 c1 a0 00 00 00    	add    $0xa0,%ecx
+     20a:	b8 01 00 00 00       	mov    $0x1,%eax
+     20f:	89 87 a0 00 00 00    	mov    %eax,0xa0(%edi)
+     215:	89 97 a4 00 00 00    	mov    %edx,0xa4(%edi)
+	INIT_LIST_HEAD(&q->task_list);
+     21b:	89 f8                	mov    %edi,%eax
+     21d:	05 a8 00 00 00       	add    $0xa8,%eax
+     222:	89 41 08             	mov    %eax,0x8(%ecx)
+     225:	89 41 0c             	mov    %eax,0xc(%ecx)
+	init_MUTEX_LOCKED(&PBP(pb)->pb_sema); /* held, no waiters */
+	PB_SET_OWNER(pb);
+	pb->pb_target = target;
+     228:	8b 44 24 18          	mov    0x18(%esp,1),%eax
+     22c:	89 47 14             	mov    %eax,0x14(%edi)
+	pb->pb_file_offset = range_base;
+     22f:	8b 44 24 1c          	mov    0x1c(%esp,1),%eax
+     233:	8b 54 24 20          	mov    0x20(%esp,1),%edx
+     237:	8b 4c 24 14          	mov    0x14(%esp,1),%ecx
+     23b:	89 41 24             	mov    %eax,0x24(%ecx)
+     23e:	89 51 28             	mov    %edx,0x28(%ecx)
+	/*
+	 * Set buffer_length and count_desired to the same value initially.
+	 * IO routines should use count_desired, which will be the same in
+	 * most cases but may be reset (e.g. XFS recovery).
+	 */
+	pb->pb_buffer_length = pb->pb_count_desired = range_length;
+     241:	89 71 30             	mov    %esi,0x30(%ecx)
+     244:	89 71 2c             	mov    %esi,0x2c(%ecx)
+	pb->pb_flags = flags | PBF_NONE;
+     247:	83 cd 20             	or     $0x20,%ebp
+     24a:	89 69 08             	mov    %ebp,0x8(%ecx)
+	pb->pb_bn = PAGE_BUF_DADDR_NULL;
+     24d:	8b 5c 24 14          	mov    0x14(%esp,1),%ebx
+     251:	c7 43 1c ff ff ff ff 	movl   $0xffffffff,0x1c(%ebx)
+     258:	c7 43 20 ff ff ff ff 	movl   $0xffffffff,0x20(%ebx)
+	atomic_set(&PBP(pb)->pb_pin_count, 0);
+     25f:	c7 83 b8 00 00 00 00 	movl   $0x0,0xb8(%ebx)
+     266:	00 00 00 
+#if WAITQUEUE_DEBUG
+	if (!q)
+		WQ_BUG();
+#endif
+	q->lock = WAITQUEUE_RW_LOCK_UNLOCKED;
+     269:	8b 7c 24 14          	mov    0x14(%esp,1),%edi
+     26d:	89 d9                	mov    %ebx,%ecx
+     26f:	81 c1 bc 00 00 00    	add    $0xbc,%ecx
+     275:	b8 01 00 00 00       	mov    $0x1,%eax
+     27a:	ba ad 4e ad de       	mov    $0xdead4ead,%edx
+     27f:	89 87 bc 00 00 00    	mov    %eax,0xbc(%edi)
+     285:	89 97 c0 00 00 00    	mov    %edx,0xc0(%edi)
+	INIT_LIST_HEAD(&q->task_list);
+     28b:	89 f8                	mov    %edi,%eax
+     28d:	05 c4 00 00 00       	add    $0xc4,%eax
+     292:	89 41 08             	mov    %eax,0x8(%ecx)
+     295:	89 41 0c             	mov    %eax,0xc(%ecx)
+	init_waitqueue_head(&PBP(pb)->pb_waiters);
+
+	PB_STATS_INC(pbstats.pb_create);
+     298:	ff 05 04 00 00 00    	incl   0x4
+     29e:	5b                   	pop    %ebx
+     29f:	5e                   	pop    %esi
+     2a0:	5f                   	pop    %edi
+     2a1:	5d                   	pop    %ebp
+     2a2:	c3                   	ret    
+	PB_TRACE(pb, PB_TRACE_REC(get), target);
+}
+     2a3:	90                   	nop    
+
+00000000000002a4 <_pagebuf_get_pages>:
+
+/*
+ * Allocate a page array capable of holding a specified number
+ * of pages, and point the page buf at it.
+ */
+STATIC int
+_pagebuf_get_pages(
+	page_buf_t		*pb,
+	int			page_count,
+	int			flags)
+{
+     2a4:	57                   	push   %edi
+     2a5:	53                   	push   %ebx
+     2a6:	8b 7c 24 0c          	mov    0xc(%esp,1),%edi
+     2aa:	8b 5c 24 10          	mov    0x10(%esp,1),%ebx
+     2ae:	8b 54 24 14          	mov    0x14(%esp,1),%edx
+	int			gpf_mask = pb_to_gfp(flags);
+     2b2:	f7 c2 00 00 00 20    	test   $0x20000000,%edx
+     2b8:	75 17                	jne    2d1 <_pagebuf_get_pages+0x2d>
+     2ba:	81 e2 00 00 02 00    	and    $0x20000,%edx
+     2c0:	b9 f0 01 00 00       	mov    $0x1f0,%ecx
+     2c5:	b8 f0 00 00 00       	mov    $0xf0,%eax
+     2ca:	85 d2                	test   %edx,%edx
+     2cc:	0f 45 c8             	cmovne %eax,%ecx
+     2cf:	eb 02                	jmp    2d3 <_pagebuf_get_pages+0x2f>
+     2d1:	31 c9                	xor    %ecx,%ecx
+
+	/* Make sure that we have a page list */
+	if (pb->pb_pages == NULL) {
+     2d3:	83 bf 84 00 00 00 00 	cmpl   $0x0,0x84(%edi)
+     2da:	75 66                	jne    342 <_pagebuf_get_pages+0x9e>
+		pb->pb_offset = page_buf_poff(pb->pb_file_offset);
+     2dc:	0f b7 47 24          	movzwl 0x24(%edi),%eax
+     2e0:	80 e4 0f             	and    $0xf,%ah
+     2e3:	66 89 87 80 00 00 00 	mov    %ax,0x80(%edi)
+		pb->pb_page_count = page_count;
+     2ea:	66 89 5f 7e          	mov    %bx,0x7e(%edi)
+		if (page_count <= PB_PAGES) {
+     2ee:	83 fb 04             	cmp    $0x4,%ebx
+     2f1:	7e 20                	jle    313 <_pagebuf_get_pages+0x6f>
+			pb->pb_pages = pb->pb_page_array;
+		} else {
+			pb->pb_pages = kmalloc(sizeof(struct page *) *
+     2f3:	51                   	push   %ecx
+     2f4:	c1 e3 02             	shl    $0x2,%ebx
+     2f7:	53                   	push   %ebx
+     2f8:	e8 fc ff ff ff       	call   2f9 <_pagebuf_get_pages+0x55>
+     2fd:	89 87 84 00 00 00    	mov    %eax,0x84(%edi)
+					page_count, gpf_mask);
+			if (pb->pb_pages == NULL)
+     303:	83 c4 08             	add    $0x8,%esp
+     306:	89 da                	mov    %ebx,%edx
+     308:	85 c0                	test   %eax,%eax
+     30a:	75 1a                	jne    326 <_pagebuf_get_pages+0x82>
+				return -ENOMEM;
+     30c:	b8 f4 ff ff ff       	mov    $0xfffffff4,%eax
+     311:	eb 31                	jmp    344 <_pagebuf_get_pages+0xa0>
+     313:	8d 87 88 00 00 00    	lea    0x88(%edi),%eax
+     319:	89 87 84 00 00 00    	mov    %eax,0x84(%edi)
+     31f:	8d 14 9d 00 00 00 00 	lea    0x0(,%ebx,4),%edx
+ * things 32 bits at a time even when we don't know the size of the
+ * area at compile-time..
+ */
+static inline void * __constant_c_memset(void * s, unsigned long c, size_t count)
+{
+     326:	8b bf 84 00 00 00    	mov    0x84(%edi),%edi
+		}
+		memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
+     32c:	31 c0                	xor    %eax,%eax
+ */
+static inline void * __constant_c_memset(void * s, unsigned long c, size_t count)
+{
+int d0, d1;
+__asm__ __volatile__(
+     32e:	89 d1                	mov    %edx,%ecx
+     330:	c1 e9 02             	shr    $0x2,%ecx
+     333:	f3 ab                	repz stos %eax,%es:(%edi)
+     335:	f6 c2 02             	test   $0x2,%dl
+     338:	74 02                	je     33c <_pagebuf_get_pages+0x98>
+     33a:	66 ab                	stos   %ax,%es:(%edi)
+     33c:	f6 c2 01             	test   $0x1,%dl
+     33f:	74 01                	je     342 <_pagebuf_get_pages+0x9e>
+     341:	aa                   	stos   %al,%es:(%edi)
+	}
+	return 0;
+     342:	31 c0                	xor    %eax,%eax
+     344:	5b                   	pop    %ebx
+     345:	5f                   	pop    %edi
+     346:	c3                   	ret    
+}
+     347:	90                   	nop    
+
+0000000000000348 <_pagebuf_free_object>:
+
+/*
+ * Walk a pagebuf releasing all the pages contained within it.
+ */
+STATIC inline void
+_pagebuf_freepages(
+	page_buf_t		*pb)
+{
+	int			buf_index;
+
+	for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) {
+		struct page	*page = pb->pb_pages[buf_index];
+
+		if (page) {
+			pb->pb_pages[buf_index] = NULL;
+			page_cache_release(page);
+		}
+	}
+
+	if (pb->pb_pages != pb->pb_page_array)
+		kfree(pb->pb_pages);
+}
+
+/*
+ *	_pagebuf_free_object
+ *
+ *	_pagebuf_free_object releases the contents specified buffer.
+ *	The modification state of any associated pages is left unchanged.
+ */
+void
+_pagebuf_free_object(
+	pb_hash_t	*hash,	/* hash bucket for buffer	*/
+	page_buf_t	*pb)	/* buffer to deallocate		*/
+{
+     348:	57                   	push   %edi
+     349:	56                   	push   %esi
+     34a:	53                   	push   %ebx
+     34b:	8b 74 24 14          	mov    0x14(%esp,1),%esi
+     34f:	8b 5c 24 10          	mov    0x10(%esp,1),%ebx
+	int		pb_flags = pb->pb_flags;
+     353:	8b 7e 08             	mov    0x8(%esi),%edi
+
+	PB_TRACE(pb, PB_TRACE_REC(free_obj), 0);
+	pb->pb_flags |= PBF_FREED;
+     356:	89 f8                	mov    %edi,%eax
+     358:	0c 80                	or     $0x80,%al
+     35a:	89 46 08             	mov    %eax,0x8(%esi)
+
+	if (hash) {
+     35d:	85 db                	test   %ebx,%ebx
+     35f:	74 44                	je     3a5 <_pagebuf_free_object+0x5d>
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static __inline__ int list_empty(struct list_head *head)
+{
+     361:	8d 4e 0c             	lea    0xc(%esi),%ecx
+		if (!list_empty(&pb->pb_hash_list)) {
+     364:	39 4e 0c             	cmp    %ecx,0xc(%esi)
+     367:	74 17                	je     380 <_pagebuf_free_object+0x38>
+			hash->pb_count--;
+     369:	ff 4b 08             	decl   0x8(%ebx)
+ * the prev/next entries already!
+ */
+static __inline__ void __list_del(struct list_head * prev,
+				  struct list_head * next)
+{
+     36c:	8b 51 04             	mov    0x4(%ecx),%edx
+     36f:	8b 46 0c             	mov    0xc(%esi),%eax
+	next->prev = prev;
+     372:	89 50 04             	mov    %edx,0x4(%eax)
+	prev->next = next;
+     375:	89 02                	mov    %eax,(%edx)
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is in an undefined state.
+ */
+static __inline__ void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static __inline__ void list_del_init(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	INIT_LIST_HEAD(entry); 
+     377:	89 4e 0c             	mov    %ecx,0xc(%esi)
+     37a:	89 49 04             	mov    %ecx,0x4(%ecx)
+}
+     37d:	8d 76 00             	lea    0x0(%esi),%esi
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+     380:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+     382:	81 7b 10 ad 4e ad de 	cmpl   $0xdead4ead,0x10(%ebx)
+     389:	74 08                	je     393 <_pagebuf_free_object+0x4b>
+		BUG();
+     38b:	0f 0b                	ud2a   
+     38d:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+     393:	8a 43 0c             	mov    0xc(%ebx),%al
+     396:	84 c0                	test   %al,%al
+     398:	7e 08                	jle    3a2 <_pagebuf_free_object+0x5a>
+		BUG();
+     39a:	0f 0b                	ud2a   
+     39c:	6b 00 00             	imul   $0x0,(%eax),%eax
+     39f:	00 00                	add    %al,(%eax)
+     3a1:	00 86 53 0c 89 f8    	add    %al,0xf8890c53(%esi)
+			list_del_init(&pb->pb_hash_list);
+		}
+		spin_unlock(&hash->pb_hash_lock);
+	}
+
+	if (!(pb_flags & PBF_FREED)) {
+     3a7:	84 c0                	test   %al,%al
+     3a9:	0f 8c 96 00 00 00    	jl     445 <_pagebuf_free_object+0xfd>
+		/* release any virtual mapping */ ;
+		if (pb->pb_flags & _PBF_ADDR_ALLOCATED) {
+     3af:	f6 46 0b 01          	testb  $0x1,0xb(%esi)
+     3b3:	74 16                	je     3cb <_pagebuf_free_object+0x83>
+			void *vaddr = pagebuf_mapout_locked(pb);
+     3b5:	56                   	push   %esi
+     3b6:	e8 39 1b 00 00       	call   1ef4 <pagebuf_mapout_locked>
+			if (vaddr) {
+     3bb:	83 c4 04             	add    $0x4,%esp
+     3be:	85 c0                	test   %eax,%eax
+     3c0:	74 09                	je     3cb <_pagebuf_free_object+0x83>
+				free_address(vaddr);
+     3c2:	50                   	push   %eax
+     3c3:	e8 80 fc ff ff       	call   48 <free_address>
+			}
+     3c8:	83 c4 04             	add    $0x4,%esp
+		}
+
+		if (pb->pb_flags & _PBF_MEM_ALLOCATED) {
+     3cb:	f6 46 0b 02          	testb  $0x2,0xb(%esi)
+     3cf:	74 74                	je     445 <_pagebuf_free_object+0xfd>
+			if (pb->pb_pages) {
+     3d1:	8b 96 84 00 00 00    	mov    0x84(%esi),%edx
+     3d7:	85 d2                	test   %edx,%edx
+     3d9:	74 66                	je     441 <_pagebuf_free_object+0xf9>
+				/* release the pages in the address list */
+				if (pb->pb_pages[0] &&
+     3db:	8b 02                	mov    (%edx),%eax
+     3dd:	85 c0                	test   %eax,%eax
+     3df:	74 0f                	je     3f0 <_pagebuf_free_object+0xa8>
+#endif
+
+static __inline__ int constant_test_bit(int nr, const volatile void * addr)
+{
+	return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
+     3e1:	8b 40 18             	mov    0x18(%eax),%eax
+				    PageSlab(pb->pb_pages[0])) {
+     3e4:	f6 c4 01             	test   $0x1,%ah
+     3e7:	74 07                	je     3f0 <_pagebuf_free_object+0xa8>
+					/*
+					 * This came from the slab
+					 * allocator free it as such
+					 */
+					kfree(pb->pb_addr);
+     3e9:	8b 46 34             	mov    0x34(%esi),%eax
+     3ec:	50                   	push   %eax
+				} else {
+     3ed:	eb 40                	jmp    42f <_pagebuf_free_object+0xe7>
+     3ef:	90                   	nop    
+     3f0:	31 db                	xor    %ebx,%ebx
+     3f2:	8d be 88 00 00 00    	lea    0x88(%esi),%edi
+     3f8:	66 83 7e 7e 00       	cmpw   $0x0,0x7e(%esi)
+     3fd:	74 2b                	je     42a <_pagebuf_free_object+0xe2>
+     3ff:	90                   	nop    
+     400:	8b 96 84 00 00 00    	mov    0x84(%esi),%edx
+     406:	8b 04 9a             	mov    (%edx,%ebx,4),%eax
+     409:	85 c0                	test   %eax,%eax
+     40b:	74 0e                	je     41b <_pagebuf_free_object+0xd3>
+     40d:	c7 04 9a 00 00 00 00 	movl   $0x0,(%edx,%ebx,4)
+     414:	31 d2                	xor    %edx,%edx
+     416:	e8 fc ff ff ff       	call   417 <_pagebuf_free_object+0xcf>
+     41b:	8b 96 84 00 00 00    	mov    0x84(%esi),%edx
+     421:	43                   	inc    %ebx
+     422:	0f b7 46 7e          	movzwl 0x7e(%esi),%eax
+     426:	39 c3                	cmp    %eax,%ebx
+     428:	7c d6                	jl     400 <_pagebuf_free_object+0xb8>
+     42a:	39 fa                	cmp    %edi,%edx
+     42c:	74 09                	je     437 <_pagebuf_free_object+0xef>
+     42e:	52                   	push   %edx
+     42f:	e8 fc ff ff ff       	call   430 <_pagebuf_free_object+0xe8>
+     434:	83 c4 04             	add    $0x4,%esp
+					_pagebuf_freepages(pb);
+				}
+
+				pb->pb_pages = NULL;
+     437:	c7 86 84 00 00 00 00 	movl   $0x0,0x84(%esi)
+     43e:	00 00 00 
+			}
+			pb->pb_flags &= ~_PBF_MEM_ALLOCATED;
+     441:	80 66 0b fd          	andb   $0xfd,0xb(%esi)
+		}
+	}
+
+	pb_tracking_free(pb);
+	pagebuf_deallocate(pb);
+     445:	a1 24 00 00 00       	mov    0x24,%eax
+     44a:	56                   	push   %esi
+     44b:	50                   	push   %eax
+     44c:	e8 fc ff ff ff       	call   44d <_pagebuf_free_object+0x105>
+}
+     451:	83 c4 08             	add    $0x8,%esp
+     454:	5b                   	pop    %ebx
+     455:	5e                   	pop    %esi
+     456:	5f                   	pop    %edi
+     457:	c3                   	ret    
+
+0000000000000458 <_pagebuf_lookup_pages>:
+
+/*
+ *	_pagebuf_lookup_pages
+ *
+ *	_pagebuf_lookup_pages finds all pages which match the buffer
+ *	in question and the range of file offsets supplied,
+ *	and builds the page list for the buffer, if the
+ *	page list is not already formed or if not all of the pages are
+ *	already in the list. Invalid pages (pages which have not yet been
+ *	read in from disk) are assigned for any pages which are not found.
+ */
+STATIC int
+_pagebuf_lookup_pages(
+	page_buf_t		*pb,
+	struct address_space	*aspace,
+	page_buf_flags_t	flags)
+{
+     458:	83 ec 28             	sub    $0x28,%esp
+     45b:	55                   	push   %ebp
+     45c:	57                   	push   %edi
+     45d:	56                   	push   %esi
+     45e:	53                   	push   %ebx
+     45f:	8b 7c 24 3c          	mov    0x3c(%esp,1),%edi
+	loff_t			next_buffer_offset;
+	unsigned long		page_count, pi, index;
+	struct page		*page;
+	int			gfp_mask, retry_count = 5, rval = 0;
+	int			all_mapped, good_pages;
+	size_t			blocksize;
+
+	/* For pagebufs where we want to map an address, do not use
+	 * highmem pages - so that we do not need to use kmap resources
+	 * to access the data.
+	 *
+	 * For pages where the caller has indicated there may be resource
+	 * contention (e.g. called from a transaction) do not flush
+	 * delalloc pages to obtain memory.
+	 */
+
+	if (flags & PBF_READ_AHEAD) {
+     463:	8b 44 24 44          	mov    0x44(%esp,1),%eax
+     467:	c7 44 24 2c 05 00 00 	movl   $0x5,0x2c(%esp,1)
+     46e:	00 
+     46f:	c7 44 24 28 00 00 00 	movl   $0x0,0x28(%esp,1)
+     476:	00 
+     477:	a9 00 00 00 20       	test   $0x20000000,%eax
+     47c:	74 12                	je     490 <_pagebuf_lookup_pages+0x38>
+		gfp_mask = GFP_READAHEAD;
+     47e:	c7 44 24 30 00 00 00 	movl   $0x0,0x30(%esp,1)
+     485:	00 
+		retry_count = 0;
+     486:	c7 44 24 2c 00 00 00 	movl   $0x0,0x2c(%esp,1)
+     48d:	00 
+	} else if (flags & PBF_DONT_BLOCK) {
+     48e:	eb 37                	jmp    4c7 <_pagebuf_lookup_pages+0x6f>
+     490:	8b 54 24 44          	mov    0x44(%esp,1),%edx
+     494:	f7 c2 00 00 02 00    	test   $0x20000,%edx
+     49a:	74 0a                	je     4a6 <_pagebuf_lookup_pages+0x4e>
+		gfp_mask = GFP_NOFS;
+     49c:	c7 44 24 30 f0 00 00 	movl   $0xf0,0x30(%esp,1)
+     4a3:	00 
+	} else if (flags & PBF_MAPPABLE) {
+     4a4:	eb 21                	jmp    4c7 <_pagebuf_lookup_pages+0x6f>
+     4a6:	8b 44 24 44          	mov    0x44(%esp,1),%eax
+     4aa:	25 00 02 00 00       	and    $0x200,%eax
+		gfp_mask = GFP_KERNEL;
+	} else {
+		gfp_mask = GFP_HIGHUSER;
+     4af:	c7 44 24 30 d2 01 00 	movl   $0x1d2,0x30(%esp,1)
+     4b6:	00 
+     4b7:	ba f0 01 00 00       	mov    $0x1f0,%edx
+     4bc:	85 c0                	test   %eax,%eax
+     4be:	0f 44 54 24 30       	cmove  0x30(%esp,1),%edx
+     4c3:	89 54 24 30          	mov    %edx,0x30(%esp,1)
+	}
+
+	next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length;
+     4c7:	8b 4f 24             	mov    0x24(%edi),%ecx
+     4ca:	8b 5f 28             	mov    0x28(%edi),%ebx
+     4cd:	89 c8                	mov    %ecx,%eax
+     4cf:	89 da                	mov    %ebx,%edx
+     4d1:	03 47 2c             	add    0x2c(%edi),%eax
+     4d4:	83 d2 00             	adc    $0x0,%edx
+
+	good_pages = page_count = (page_buf_btoc(next_buffer_offset) -
+     4d7:	05 ff 0f 00 00       	add    $0xfff,%eax
+     4dc:	83 d2 00             	adc    $0x0,%edx
+     4df:	0f ac d0 0c          	shrd   $0xc,%edx,%eax
+     4e3:	c1 fa 0c             	sar    $0xc,%edx
+     4e6:	0f ac d9 0c          	shrd   $0xc,%ebx,%ecx
+     4ea:	c1 fb 0c             	sar    $0xc,%ebx
+     4ed:	29 c8                	sub    %ecx,%eax
+     4ef:	89 44 24 34          	mov    %eax,0x34(%esp,1)
+     4f3:	89 44 24 20          	mov    %eax,0x20(%esp,1)
+				   page_buf_btoct(pb->pb_file_offset));
+
+	if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) {
+     4f7:	f6 47 0a 40          	testb  $0x40,0xa(%edi)
+     4fb:	74 4a                	je     547 <_pagebuf_lookup_pages+0xef>
+		/* Bring pages forward in cache */
+		for (pi = 0; pi < page_count; pi++) {
+     4fd:	8b 5c 24 44          	mov    0x44(%esp,1),%ebx
+     501:	31 f6                	xor    %esi,%esi
+     503:	83 e3 04             	and    $0x4,%ebx
+     506:	39 c6                	cmp    %eax,%esi
+     508:	73 1b                	jae    525 <_pagebuf_lookup_pages+0xcd>
+     50a:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
+			mark_page_accessed(pb->pb_pages[pi]);
+     510:	8b 87 84 00 00 00    	mov    0x84(%edi),%eax
+     516:	8b 04 b0             	mov    (%eax,%esi,4),%eax
+     519:	e8 fc ff ff ff       	call   51a <_pagebuf_lookup_pages+0xc2>
+     51e:	46                   	inc    %esi
+     51f:	3b 74 24 34          	cmp    0x34(%esp,1),%esi
+     523:	72 eb                	jb     510 <_pagebuf_lookup_pages+0xb8>
+		}
+		if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) {
+     525:	85 db                	test   %ebx,%ebx
+     527:	74 17                	je     540 <_pagebuf_lookup_pages+0xe8>
+     529:	f6 47 08 04          	testb  $0x4,0x8(%edi)
+     52d:	75 11                	jne    540 <_pagebuf_lookup_pages+0xe8>
+			all_mapped = 1;
+     52f:	c7 44 24 24 01 00 00 	movl   $0x1,0x24(%esp,1)
+     536:	00 
+			goto mapit;
+     537:	e9 6e 01 00 00       	jmp    6aa <_pagebuf_lookup_pages+0x252>
+     53c:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+		}
+		return 0;
+     540:	31 c0                	xor    %eax,%eax
+     542:	e9 21 02 00 00       	jmp    768 <_pagebuf_lookup_pages+0x310>
+	}
+
+	/* Ensure pb_pages field has been initialised */
+	rval = _pagebuf_get_pages(pb, page_count, flags);
+     547:	8b 4c 24 44          	mov    0x44(%esp,1),%ecx
+     54b:	51                   	push   %ecx
+     54c:	8b 5c 24 38          	mov    0x38(%esp,1),%ebx
+     550:	53                   	push   %ebx
+     551:	57                   	push   %edi
+     552:	e8 4d fd ff ff       	call   2a4 <_pagebuf_get_pages>
+     557:	89 44 24 34          	mov    %eax,0x34(%esp,1)
+	if (rval)
+     55b:	83 c4 0c             	add    $0xc,%esp
+     55e:	85 c0                	test   %eax,%eax
+     560:	0f 85 fe 01 00 00    	jne    764 <_pagebuf_lookup_pages+0x30c>
+		return rval;
+
+	rval = pi = 0;
+	blocksize = pb->pb_target->pbr_blocksize;
+     566:	8b 47 14             	mov    0x14(%edi),%eax
+     569:	31 f6                	xor    %esi,%esi
+     56b:	8b 40 10             	mov    0x10(%eax),%eax
+     56e:	89 44 24 1c          	mov    %eax,0x1c(%esp,1)
+
+	/* Enter the pages in the page list */
+	index = (pb->pb_file_offset - pb->pb_offset) >> PAGE_CACHE_SHIFT;
+     572:	0f b7 87 80 00 00 00 	movzwl 0x80(%edi),%eax
+     579:	8b 4f 24             	mov    0x24(%edi),%ecx
+     57c:	8b 5f 28             	mov    0x28(%edi),%ebx
+     57f:	29 c1                	sub    %eax,%ecx
+     581:	83 db 00             	sbb    $0x0,%ebx
+     584:	89 da                	mov    %ebx,%edx
+	for (all_mapped = 1; pi < page_count; pi++, index++) {
+     586:	8b 5c 24 34          	mov    0x34(%esp,1),%ebx
+     58a:	89 c8                	mov    %ecx,%eax
+     58c:	0f ac d0 0c          	shrd   $0xc,%edx,%eax
+     590:	c1 fa 0c             	sar    $0xc,%edx
+     593:	89 c5                	mov    %eax,%ebp
+     595:	c7 44 24 24 01 00 00 	movl   $0x1,0x24(%esp,1)
+     59c:	00 
+     59d:	39 5c 24 28          	cmp    %ebx,0x28(%esp,1)
+     5a1:	0f 83 dd 00 00 00    	jae    684 <_pagebuf_lookup_pages+0x22c>
+     5a7:	b8 00 e0 ff ff       	mov    $0xffffe000,%eax
+     5ac:	21 e0                	and    %esp,%eax
+     5ae:	89 44 24 18          	mov    %eax,0x18(%esp,1)
+		if (pb->pb_pages[pi] == 0) {
+     5b2:	8b 87 84 00 00 00    	mov    0x84(%edi),%eax
+     5b8:	8d 56 01             	lea    0x1(%esi),%edx
+     5bb:	8d 4d 01             	lea    0x1(%ebp),%ecx
+     5be:	8b 04 b0             	mov    (%eax,%esi,4),%eax
+     5c1:	89 54 24 14          	mov    %edx,0x14(%esp,1)
+     5c5:	89 4c 24 10          	mov    %ecx,0x10(%esp,1)
+     5c9:	85 c0                	test   %eax,%eax
+     5cb:	75 7b                	jne    648 <_pagebuf_lookup_pages+0x1f0>
+		      retry:
+			page = find_or_create_page(aspace, index, gfp_mask);
+     5cd:	8b 5c 24 30          	mov    0x30(%esp,1),%ebx
+     5d1:	53                   	push   %ebx
+     5d2:	55                   	push   %ebp
+     5d3:	8b 44 24 48          	mov    0x48(%esp,1),%eax
+     5d7:	50                   	push   %eax
+     5d8:	e8 fc ff ff ff       	call   5d9 <_pagebuf_lookup_pages+0x181>
+     5dd:	89 c3                	mov    %eax,%ebx
+			if (!page) {
+     5df:	83 c4 0c             	add    $0xc,%esp
+     5e2:	85 db                	test   %ebx,%ebx
+     5e4:	75 4a                	jne    630 <_pagebuf_lookup_pages+0x1d8>
+				if (--retry_count > 0) {
+     5e6:	ff 4c 24 2c          	decl   0x2c(%esp,1)
+     5ea:	83 7c 24 2c 00       	cmpl   $0x0,0x2c(%esp,1)
+     5ef:	7e 26                	jle    617 <_pagebuf_lookup_pages+0x1bf>
+					PB_STATS_INC(pbstats.pb_page_retries);
+     5f1:	ff 05 18 00 00 00    	incl   0x18
+					pagebuf_daemon_wakeup(1);
+     5f7:	6a 01                	push   $0x1
+     5f9:	e8 8e 1f 00 00       	call   258c <pagebuf_daemon_wakeup>
+
+struct task_struct;
+
+static inline struct task_struct * get_current(void)
+{
+     5fe:	83 c4 04             	add    $0x4,%esp
+					current->state = TASK_UNINTERRUPTIBLE;
+     601:	8b 54 24 18          	mov    0x18(%esp,1),%edx
+     605:	c7 02 02 00 00 00    	movl   $0x2,(%edx)
+					schedule_timeout(10);
+     60b:	b8 0a 00 00 00       	mov    $0xa,%eax
+     610:	e8 fc ff ff ff       	call   611 <_pagebuf_lookup_pages+0x1b9>
+					goto retry;
+     615:	eb b6                	jmp    5cd <_pagebuf_lookup_pages+0x175>
+				}
+				rval = -ENOMEM;
+     617:	c7 44 24 28 f4 ff ff 	movl   $0xfffffff4,0x28(%esp,1)
+     61e:	ff 
+				all_mapped = 0;
+     61f:	c7 44 24 24 00 00 00 	movl   $0x0,0x24(%esp,1)
+     626:	00 
+				continue;
+     627:	eb 49                	jmp    672 <_pagebuf_lookup_pages+0x21a>
+     629:	8d b4 26 00 00 00 00 	lea    0x0(%esi,1),%esi
+			}
+			PB_STATS_INC(pbstats.pb_page_found);
+     630:	ff 05 1c 00 00 00    	incl   0x1c
+			mark_page_accessed(page);
+     636:	89 d8                	mov    %ebx,%eax
+     638:	e8 fc ff ff ff       	call   639 <_pagebuf_lookup_pages+0x1e1>
+			pb->pb_pages[pi] = page;
+     63d:	8b 87 84 00 00 00    	mov    0x84(%edi),%eax
+     643:	89 1c b0             	mov    %ebx,(%eax,%esi,4)
+		} else {
+     646:	eb 07                	jmp    64f <_pagebuf_lookup_pages+0x1f7>
+			page = pb->pb_pages[pi];
+     648:	89 c3                	mov    %eax,%ebx
+			lock_page(page);
+     64a:	e8 fc ff ff ff       	call   64b <_pagebuf_lookup_pages+0x1f3>
+#endif
+
+static __inline__ int constant_test_bit(int nr, const volatile void * addr)
+{
+	return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
+     64f:	8b 43 18             	mov    0x18(%ebx),%eax
+		}
+
+		/* If we need to do I/O on a page, record the fact */
+		if (!Page_Uptodate(page)) {
+     652:	a8 08                	test   $0x8,%al
+     654:	75 1c                	jne    672 <_pagebuf_lookup_pages+0x21a>
+			good_pages--;
+     656:	ff 4c 24 20          	decl   0x20(%esp,1)
+			if ((blocksize == PAGE_CACHE_SIZE) &&
+     65a:	81 7c 24 1c 00 10 00 	cmpl   $0x1000,0x1c(%esp,1)
+     661:	00 
+     662:	75 0e                	jne    672 <_pagebuf_lookup_pages+0x21a>
+     664:	f6 44 24 44 01       	testb  $0x1,0x44(%esp,1)
+     669:	74 07                	je     672 <_pagebuf_lookup_pages+0x21a>
+			    (flags & PBF_READ))
+				pb->pb_locked = 1;
+     66b:	c6 87 82 00 00 00 01 	movb   $0x1,0x82(%edi)
+     672:	8b 74 24 14          	mov    0x14(%esp,1),%esi
+     676:	8b 6c 24 10          	mov    0x10(%esp,1),%ebp
+     67a:	3b 74 24 34          	cmp    0x34(%esp,1),%esi
+     67e:	0f 82 2e ff ff ff    	jb     5b2 <_pagebuf_lookup_pages+0x15a>
+		}
+	}
+
+	if (!pb->pb_locked) {
+     684:	80 bf 82 00 00 00 00 	cmpb   $0x0,0x82(%edi)
+     68b:	75 1d                	jne    6aa <_pagebuf_lookup_pages+0x252>
+		for (pi = 0; pi < page_count; pi++) {
+     68d:	31 f6                	xor    %esi,%esi
+     68f:	3b 74 24 34          	cmp    0x34(%esp,1),%esi
+     693:	73 15                	jae    6aa <_pagebuf_lookup_pages+0x252>
+			unlock_page(pb->pb_pages[pi]);
+     695:	8b 87 84 00 00 00    	mov    0x84(%edi),%eax
+     69b:	8b 04 b0             	mov    (%eax,%esi,4),%eax
+     69e:	e8 fc ff ff ff       	call   69f <_pagebuf_lookup_pages+0x247>
+     6a3:	46                   	inc    %esi
+     6a4:	3b 74 24 34          	cmp    0x34(%esp,1),%esi
+     6a8:	72 eb                	jb     695 <_pagebuf_lookup_pages+0x23d>
+		}
+	}
+
+mapit:
+	pb->pb_flags |= _PBF_MEM_ALLOCATED;
+     6aa:	8b 47 08             	mov    0x8(%edi),%eax
+     6ad:	89 c1                	mov    %eax,%ecx
+     6af:	81 c9 00 00 00 02    	or     $0x2000000,%ecx
+     6b5:	89 4f 08             	mov    %ecx,0x8(%edi)
+	if (all_mapped) {
+     6b8:	83 7c 24 24 00       	cmpl   $0x0,0x24(%esp,1)
+     6bd:	0f 84 83 00 00 00    	je     746 <_pagebuf_lookup_pages+0x2ee>
+		pb->pb_flags |= _PBF_ALL_PAGES_MAPPED;
+     6c3:	89 c1                	mov    %eax,%ecx
+     6c5:	81 c9 00 00 40 02    	or     $0x2400000,%ecx
+     6cb:	89 4f 08             	mov    %ecx,0x8(%edi)
+
+		/* A single page buffer is always mappable */
+		if (page_count == 1) {
+     6ce:	83 7c 24 34 01       	cmpl   $0x1,0x34(%esp,1)
+     6d3:	75 1d                	jne    6f2 <_pagebuf_lookup_pages+0x29a>
+			pb->pb_addr = (caddr_t)
+     6d5:	8b 87 84 00 00 00    	mov    0x84(%edi),%eax
+     6db:	8b 10                	mov    (%eax),%edx
+			    	page_address(pb->pb_pages[0]) + pb->pb_offset;
+			pb->pb_flags |= PBF_MAPPED;
+     6dd:	83 c9 04             	or     $0x4,%ecx
+     6e0:	0f b7 87 80 00 00 00 	movzwl 0x80(%edi),%eax
+     6e7:	03 42 2c             	add    0x2c(%edx),%eax
+     6ea:	89 47 34             	mov    %eax,0x34(%edi)
+     6ed:	89 4f 08             	mov    %ecx,0x8(%edi)
+		} else if (flags & PBF_MAPPED) {
+     6f0:	eb 54                	jmp    746 <_pagebuf_lookup_pages+0x2ee>
+     6f2:	8b 5c 24 44          	mov    0x44(%esp,1),%ebx
+     6f6:	f6 c3 04             	test   $0x4,%bl
+     6f9:	74 4b                	je     746 <_pagebuf_lookup_pages+0x2ee>
+			if (as_list_len > 64)
+     6fb:	83 3d e4 05 00 00 40 	cmpl   $0x40,0x5e4
+     702:	7e 05                	jle    709 <_pagebuf_lookup_pages+0x2b1>
+				purge_addresses();
+     704:	e8 cb f9 ff ff       	call   d4 <purge_addresses>
+			pb->pb_addr = remap_page_array(pb->pb_pages,
+     709:	8b 44 24 30          	mov    0x30(%esp,1),%eax
+     70d:	50                   	push   %eax
+     70e:	8b 54 24 38          	mov    0x38(%esp,1),%edx
+     712:	52                   	push   %edx
+     713:	8b 87 84 00 00 00    	mov    0x84(%edi),%eax
+     719:	50                   	push   %eax
+     71a:	e8 fc ff ff ff       	call   71b <_pagebuf_lookup_pages+0x2c3>
+     71f:	89 c2                	mov    %eax,%edx
+     721:	89 57 34             	mov    %edx,0x34(%edi)
+							page_count, gfp_mask);
+			if (!pb->pb_addr)
+     724:	83 c4 0c             	add    $0xc,%esp
+     727:	85 d2                	test   %edx,%edx
+     729:	75 08                	jne    733 <_pagebuf_lookup_pages+0x2db>
+				BUG();
+     72b:	0f 0b                	ud2a   
+     72d:	48                   	dec    %eax
+     72e:	02 34 00             	add    (%eax,%eax,1),%dh
+     731:	00 00                	add    %al,(%eax)
+			pb->pb_addr += pb->pb_offset;
+     733:	0f b7 87 80 00 00 00 	movzwl 0x80(%edi),%eax
+     73a:	01 c2                	add    %eax,%edx
+     73c:	89 57 34             	mov    %edx,0x34(%edi)
+			pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED;
+     73f:	81 4f 08 04 00 00 01 	orl    $0x1000004,0x8(%edi)
+		}
+	}
+	/* If some pages were found with data in them,
+	 * we are not in PBF_NONE state.
+	 */
+	if (good_pages != 0) {
+     746:	83 7c 24 20 00       	cmpl   $0x0,0x20(%esp,1)
+     74b:	74 17                	je     764 <_pagebuf_lookup_pages+0x30c>
+		pb->pb_flags &= ~(PBF_NONE);
+     74d:	8b 47 08             	mov    0x8(%edi),%eax
+     750:	24 df                	and    $0xdf,%al
+     752:	89 47 08             	mov    %eax,0x8(%edi)
+		if (good_pages != page_count) {
+     755:	8b 4c 24 34          	mov    0x34(%esp,1),%ecx
+     759:	39 4c 24 20          	cmp    %ecx,0x20(%esp,1)
+     75d:	74 05                	je     764 <_pagebuf_lookup_pages+0x30c>
+			pb->pb_flags |= PBF_PARTIAL;
+     75f:	0c 08                	or     $0x8,%al
+     761:	89 47 08             	mov    %eax,0x8(%edi)
+		}
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(look_pg), good_pages);
+
+	return rval;
+     764:	8b 44 24 28          	mov    0x28(%esp,1),%eax
+     768:	5b                   	pop    %ebx
+     769:	5e                   	pop    %esi
+     76a:	5f                   	pop    %edi
+     76b:	5d                   	pop    %ebp
+     76c:	83 c4 28             	add    $0x28,%esp
+     76f:	c3                   	ret    
+
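+/*
+ * A minimal user-space model of the retry loop above (a sketch, not
+ * part of the patch): try_alloc_page() is a hypothetical stand-in
+ * for find_or_create_page(), usleep() is a rough stand-in for
+ * schedule_timeout(10), and the pagebuf daemon wakeup is elided.
+ * When the retry budget runs out, the caller maps NULL to -ENOMEM
+ * and carries on, just as _pagebuf_lookup_pages does.
+ */
+#include <stdlib.h>
+#include <unistd.h>
+
+static void *try_alloc_page(void)		/* stand-in allocator */
+{
+	return malloc(4096);
+}
+
+static void *alloc_page_bounded(int retry_count)
+{
+	void	*page;
+
+	while ((page = try_alloc_page()) == NULL) {
+		if (--retry_count <= 0)
+			return NULL;		/* becomes rval = -ENOMEM */
+		usleep(10 * 1000);		/* short back-off, then retry */
+	}
+	return page;
+}
+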
+0000000000000770 <_pagebuf_prealloc_bh>:
+}
+
+
+/*
+ *	Pre-allocation of a pool of buffer heads for use in
+ *	low-memory situations.
+ */
+
+/*
+ *	_pagebuf_prealloc_bh
+ *
+ *	Pre-allocate a pool of "count" buffer heads at startup,
+ *	putting them on the list at "pb_resv_bh".
+ *	Returns the number of buffer heads actually allocated to the pool.
+ */
+STATIC int
+_pagebuf_prealloc_bh(
+	int			count)
+{
+     770:	56                   	push   %esi
+     771:	53                   	push   %ebx
+     772:	8b 74 24 0c          	mov    0xc(%esp,1),%esi
+	struct buffer_head	*bh;
+	int			i;
+
+	for (i = 0; i < count; i++) {
+     776:	31 db                	xor    %ebx,%ebx
+     778:	eb 21                	jmp    79b <_pagebuf_prealloc_bh+0x2b>
+     77a:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
+		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
+		if (!bh)
+			break;
+		bh->b_pprev = &pb_resv_bh;
+		bh->b_next = pb_resv_bh;
+     780:	a1 00 00 00 00       	mov    0x0,%eax
+     785:	c7 42 30 00 00 00 00 	movl   $0x0,0x30(%edx)
+     78c:	89 02                	mov    %eax,(%edx)
+		pb_resv_bh = bh;
+     78e:	89 15 00 00 00 00    	mov    %edx,0x0
+		pb_resv_bh_cnt++;
+     794:	ff 05 00 00 00 00    	incl   0x0
+     79a:	43                   	inc    %ebx
+     79b:	39 f3                	cmp    %esi,%ebx
+     79d:	7d 19                	jge    7b8 <_pagebuf_prealloc_bh+0x48>
+     79f:	a1 00 00 00 00       	mov    0x0,%eax
+     7a4:	68 f0 01 00 00       	push   $0x1f0
+     7a9:	50                   	push   %eax
+     7aa:	e8 fc ff ff ff       	call   7ab <_pagebuf_prealloc_bh+0x3b>
+     7af:	89 c2                	mov    %eax,%edx
+     7b1:	83 c4 08             	add    $0x8,%esp
+     7b4:	85 d2                	test   %edx,%edx
+     7b6:	75 c8                	jne    780 <_pagebuf_prealloc_bh+0x10>
+	}
+	return i;
+     7b8:	89 d8                	mov    %ebx,%eax
+     7ba:	5b                   	pop    %ebx
+     7bb:	5e                   	pop    %esi
+     7bc:	c3                   	ret    
+}
+     7bd:	8d 76 00             	lea    0x0(%esi),%esi
+
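+/*
+ * A user-space model of the pre-allocation loop above, under
+ * hypothetical names: node_t stands in for struct buffer_head,
+ * malloc() for kmem_cache_alloc(bh_cachep, SLAB_KERNEL), and
+ * resv_list/resv_cnt for pb_resv_bh/pb_resv_bh_cnt.  An allocation
+ * failure simply ends the loop, and the number of nodes actually
+ * obtained is returned, exactly as in _pagebuf_prealloc_bh.
+ */
+#include <stdlib.h>
+
+typedef struct node { struct node *next; } node_t;
+
+static node_t	*resv_list;
+static int	resv_cnt;
+
+static int prealloc_nodes(int count)
+{
+	int	i;
+
+	for (i = 0; i < count; i++) {
+		node_t	*n = malloc(sizeof(*n));
+
+		if (!n)
+			break;			/* pool stays partial */
+		n->next = resv_list;		/* push on the head */
+		resv_list = n;
+		resv_cnt++;
+	}
+	return i;
+}
+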
+00000000000007c0 <_pagebuf_get_prealloc_bh>:
+
+static inline struct task_struct * get_current(void)
+{
+	struct task_struct *current;
+	__asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
+     7c0:	83 ec 14             	sub    $0x14,%esp
+     7c3:	55                   	push   %ebp
+     7c4:	57                   	push   %edi
+     7c5:	bf 00 e0 ff ff       	mov    $0xffffe000,%edi
+     7ca:	21 e7                	and    %esp,%edi
+     7cc:	56                   	push   %esi
+     7cd:	53                   	push   %ebx
+
+/*
+ *	_pagebuf_get_prealloc_bh
+ *
+ *	Get one buffer head from our pre-allocated pool.
+ *	If the pool is empty, sleep until one comes back in.
+ *	Returns aforementioned buffer head.
+ */
+STATIC struct buffer_head *
+_pagebuf_get_prealloc_bh(void)
+{
+	unsigned long		flags;
+	struct buffer_head	*bh = NULL;
+	struct task_struct	*tsk = current;
+	DECLARE_WAITQUEUE	(wait, tsk);
+     7ce:	c7 44 24 14 00 00 00 	movl   $0x0,0x14(%esp,1)
+     7d5:	00 
+     7d6:	c7 44 24 1c 00 00 00 	movl   $0x0,0x1c(%esp,1)
+     7dd:	00 
+     7de:	c7 44 24 20 00 00 00 	movl   $0x0,0x20(%esp,1)
+     7e5:	00 
+     7e6:	89 7c 24 18          	mov    %edi,0x18(%esp,1)
+
+	spin_lock_irqsave(&pb_resv_bh_lock, flags);
+     7ea:	9c                   	pushf  
+     7eb:	5e                   	pop    %esi
+     7ec:	fa                   	cli    
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+     7ed:	bd 00 00 00 00       	mov    $0x0,%ebp
+     7f2:	89 eb                	mov    %ebp,%ebx
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+     7f4:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+     7fb:	4e ad de 
+     7fe:	74 1a                	je     81a <_pagebuf_get_prealloc_bh+0x5a>
+printk("eip: %p\n", &&here);
+     800:	68 f4 07 00 00       	push   $0x7f4
+     805:	68 2b 00 00 00       	push   $0x2b
+     80a:	e8 fc ff ff ff       	call   80b <_pagebuf_get_prealloc_bh+0x4b>
+		BUG();
+     80f:	0f 0b                	ud2a   
+     811:	85 00                	test   %eax,(%eax)
+     813:	00 00                	add    %al,(%eax)
+     815:	00 00                	add    %al,(%eax)
+	}
+     817:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+     81a:	f0 fe 0b             	lock decb (%ebx)
+     81d:	0f 88 c6 25 00 00    	js     2de9 <Letext+0x20>
+
+	if (pb_resv_bh_cnt < 1) {
+     823:	83 3d 00 00 00 00 00 	cmpl   $0x0,0x0
+     82a:	0f 8f d4 00 00 00    	jg     904 <_pagebuf_get_prealloc_bh+0x144>
+
+		add_wait_queue(&pb_resv_bh_wait, &wait);
+     830:	8d 5c 24 14          	lea    0x14(%esp,1),%ebx
+     834:	89 da                	mov    %ebx,%edx
+     836:	b8 40 03 00 00       	mov    $0x340,%eax
+     83b:	e8 fc ff ff ff       	call   83c <_pagebuf_get_prealloc_bh+0x7c>
+     840:	89 5c 24 10          	mov    %ebx,0x10(%esp,1)
+extern void __run_task_queue(task_queue *list);
+
+static inline void run_task_queue(task_queue *list)
+{
+	if (TQ_ACTIVE(*list))
+     844:	81 3d 00 00 00 00 00 	cmpl   $0x0,0x0
+     84b:	00 00 00 
+     84e:	74 0d                	je     85d <_pagebuf_get_prealloc_bh+0x9d>
+		__run_task_queue(list);
+     850:	68 00 00 00 00       	push   $0x0
+     855:	e8 fc ff ff ff       	call   856 <_pagebuf_get_prealloc_bh+0x96>
+     85a:	83 c4 04             	add    $0x4,%esp
+		do {
+			run_task_queue(&tq_disk);
+			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+     85d:	b8 02 00 00 00       	mov    $0x2,%eax
+				:"m" (*__xg(ptr)), "0" (x)
+				:"memory");
+			break;
+		case 4:
+			__asm__ __volatile__("xchgl %0,%1"
+     862:	87 07                	xchg   %eax,(%edi)
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+     864:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+     866:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+     86d:	4e ad de 
+     870:	74 0e                	je     880 <_pagebuf_get_prealloc_bh+0xc0>
+		BUG();
+     872:	0f 0b                	ud2a   
+     874:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+     87a:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
+	if (!spin_is_locked(lock))
+     880:	a0 00 00 00 00       	mov    0x0,%al
+     885:	84 c0                	test   %al,%al
+     887:	7e 08                	jle    891 <_pagebuf_get_prealloc_bh+0xd1>
+		BUG();
+     889:	0f 0b                	ud2a   
+     88b:	6b 00 00             	imul   $0x0,(%eax),%eax
+     88e:	00 00                	add    %al,(%eax)
+     890:	00 86 15 00 00 00    	add    %al,0x15(%esi)
+#endif
+	__asm__ __volatile__(
+     896:	00 56 9d             	add    %dl,0xffffff9d(%esi)
+			spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
+			schedule();
+     899:	e8 fc ff ff ff       	call   89a <_pagebuf_get_prealloc_bh+0xda>
+			spin_lock_irqsave(&pb_resv_bh_lock, flags);
+     89e:	9c                   	pushf  
+     89f:	5e                   	pop    %esi
+     8a0:	fa                   	cli    
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+     8a1:	89 eb                	mov    %ebp,%ebx
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+     8a3:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+     8aa:	4e ad de 
+     8ad:	74 1a                	je     8c9 <_pagebuf_get_prealloc_bh+0x109>
+printk("eip: %p\n", &&here);
+     8af:	68 a3 08 00 00       	push   $0x8a3
+     8b4:	68 2b 00 00 00       	push   $0x2b
+     8b9:	e8 fc ff ff ff       	call   8ba <_pagebuf_get_prealloc_bh+0xfa>
+		BUG();
+     8be:	0f 0b                	ud2a   
+     8c0:	85 00                	test   %eax,(%eax)
+     8c2:	00 00                	add    %al,(%eax)
+     8c4:	00 00                	add    %al,(%eax)
+	}
+     8c6:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+     8c9:	f0 fe 0b             	lock decb (%ebx)
+     8cc:	0f 88 23 25 00 00    	js     2df5 <Letext+0x2c>
+		} while (pb_resv_bh_cnt < 1);
+     8d2:	83 3d 00 00 00 00 00 	cmpl   $0x0,0x0
+     8d9:	0f 8e 65 ff ff ff    	jle    844 <_pagebuf_get_prealloc_bh+0x84>
+		tsk->state = TASK_RUNNING;
+     8df:	c7 07 00 00 00 00    	movl   $0x0,(%edi)
+		remove_wait_queue(&pb_resv_bh_wait, &wait);
+     8e5:	8b 54 24 10          	mov    0x10(%esp,1),%edx
+     8e9:	b8 40 03 00 00       	mov    $0x340,%eax
+     8ee:	e8 fc ff ff ff       	call   8ef <_pagebuf_get_prealloc_bh+0x12f>
+	}
+
+	if (pb_resv_bh_cnt < 1)
+     8f3:	83 3d 00 00 00 00 00 	cmpl   $0x0,0x0
+     8fa:	7f 08                	jg     904 <_pagebuf_get_prealloc_bh+0x144>
+		BUG();
+     8fc:	0f 0b                	ud2a   
+     8fe:	9c                   	pushf  
+     8ff:	02 34 00             	add    (%eax,%eax,1),%dh
+     902:	00 00                	add    %al,(%eax)
+
+	bh = pb_resv_bh;
+     904:	8b 0d 00 00 00 00    	mov    0x0,%ecx
+
+	if (!bh)
+     90a:	85 c9                	test   %ecx,%ecx
+     90c:	75 08                	jne    916 <_pagebuf_get_prealloc_bh+0x156>
+		BUG();
+     90e:	0f 0b                	ud2a   
+     910:	a1 02 34 00 00       	mov    0x3402,%eax
+     915:	00 8b 01 a3 00 00    	add    %cl,0xa301(%ebx)
+
+	pb_resv_bh = bh->b_next;
+     91b:	00 00                	add    %al,(%eax)
+	bh->b_state = 0;
+     91d:	c7 41 18 00 00 00 00 	movl   $0x0,0x18(%ecx)
+	pb_resv_bh_cnt--;
+     924:	ff 0d 00 00 00 00    	decl   0x0
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+     92a:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+     92c:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+     933:	4e ad de 
+     936:	74 08                	je     940 <_pagebuf_get_prealloc_bh+0x180>
+		BUG();
+     938:	0f 0b                	ud2a   
+     93a:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+     940:	a0 00 00 00 00       	mov    0x0,%al
+     945:	84 c0                	test   %al,%al
+     947:	7e 08                	jle    951 <_pagebuf_get_prealloc_bh+0x191>
+		BUG();
+     949:	0f 0b                	ud2a   
+     94b:	6b 00 00             	imul   $0x0,(%eax),%eax
+     94e:	00 00                	add    %al,(%eax)
+     950:	00 86 15 00 00 00    	add    %al,0x15(%esi)
+#endif
+	__asm__ __volatile__(
+     956:	00 56 9d             	add    %dl,0xffffff9d(%esi)
+
+	spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
+
+	return bh;
+     959:	89 c8                	mov    %ecx,%eax
+     95b:	5b                   	pop    %ebx
+     95c:	5e                   	pop    %esi
+     95d:	5f                   	pop    %edi
+     95e:	5d                   	pop    %ebp
+     95f:	83 c4 14             	add    $0x14,%esp
+     962:	c3                   	ret    
+}
+     963:	90                   	nop    
+
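+/*
+ * The function above takes pb_resv_bh_lock and sleeps on a wait
+ * queue until the pool is non-empty.  A user-space analogue of the
+ * same pattern (a sketch, continuing the node_t model from the
+ * earlier sketch), with a pthread condition variable standing in
+ * for the wait queue:
+ */
+#include <pthread.h>
+
+static pthread_mutex_t	resv_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t	resv_wait = PTHREAD_COND_INITIALIZER;
+
+static node_t *get_prealloc_node(void)
+{
+	node_t	*n;
+
+	pthread_mutex_lock(&resv_lock);
+	while (resv_cnt < 1)			/* sleep 'til one returns */
+		pthread_cond_wait(&resv_wait, &resv_lock);
+	n = resv_list;				/* pop the head of the pool */
+	resv_list = n->next;
+	resv_cnt--;
+	pthread_mutex_unlock(&resv_lock);
+	return n;
+}
+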
+0000000000000964 <_pagebuf_find>:
+
+/*
+ *	_pagebuf_free_bh
+ *
+ *	Take care of buffer heads that we're finished with.
+ *	Call this instead of just kmem_cache_free(bh_cachep, bh)
+ *	when you're done with a bh.
+ *
+ *	If our pre-allocated pool is full, just free the buffer head.
+ *	Otherwise, put it back in the pool, and wake up anybody
+ *	waiting for one.
+ */
+STATIC inline void
+_pagebuf_free_bh(
+	struct buffer_head	*bh)
+{
+	unsigned long		flags;
+
+	if (pb_resv_bh_cnt == NR_RESERVED_BH) {
+		kmem_cache_free(bh_cachep, bh);
+	} else {
+		spin_lock_irqsave(&pb_resv_bh_lock, flags);
+
+		bh->b_pprev = &pb_resv_bh;
+		bh->b_next = pb_resv_bh;
+		pb_resv_bh = bh;
+		pb_resv_bh_cnt++;
+
+		if (waitqueue_active(&pb_resv_bh_wait)) {
+			wake_up(&pb_resv_bh_wait);
+		}
+
+		spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
+	}
+}
+
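+/*
+ * The counterpart sketch to get_prealloc_node() above: give a node
+ * back to the pool and wake one waiter, or free it outright when
+ * the pool is already full.  POOL_MAX is a hypothetical stand-in
+ * for NR_RESERVED_BH.
+ */
+#define POOL_MAX	64			/* assumed pool limit */
+
+static void free_node(node_t *n)
+{
+	if (resv_cnt == POOL_MAX) {		/* unlocked check, as above */
+		free(n);			/* pool full: really free */
+		return;
+	}
+	pthread_mutex_lock(&resv_lock);
+	n->next = resv_list;			/* push back on the pool */
+	resv_list = n;
+	resv_cnt++;
+	pthread_cond_signal(&resv_wait);	/* wake anybody waiting */
+	pthread_mutex_unlock(&resv_lock);
+}
+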
+/*
+ *	Finding and Reading Buffers
+ */
+
+/*
+ *	_pagebuf_find
+ *
+ *	Looks up, and creates if absent, a lockable buffer for
+ *	a given range of an inode.  The buffer is returned
+ *	locked.	 If other overlapping buffers exist, they are
+ *	released before the new buffer is created and locked,
+ *	which may imply that this call will block until those buffers
+ *	are unlocked.  No I/O is implied by this call.
+ */
+STATIC page_buf_t *
+_pagebuf_find(				/* find buffer for block	*/
+	pb_target_t		*target,/* target for block		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags,	/* PBF_TRYLOCK			*/
+	page_buf_t		*new_pb)/* newly allocated buffer	*/
+{
+     964:	83 ec 0c             	sub    $0xc,%esp
+     967:	55                   	push   %ebp
+     968:	57                   	push   %edi
+     969:	56                   	push   %esi
+     96a:	53                   	push   %ebx
+     96b:	8b 74 24 24          	mov    0x24(%esp,1),%esi
+     96f:	8b 7c 24 28          	mov    0x28(%esp,1),%edi
+	loff_t			range_base;
+	size_t			range_length;
+	int			hval;
+	pb_hash_t		*h;
+	struct list_head	*p;
+	page_buf_t		*pb;
+	int			not_locked;
+
+	range_base = (ioff << SECTOR_SHIFT);
+	range_length = (isize << SECTOR_SHIFT);
+     973:	8b 44 24 2c          	mov    0x2c(%esp,1),%eax
+     977:	0f a4 f7 09          	shld   $0x9,%esi,%edi
+     97b:	c1 e6 09             	shl    $0x9,%esi
+     97e:	c1 e0 09             	shl    $0x9,%eax
+     981:	89 44 24 18          	mov    %eax,0x18(%esp,1)
+
+	hval = _bhash(target->pbr_bdev->bd_dev, range_base);
+     985:	57                   	push   %edi
+     986:	56                   	push   %esi
+     987:	8b 54 24 28          	mov    0x28(%esp,1),%edx
+     98b:	8b 42 08             	mov    0x8(%edx),%eax
+     98e:	0f b7 40 10          	movzwl 0x10(%eax),%eax
+     992:	50                   	push   %eax
+     993:	e8 68 f6 ff ff       	call   0 <_bhash>
+     998:	89 44 24 20          	mov    %eax,0x20(%esp,1)
+	h = &pbhash[hval];
+     99c:	8d 04 80             	lea    (%eax,%eax,4),%eax
+     99f:	8d 2c 85 60 03 00 00 	lea    0x360(,%eax,4),%ebp
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+     9a6:	8d 5d 0c             	lea    0xc(%ebp),%ebx
+     9a9:	83 c4 0c             	add    $0xc,%esp
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+     9ac:	81 7b 04 ad 4e ad de 	cmpl   $0xdead4ead,0x4(%ebx)
+     9b3:	74 1a                	je     9cf <_pagebuf_find+0x6b>
+printk("eip: %p\n", &&here);
+     9b5:	68 ac 09 00 00       	push   $0x9ac
+     9ba:	68 2b 00 00 00       	push   $0x2b
+     9bf:	e8 fc ff ff ff       	call   9c0 <_pagebuf_find+0x5c>
+		BUG();
+     9c4:	0f 0b                	ud2a   
+     9c6:	85 00                	test   %eax,(%eax)
+     9c8:	00 00                	add    %al,(%eax)
+     9ca:	00 00                	add    %al,(%eax)
+	}
+     9cc:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+     9cf:	f0 fe 0b             	lock decb (%ebx)
+     9d2:	0f 88 29 24 00 00    	js     2e01 <Letext+0x38>
+
+	spin_lock(&h->pb_hash_lock);
+	list_for_each(p, &h->pb_hash) {
+     9d8:	8b 45 00             	mov    0x0(%ebp),%eax
+     9db:	8d 4d 0c             	lea    0xc(%ebp),%ecx
+     9de:	89 4c 24 10          	mov    %ecx,0x10(%esp,1)
+		pb = list_entry(p, page_buf_t, pb_hash_list);
+
+		if ((target == pb->pb_target) &&
+		    (pb->pb_file_offset == range_base) &&
+		    (pb->pb_buffer_length == range_length)) {
+			if (pb->pb_flags & PBF_FREED)
+     9e2:	eb 24                	jmp    a08 <_pagebuf_find+0xa4>
+ * @entry: the element to delete from the list.
+ * Note: list_empty() on entry does not return true after this; the entry is in an undefined state.
+ */
+static __inline__ void list_del(struct list_head *entry)
+{
+     9e4:	8d 4b 0c             	lea    0xc(%ebx),%ecx
+     9e7:	8b 51 04             	mov    0x4(%ecx),%edx
+     9ea:	8b 43 0c             	mov    0xc(%ebx),%eax
+     9ed:	89 50 04             	mov    %edx,0x4(%eax)
+     9f0:	89 02                	mov    %eax,(%edx)
+     9f2:	8b 45 00             	mov    0x0(%ebp),%eax
+     9f5:	89 48 04             	mov    %ecx,0x4(%eax)
+     9f8:	89 43 0c             	mov    %eax,0xc(%ebx)
+     9fb:	89 69 04             	mov    %ebp,0x4(%ecx)
+     9fe:	89 4d 00             	mov    %ecx,0x0(%ebp)
+				break;
+			/* If we look at something, bring it to the
+			 * front of the list for next time.
+			 */
+			list_del(&pb->pb_hash_list);
+			list_add(&pb->pb_hash_list, &h->pb_hash);
+			goto found;
+     a01:	e9 ba 00 00 00       	jmp    ac0 <_pagebuf_find+0x15c>
+     a06:	8b 00                	mov    (%eax),%eax
+		}
+	}
+     a08:	39 e8                	cmp    %ebp,%eax
+     a0a:	74 25                	je     a31 <_pagebuf_find+0xcd>
+     a0c:	8b 54 24 20          	mov    0x20(%esp,1),%edx
+     a10:	8d 58 f4             	lea    0xfffffff4(%eax),%ebx
+     a13:	3b 53 14             	cmp    0x14(%ebx),%edx
+     a16:	75 ee                	jne    a06 <_pagebuf_find+0xa2>
+     a18:	39 73 24             	cmp    %esi,0x24(%ebx)
+     a1b:	75 e9                	jne    a06 <_pagebuf_find+0xa2>
+     a1d:	39 7b 28             	cmp    %edi,0x28(%ebx)
+     a20:	75 e4                	jne    a06 <_pagebuf_find+0xa2>
+     a22:	8b 4c 24 18          	mov    0x18(%esp,1),%ecx
+     a26:	39 4b 2c             	cmp    %ecx,0x2c(%ebx)
+     a29:	75 db                	jne    a06 <_pagebuf_find+0xa2>
+     a2b:	80 78 fc 00          	cmpb   $0x0,0xfffffffc(%eax)
+     a2f:	7d b3                	jge    9e4 <_pagebuf_find+0x80>
+
+	/* No match found */
+	if (new_pb) {
+     a31:	83 7c 24 34 00       	cmpl   $0x0,0x34(%esp,1)
+     a36:	74 4a                	je     a82 <_pagebuf_find+0x11e>
+		_pagebuf_initialize(new_pb, target, range_base,
+     a38:	80 4c 24 32 08       	orb    $0x8,0x32(%esp,1)
+     a3d:	8b 44 24 30          	mov    0x30(%esp,1),%eax
+     a41:	50                   	push   %eax
+     a42:	8b 54 24 1c          	mov    0x1c(%esp,1),%edx
+     a46:	52                   	push   %edx
+     a47:	57                   	push   %edi
+     a48:	56                   	push   %esi
+     a49:	8b 4c 24 30          	mov    0x30(%esp,1),%ecx
+     a4d:	51                   	push   %ecx
+     a4e:	8b 44 24 48          	mov    0x48(%esp,1),%eax
+     a52:	50                   	push   %eax
+     a53:	e8 28 f7 ff ff       	call   180 <_pagebuf_initialize>
+				range_length, flags | _PBF_LOCKABLE);
+		new_pb->pb_hash_index = hval;
+     a58:	8b 4c 24 4c          	mov    0x4c(%esp,1),%ecx
+     a5c:	8a 54 24 2c          	mov    0x2c(%esp,1),%dl
+     a60:	88 91 83 00 00 00    	mov    %dl,0x83(%ecx)
+ */
+static __inline__ void __list_add(struct list_head * new,
+	struct list_head * prev,
+	struct list_head * next)
+{
+     a66:	8b 45 00             	mov    0x0(%ebp),%eax
+		h->pb_count++;
+     a69:	ff 45 08             	incl   0x8(%ebp)
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static __inline__ void list_add(struct list_head *new, struct list_head *head)
+{
+     a6c:	89 ca                	mov    %ecx,%edx
+     a6e:	83 c2 0c             	add    $0xc,%edx
+     a71:	83 c4 18             	add    $0x18,%esp
+     a74:	89 50 04             	mov    %edx,0x4(%eax)
+     a77:	89 41 0c             	mov    %eax,0xc(%ecx)
+     a7a:	89 6a 04             	mov    %ebp,0x4(%edx)
+     a7d:	89 55 00             	mov    %edx,0x0(%ebp)
+		list_add(&new_pb->pb_hash_list, &h->pb_hash);
+	} else {
+     a80:	eb 06                	jmp    a88 <_pagebuf_find+0x124>
+		PB_STATS_INC(pbstats.pb_miss_locked);
+     a82:	ff 05 14 00 00 00    	incl   0x14
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+     a88:	8b 44 24 10          	mov    0x10(%esp,1),%eax
+     a8c:	b2 01                	mov    $0x1,%dl
+     a8e:	81 78 04 ad 4e ad de 	cmpl   $0xdead4ead,0x4(%eax)
+     a95:	74 08                	je     a9f <_pagebuf_find+0x13b>
+		BUG();
+     a97:	0f 0b                	ud2a   
+     a99:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+     a9f:	8a 45 0c             	mov    0xc(%ebp),%al
+     aa2:	84 c0                	test   %al,%al
+     aa4:	7e 08                	jle    aae <_pagebuf_find+0x14a>
+		BUG();
+     aa6:	0f 0b                	ud2a   
+     aa8:	6b 00 00             	imul   $0x0,(%eax),%eax
+     aab:	00 00                	add    %al,(%eax)
+     aad:	00 86 55 0c 8b 44    	add    %al,0x448b0c55(%esi)
+	}
+
+	spin_unlock(&h->pb_hash_lock);
+	return (new_pb);
+     ab3:	24 34                	and    $0x34,%al
+     ab5:	e9 96 00 00 00       	jmp    b50 <_pagebuf_find+0x1ec>
+     aba:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
+ * useful range of an atomic_t is only 24 bits.
+ */ 
+static __inline__ void atomic_inc(atomic_t *v)
+{
+	__asm__ __volatile__(
+     ac0:	f0 ff 43 18          	lock incl 0x18(%ebx)
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+     ac4:	8b 4c 24 10          	mov    0x10(%esp,1),%ecx
+     ac8:	b2 01                	mov    $0x1,%dl
+     aca:	81 79 04 ad 4e ad de 	cmpl   $0xdead4ead,0x4(%ecx)
+     ad1:	74 08                	je     adb <_pagebuf_find+0x177>
+		BUG();
+     ad3:	0f 0b                	ud2a   
+     ad5:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+     adb:	8a 45 0c             	mov    0xc(%ebp),%al
+     ade:	84 c0                	test   %al,%al
+     ae0:	7e 08                	jle    aea <_pagebuf_find+0x186>
+		BUG();
+     ae2:	0f 0b                	ud2a   
+     ae4:	6b 00 00             	imul   $0x0,(%eax),%eax
+     ae7:	00 00                	add    %al,(%eax)
+     ae9:	00 86 55 0c 8d 8b    	add    %al,0x8b8d0c55(%esi)
+ * Non-blockingly attempt to down() a semaphore.
+ * Returns zero if we acquired it.
+ */
+static inline int down_trylock(struct semaphore * sem)
+{
+     aef:	98                   	cwtl   
+     af0:	00 00                	add    %al,(%eax)
+     af2:	00 f0                	add    %dh,%al
+	int result;
+
+#if WAITQUEUE_DEBUG
+	CHECK_MAGIC(sem->__magic);
+#endif
+
+	__asm__ __volatile__(
+     af4:	ff 8b 98 00 00 00    	decl   0x98(%ebx)
+     afa:	0f 88 0d 23 00 00    	js     2e0d <Letext+0x44>
+     b00:	31 c0                	xor    %eax,%eax
+
+found:
+	atomic_inc(&pb->pb_hold);
+	spin_unlock(&h->pb_hash_lock);
+
+	/* Attempt to get the semaphore without sleeping;
+	 * if this does not work, we need to drop the
+	 * spinlock and do a hard attempt on the semaphore.
+	 */
+	not_locked = down_trylock(&PBP(pb)->pb_sema);
+	if (not_locked) {
+     b02:	85 c0                	test   %eax,%eax
+     b04:	74 2b                	je     b31 <_pagebuf_find+0x1cd>
+		if (!(flags & PBF_TRYLOCK)) {
+     b06:	8b 44 24 30          	mov    0x30(%esp,1),%eax
+     b0a:	f6 c4 40             	test   $0x40,%ah
+     b0d:	74 13                	je     b22 <_pagebuf_find+0x1be>
+			/* wait for buffer ownership */
+			PB_TRACE(pb, PB_TRACE_REC(get_lk), 0);
+			pagebuf_lock(pb);
+			PB_STATS_INC(pbstats.pb_get_locked_waited);
+		} else {
+			/* We asked for a trylock and failed; there is
+			 * no need to look at file offset and length
+			 * here, since we know this pagebuf at least
+			 * overlaps our pagebuf and is locked.  Our
+			 * buffer therefore either does not exist, or
+			 * it is this buffer.
+			 */
+
+			pagebuf_rele(pb);
+     b0f:	53                   	push   %ebx
+     b10:	e8 fc ff ff ff       	call   b11 <_pagebuf_find+0x1ad>
+			PB_STATS_INC(pbstats.pb_busy_locked);
+			return (NULL);
+     b15:	31 c0                	xor    %eax,%eax
+     b17:	ff 05 10 00 00 00    	incl   0x10
+     b1d:	83 c4 04             	add    $0x4,%esp
+     b20:	eb 2e                	jmp    b50 <_pagebuf_find+0x1ec>
+     b22:	53                   	push   %ebx
+     b23:	e8 fc ff ff ff       	call   b24 <_pagebuf_find+0x1c0>
+     b28:	ff 05 0c 00 00 00    	incl   0xc
+     b2e:	83 c4 04             	add    $0x4,%esp
+		}
+	} else {
+		/* trylock worked */
+		PB_SET_OWNER(pb);
+	}
+
+	if (pb->pb_flags & PBF_STALE)
+     b31:	8b 43 08             	mov    0x8(%ebx),%eax
+     b34:	f6 c4 04             	test   $0x4,%ah
+     b37:	74 08                	je     b41 <_pagebuf_find+0x1dd>
+		pb->pb_flags &= PBF_MAPPABLE | \
+     b39:	25 04 02 c8 03       	and    $0x3c80204,%eax
+     b3e:	89 43 08             	mov    %eax,0x8(%ebx)
+				PBF_MAPPED | \
+				_PBF_LOCKABLE | \
+				_PBF_ALL_PAGES_MAPPED | \
+				_PBF_SOME_INVALID_PAGES | \
+				_PBF_ADDR_ALLOCATED | \
+				_PBF_MEM_ALLOCATED;
+	PB_TRACE(pb, PB_TRACE_REC(got_lk), 0);
+	PB_STATS_INC(pbstats.pb_get_locked);
+     b41:	ff 05 08 00 00 00    	incl   0x8
+	return (pb);
+     b47:	89 d8                	mov    %ebx,%eax
+}
+     b49:	8d b4 26 00 00 00 00 	lea    0x0(%esi,1),%esi
+     b50:	5b                   	pop    %ebx
+     b51:	5e                   	pop    %esi
+     b52:	5f                   	pop    %edi
+     b53:	5d                   	pop    %ebp
+     b54:	83 c4 0c             	add    $0xc,%esp
+     b57:	c3                   	ret    
+
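+/*
+ * The lookup above keeps each hash chain in MRU order: a hit is
+ * unlinked and re-inserted at the head of its bucket so the next
+ * search finds it sooner.  A minimal model of that move-to-front
+ * step on a circular doubly linked list (buf_t and its key fields
+ * are hypothetical stand-ins for page_buf_t):
+ */
+#include <stddef.h>
+
+typedef struct buf {
+	struct buf	*prev, *next;
+	long long	offset;			/* cf. pb_file_offset */
+	size_t		length;			/* cf. pb_buffer_length */
+} buf_t;
+
+/* head is a dummy node, as with struct list_head */
+static buf_t *find_buf(buf_t *head, long long offset, size_t length)
+{
+	buf_t	*b;
+
+	for (b = head->next; b != head; b = b->next) {
+		if (b->offset != offset || b->length != length)
+			continue;
+		b->prev->next = b->next;	/* list_del() */
+		b->next->prev = b->prev;
+		b->next = head->next;		/* list_add() at the head */
+		b->prev = head;
+		head->next->prev = b;
+		head->next = b;
+		return b;
+	}
+	return NULL;				/* caller may insert new_pb */
+}
+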
+0000000000000b58 <pagebuf_find>:
+
+
+/*
+ *	pagebuf_find
+ *
+ *	pagebuf_find returns a buffer matching the specified range of
+ *	data for the specified target, if any of the relevant blocks
+ *	are in memory.	The buffer may have unallocated holes, if
+ *	some, but not all, of the blocks are in memory.	 Even where
+ *	pages are present in the buffer, not all of every page may be
+ *	valid.	The file system may use pagebuf_segment to visit the
+ *	various segments of the buffer.
+ */
+page_buf_t *
+pagebuf_find(				/* find buffer for block	*/
+					/* if the block is in memory	*/
+	pb_target_t		*target,/* target for block		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
+{
+     b58:	8b 4c 24 04          	mov    0x4(%esp,1),%ecx
+     b5c:	8b 54 24 10          	mov    0x10(%esp,1),%edx
+     b60:	8b 44 24 14          	mov    0x14(%esp,1),%eax
+	return _pagebuf_find(target, ioff, isize, flags, NULL);
+     b64:	6a 00                	push   $0x0
+     b66:	50                   	push   %eax
+     b67:	52                   	push   %edx
+     b68:	8b 44 24 14          	mov    0x14(%esp,1),%eax
+     b6c:	8b 54 24 18          	mov    0x18(%esp,1),%edx
+     b70:	52                   	push   %edx
+     b71:	50                   	push   %eax
+     b72:	51                   	push   %ecx
+     b73:	e8 ec fd ff ff       	call   964 <_pagebuf_find>
+     b78:	83 c4 18             	add    $0x18,%esp
+     b7b:	c3                   	ret    
+
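+/*
+ * A hypothetical caller, sketching the trylock semantics above
+ * (assumes the declarations from this patch's page_buf.h are in
+ * scope): with PBF_TRYLOCK a buffer that is absent, or locked by
+ * someone else, yields NULL rather than blocking.
+ */
+static int peek_range(pb_target_t *target, loff_t ioff, size_t isize)
+{
+	page_buf_t	*pb;
+
+	pb = pagebuf_find(target, ioff, isize, PBF_TRYLOCK);
+	if (!pb)
+		return 0;		/* not cached, or contended */
+	/* pb comes back locked; use it, then drop lock and hold */
+	pagebuf_unlock(pb);
+	pagebuf_rele(pb);
+	return 1;
+}
+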
+0000000000000b7c <pagebuf_get>:
+}
+
+/*
+ *	pagebuf_get
+ *
+ *	pagebuf_get assembles a buffer covering the specified range.
+ *	Some or all of the blocks in the range may be valid.  The file
+ *	system may use pagebuf_segment to visit the various segments
+ *	of the buffer.	Storage in memory for all portions of the
+ *	buffer will be allocated, although backing storage may not be.
+ *	If PBF_READ is set in flags, a read is started (via
+ *	pagebuf_iostart) on any pages that are not already up to date.
+ */
+page_buf_t *
+pagebuf_get(				/* allocate a buffer		*/
+	pb_target_t		*target,/* target for buffer 		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
+{
+     b7c:	57                   	push   %edi
+     b7d:	56                   	push   %esi
+     b7e:	53                   	push   %ebx
+     b7f:	8b 7c 24 20          	mov    0x20(%esp,1),%edi
+	page_buf_t		*pb, *new_pb;
+	int			error;
+
+	new_pb = pagebuf_allocate(flags);
+     b83:	f7 c7 00 00 00 20    	test   $0x20000000,%edi
+     b89:	75 18                	jne    ba3 <pagebuf_get+0x27>
+     b8b:	89 f8                	mov    %edi,%eax
+     b8d:	25 00 00 02 00       	and    $0x20000,%eax
+     b92:	b9 f0 01 00 00       	mov    $0x1f0,%ecx
+     b97:	ba f0 00 00 00       	mov    $0xf0,%edx
+     b9c:	85 c0                	test   %eax,%eax
+     b9e:	0f 45 ca             	cmovne %edx,%ecx
+     ba1:	eb 02                	jmp    ba5 <pagebuf_get+0x29>
+     ba3:	31 c9                	xor    %ecx,%ecx
+     ba5:	a1 24 00 00 00       	mov    0x24,%eax
+     baa:	51                   	push   %ecx
+     bab:	50                   	push   %eax
+     bac:	e8 fc ff ff ff       	call   bad <pagebuf_get+0x31>
+     bb1:	89 c6                	mov    %eax,%esi
+	if (unlikely(!new_pb))
+     bb3:	83 c4 08             	add    $0x8,%esp
+     bb6:	85 f6                	test   %esi,%esi
+     bb8:	74 37                	je     bf1 <pagebuf_get+0x75>
+		return (NULL);
+
+	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
+     bba:	56                   	push   %esi
+     bbb:	57                   	push   %edi
+     bbc:	8b 44 24 24          	mov    0x24(%esp,1),%eax
+     bc0:	50                   	push   %eax
+     bc1:	8b 44 24 20          	mov    0x20(%esp,1),%eax
+     bc5:	8b 54 24 24          	mov    0x24(%esp,1),%edx
+     bc9:	52                   	push   %edx
+     bca:	50                   	push   %eax
+     bcb:	8b 44 24 24          	mov    0x24(%esp,1),%eax
+     bcf:	50                   	push   %eax
+     bd0:	e8 8f fd ff ff       	call   964 <_pagebuf_find>
+     bd5:	89 c3                	mov    %eax,%ebx
+	if (pb != new_pb) {
+     bd7:	83 c4 18             	add    $0x18,%esp
+     bda:	39 f3                	cmp    %esi,%ebx
+     bdc:	74 1a                	je     bf8 <pagebuf_get+0x7c>
+		pagebuf_deallocate(new_pb);
+     bde:	a1 24 00 00 00       	mov    0x24,%eax
+     be3:	56                   	push   %esi
+     be4:	50                   	push   %eax
+     be5:	e8 fc ff ff ff       	call   be6 <pagebuf_get+0x6a>
+		if (unlikely(!pb))
+     bea:	83 c4 08             	add    $0x8,%esp
+     bed:	85 db                	test   %ebx,%ebx
+     bef:	75 07                	jne    bf8 <pagebuf_get+0x7c>
+			return (NULL);
+     bf1:	31 c0                	xor    %eax,%eax
+     bf3:	e9 8f 00 00 00       	jmp    c87 <pagebuf_get+0x10b>
+	}
+
+	PB_STATS_INC(pbstats.pb_get);
+     bf8:	ff 05 00 00 00 00    	incl   0x0
+
+	/* fill in any missing pages */
+	error = _pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping, flags);
+     bfe:	57                   	push   %edi
+     bff:	8b 43 14             	mov    0x14(%ebx),%eax
+     c02:	8b 40 0c             	mov    0xc(%eax),%eax
+     c05:	50                   	push   %eax
+     c06:	53                   	push   %ebx
+     c07:	e8 4c f8 ff ff       	call   458 <_pagebuf_lookup_pages>
+	if (unlikely(error)) {
+     c0c:	83 c4 0c             	add    $0xc,%esp
+     c0f:	85 c0                	test   %eax,%eax
+     c11:	74 0d                	je     c20 <pagebuf_get+0xa4>
+		pagebuf_free(pb);
+     c13:	53                   	push   %ebx
+     c14:	e8 fc ff ff ff       	call   c15 <pagebuf_get+0x99>
+		return (NULL);
+     c19:	eb 59                	jmp    c74 <pagebuf_get+0xf8>
+     c1b:	90                   	nop    
+     c1c:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+	}
+
+	/*
+	 * Always fill in the block number now, the mapped cases can do
+	 * their own overlay of this later.
+	 */
+	pb->pb_bn = ioff;
+     c20:	8b 44 24 14          	mov    0x14(%esp,1),%eax
+     c24:	8b 54 24 18          	mov    0x18(%esp,1),%edx
+     c28:	89 43 1c             	mov    %eax,0x1c(%ebx)
+	pb->pb_count_desired = pb->pb_buffer_length;
+     c2b:	8b 43 2c             	mov    0x2c(%ebx),%eax
+     c2e:	89 53 20             	mov    %edx,0x20(%ebx)
+     c31:	89 43 30             	mov    %eax,0x30(%ebx)
+
+	if (flags & PBF_READ) {
+     c34:	f7 c7 01 00 00 00    	test   $0x1,%edi
+     c3a:	74 49                	je     c85 <pagebuf_get+0x109>
+		if (PBF_NOT_DONE(pb)) {
+     c3c:	8b 43 08             	mov    0x8(%ebx),%eax
+     c3f:	a8 28                	test   $0x28,%al
+     c41:	74 12                	je     c55 <pagebuf_get+0xd9>
+			PB_TRACE(pb, PB_TRACE_REC(get_read), flags);
+			PB_STATS_INC(pbstats.pb_get_read);
+     c43:	ff 05 20 00 00 00    	incl   0x20
+			pagebuf_iostart(pb, flags);
+     c49:	57                   	push   %edi
+     c4a:	53                   	push   %ebx
+     c4b:	e8 fc ff ff ff       	call   c4c <pagebuf_get+0xd0>
+		} else if (flags & PBF_ASYNC) {
+     c50:	83 c4 08             	add    $0x8,%esp
+     c53:	eb 30                	jmp    c85 <pagebuf_get+0x109>
+     c55:	f7 c7 10 00 00 00    	test   $0x10,%edi
+     c5b:	74 23                	je     c80 <pagebuf_get+0x104>
+			/*
+			 * A read-ahead call that is already satisfied;
+			 * drop the buffer.
+			 */
+			if (flags & (PBF_LOCK | PBF_TRYLOCK))
+     c5d:	f7 c7 00 60 00 00    	test   $0x6000,%edi
+     c63:	74 09                	je     c6e <pagebuf_get+0xf2>
+				pagebuf_unlock(pb);
+     c65:	53                   	push   %ebx
+     c66:	e8 fc ff ff ff       	call   c67 <pagebuf_get+0xeb>
+     c6b:	83 c4 04             	add    $0x4,%esp
+			pagebuf_rele(pb);
+     c6e:	53                   	push   %ebx
+     c6f:	e8 fc ff ff ff       	call   c70 <pagebuf_get+0xf4>
+			return NULL;
+     c74:	31 c0                	xor    %eax,%eax
+     c76:	83 c4 04             	add    $0x4,%esp
+     c79:	eb 0c                	jmp    c87 <pagebuf_get+0x10b>
+     c7b:	90                   	nop    
+     c7c:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+		} else {
+			/* We do not want PBF_READ left set in the flags */
+			pb->pb_flags &= ~PBF_READ;
+     c80:	24 fe                	and    $0xfe,%al
+     c82:	89 43 08             	mov    %eax,0x8(%ebx)
+		}
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(get_obj), flags);
+	return (pb);
+     c85:	89 d8                	mov    %ebx,%eax
+}
+     c87:	5b                   	pop    %ebx
+     c88:	5e                   	pop    %esi
+     c89:	5f                   	pop    %edi
+     c8a:	c3                   	ret    
+     c8b:	90                   	nop    
+
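+/*
+ * A hypothetical caller of pagebuf_get(), following the read path
+ * above: PBF_LOCK|PBF_READ asks for a locked buffer with read I/O
+ * started on any pages that are not up to date.  A NULL return
+ * covers allocation failure as well as the satisfied read-ahead
+ * case.
+ */
+static page_buf_t *get_and_read(pb_target_t *target, loff_t ioff, size_t isize)
+{
+	page_buf_t	*pb;
+
+	pb = pagebuf_get(target, ioff, isize,
+			 PBF_LOCK | PBF_READ | PBF_MAPPABLE);
+	if (!pb)
+		return NULL;
+	/* the caller eventually does pagebuf_unlock() + pagebuf_rele() */
+	return pb;
+}
+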
+0000000000000c8c <pagebuf_lookup>:
+
+/*
+ * Create a pagebuf and populate it with pages from the address
+ * space of the passed-in inode.
+ */
+page_buf_t *
+pagebuf_lookup(
+	struct pb_target	*target,
+	struct inode		*inode,
+	loff_t			ioff,
+	size_t			isize,
+	int			flags)
+{
+	page_buf_t		*pb = NULL;
+	int			status;
+
+	flags |= _PBF_PRIVATE_BH;
+     c8c:	56                   	push   %esi
+     c8d:	53                   	push   %ebx
+     c8e:	8b 74 24 20          	mov    0x20(%esp,1),%esi
+     c92:	81 ce 00 00 10 00    	or     $0x100000,%esi
+	pb = pagebuf_allocate(flags);
+     c98:	f7 c6 00 00 00 20    	test   $0x20000000,%esi
+     c9e:	75 18                	jne    cb8 <pagebuf_lookup+0x2c>
+     ca0:	89 f0                	mov    %esi,%eax
+     ca2:	25 00 00 02 00       	and    $0x20000,%eax
+     ca7:	b9 f0 01 00 00       	mov    $0x1f0,%ecx
+     cac:	ba f0 00 00 00       	mov    $0xf0,%edx
+     cb1:	85 c0                	test   %eax,%eax
+     cb3:	0f 45 ca             	cmovne %edx,%ecx
+     cb6:	eb 02                	jmp    cba <pagebuf_lookup+0x2e>
+     cb8:	31 c9                	xor    %ecx,%ecx
+     cba:	a1 24 00 00 00       	mov    0x24,%eax
+     cbf:	51                   	push   %ecx
+     cc0:	50                   	push   %eax
+     cc1:	e8 fc ff ff ff       	call   cc2 <pagebuf_lookup+0x36>
+     cc6:	89 c3                	mov    %eax,%ebx
+	if (pb) {
+     cc8:	83 c4 08             	add    $0x8,%esp
+     ccb:	85 db                	test   %ebx,%ebx
+     ccd:	74 51                	je     d20 <pagebuf_lookup+0x94>
+		_pagebuf_initialize(pb, target, ioff, isize, flags);
+     ccf:	56                   	push   %esi
+     cd0:	8b 44 24 20          	mov    0x20(%esp,1),%eax
+     cd4:	50                   	push   %eax
+     cd5:	8b 44 24 1c          	mov    0x1c(%esp,1),%eax
+     cd9:	8b 54 24 20          	mov    0x20(%esp,1),%edx
+     cdd:	52                   	push   %edx
+     cde:	50                   	push   %eax
+     cdf:	8b 44 24 1c          	mov    0x1c(%esp,1),%eax
+     ce3:	50                   	push   %eax
+     ce4:	53                   	push   %ebx
+     ce5:	e8 96 f4 ff ff       	call   180 <_pagebuf_initialize>
+		if (flags & PBF_ENTER_PAGES) {
+     cea:	83 c4 18             	add    $0x18,%esp
+     ced:	f7 c6 00 00 20 00    	test   $0x200000,%esi
+     cf3:	74 2b                	je     d20 <pagebuf_lookup+0x94>
+			status = _pagebuf_lookup_pages(pb, &inode->i_data, 0);
+     cf5:	6a 00                	push   $0x0
+     cf7:	8b 44 24 14          	mov    0x14(%esp,1),%eax
+     cfb:	05 c0 00 00 00       	add    $0xc0,%eax
+     d00:	50                   	push   %eax
+     d01:	53                   	push   %ebx
+     d02:	e8 51 f7 ff ff       	call   458 <_pagebuf_lookup_pages>
+			if (status != 0) {
+     d07:	83 c4 0c             	add    $0xc,%esp
+     d0a:	85 c0                	test   %eax,%eax
+     d0c:	74 12                	je     d20 <pagebuf_lookup+0x94>
+				pagebuf_free(pb);
+     d0e:	53                   	push   %ebx
+     d0f:	e8 fc ff ff ff       	call   d10 <pagebuf_lookup+0x84>
+				return (NULL);
+     d14:	31 c0                	xor    %eax,%eax
+     d16:	83 c4 04             	add    $0x4,%esp
+     d19:	eb 07                	jmp    d22 <pagebuf_lookup+0x96>
+     d1b:	90                   	nop    
+     d1c:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+			}
+		}
+	}
+	return pb;
+     d20:	89 d8                	mov    %ebx,%eax
+}
+     d22:	5b                   	pop    %ebx
+     d23:	5e                   	pop    %esi
+     d24:	c3                   	ret    
+     d25:	8d 76 00             	lea    0x0(%esi),%esi
+
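+/*
+ * A hypothetical use of pagebuf_lookup() above: build a private
+ * pagebuf over part of an inode's address space.  PBF_ENTER_PAGES
+ * (tested in the code above) makes it populate pb_pages from
+ * inode->i_data immediately; without it the pagebuf is merely
+ * initialized.
+ */
+static page_buf_t *wrap_inode_range(pb_target_t *target,
+	struct inode *inode, loff_t ioff, size_t isize)
+{
+	return pagebuf_lookup(target, inode, ioff, isize, PBF_ENTER_PAGES);
+}
+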
+0000000000000d28 <pagebuf_readahead>:
+
+/*
+ * If we are not low on memory, then do the readahead in a
+ * deadlock-safe manner.
+ */
+void
+pagebuf_readahead(
+	pb_target_t		*target,
+	loff_t			ioff,
+	size_t			isize,
+	int			flags)
+{
+     d28:	8b 4c 24 04          	mov    0x4(%esp,1),%ecx
+     d2c:	8b 54 24 10          	mov    0x10(%esp,1),%edx
+	flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_MAPPABLE|PBF_READ_AHEAD);
+     d30:	8b 44 24 14          	mov    0x14(%esp,1),%eax
+     d34:	0d 11 42 00 20       	or     $0x20004211,%eax
+	pagebuf_get(target, ioff, isize, flags);
+     d39:	50                   	push   %eax
+     d3a:	52                   	push   %edx
+     d3b:	8b 44 24 10          	mov    0x10(%esp,1),%eax
+     d3f:	8b 54 24 14          	mov    0x14(%esp,1),%edx
+     d43:	52                   	push   %edx
+     d44:	50                   	push   %eax
+     d45:	51                   	push   %ecx
+     d46:	e8 fc ff ff ff       	call   d47 <pagebuf_readahead+0x1f>
+}
+     d4b:	83 c4 14             	add    $0x14,%esp
+     d4e:	c3                   	ret    
+     d4f:	90                   	nop    
+
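+/*
+ * The read-ahead pattern this enables, as a hypothetical caller:
+ * fire-and-forget the next range (pagebuf_readahead ORs in
+ * PBF_TRYLOCK|PBF_READ|PBF_ASYNC itself, so it never blocks), then
+ * do the blocking read for the current one.
+ */
+static void read_with_readahead(pb_target_t *target, loff_t ioff, size_t isize)
+{
+	page_buf_t	*pb;
+
+	pagebuf_readahead(target, ioff + isize, isize, 0);
+	pb = pagebuf_get(target, ioff, isize,
+			 PBF_LOCK | PBF_READ | PBF_MAPPABLE);
+	if (pb) {
+		/* consume the data, then release */
+		pagebuf_unlock(pb);
+		pagebuf_rele(pb);
+	}
+}
+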
+0000000000000d50 <pagebuf_get_empty>:
+
+page_buf_t *
+pagebuf_get_empty(
+	pb_target_t		*target)
+{
+	page_buf_t		*pb;
+
+	pb = pagebuf_allocate(_PBF_LOCKABLE);
+     d50:	a1 24 00 00 00       	mov    0x24,%eax
+     d55:	53                   	push   %ebx
+     d56:	68 f0 01 00 00       	push   $0x1f0
+     d5b:	50                   	push   %eax
+     d5c:	e8 fc ff ff ff       	call   d5d <pagebuf_get_empty+0xd>
+     d61:	89 c3                	mov    %eax,%ebx
+	if (pb)
+     d63:	83 c4 08             	add    $0x8,%esp
+     d66:	85 db                	test   %ebx,%ebx
+     d68:	74 19                	je     d83 <pagebuf_get_empty+0x33>
+		_pagebuf_initialize(pb, target, 0, 0, _PBF_LOCKABLE);
+     d6a:	68 00 00 08 00       	push   $0x80000
+     d6f:	6a 00                	push   $0x0
+     d71:	6a 00                	push   $0x0
+     d73:	6a 00                	push   $0x0
+     d75:	8b 44 24 18          	mov    0x18(%esp,1),%eax
+     d79:	50                   	push   %eax
+     d7a:	53                   	push   %ebx
+     d7b:	e8 00 f4 ff ff       	call   180 <_pagebuf_initialize>
+     d80:	83 c4 18             	add    $0x18,%esp
+	return pb;
+     d83:	89 d8                	mov    %ebx,%eax
+     d85:	5b                   	pop    %ebx
+     d86:	c3                   	ret    
+}
+     d87:	90                   	nop    
+
+0000000000000d88 <pagebuf_associate_memory>:
+
+static inline struct page *
+mem_to_page(
+	void			*addr)
+{
+	if (((unsigned long)addr < VMALLOC_START) ||
+            ((unsigned long)addr >= VMALLOC_END)) {
+		return virt_to_page(addr);
+	} else {
+		return vmalloc_to_page(addr);
+	}
+}
+
+int
+pagebuf_associate_memory(
+	page_buf_t		*pb,
+	void			*mem,
+	size_t			len)
+{
+     d88:	55                   	push   %ebp
+     d89:	57                   	push   %edi
+     d8a:	56                   	push   %esi
+     d8b:	53                   	push   %ebx
+     d8c:	8b 7c 24 14          	mov    0x14(%esp,1),%edi
+	int			rval;
+	int			i = 0;
+	size_t			ptr;
+	size_t			end, end_cur;
+	off_t			offset;
+	int			page_count;
+
+	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+     d90:	8b 5c 24 1c          	mov    0x1c(%esp,1),%ebx
+	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
+     d94:	8b 74 24 18          	mov    0x18(%esp,1),%esi
+     d98:	81 c3 ff 0f 00 00    	add    $0xfff,%ebx
+     d9e:	c1 eb 0c             	shr    $0xc,%ebx
+     da1:	89 f5                	mov    %esi,%ebp
+     da3:	81 e5 ff 0f 00 00    	and    $0xfff,%ebp
+	if (offset && (len > PAGE_CACHE_SIZE))
+     da9:	74 0e                	je     db9 <pagebuf_associate_memory+0x31>
+     dab:	8d 43 01             	lea    0x1(%ebx),%eax
+     dae:	81 7c 24 1c 01 10 00 	cmpl   $0x1001,0x1c(%esp,1)
+     db5:	00 
+     db6:	0f 43 d8             	cmovae %eax,%ebx
+		page_count++;
+
+	/* Free any previous set of page pointers */
+	if (pb->pb_pages && (pb->pb_pages != pb->pb_page_array)) {
+     db9:	8b 97 84 00 00 00    	mov    0x84(%edi),%edx
+     dbf:	85 d2                	test   %edx,%edx
+     dc1:	74 13                	je     dd6 <pagebuf_associate_memory+0x4e>
+     dc3:	8d 87 88 00 00 00    	lea    0x88(%edi),%eax
+     dc9:	39 c2                	cmp    %eax,%edx
+     dcb:	74 09                	je     dd6 <pagebuf_associate_memory+0x4e>
+		kfree(pb->pb_pages);
+     dcd:	52                   	push   %edx
+     dce:	e8 fc ff ff ff       	call   dcf <pagebuf_associate_memory+0x47>
+	}
+     dd3:	83 c4 04             	add    $0x4,%esp
+	pb->pb_pages = NULL;
+     dd6:	c7 87 84 00 00 00 00 	movl   $0x0,0x84(%edi)
+     ddd:	00 00 00 
+	pb->pb_addr = mem;
+     de0:	89 77 34             	mov    %esi,0x34(%edi)
+
+	rval = _pagebuf_get_pages(pb, page_count, 0);
+     de3:	6a 00                	push   $0x0
+     de5:	53                   	push   %ebx
+     de6:	57                   	push   %edi
+     de7:	e8 b8 f4 ff ff       	call   2a4 <_pagebuf_get_pages>
+	if (rval)
+     dec:	83 c4 0c             	add    $0xc,%esp
+     def:	85 c0                	test   %eax,%eax
+     df1:	0f 85 f6 00 00 00    	jne    eed <pagebuf_associate_memory+0x165>
+		return rval;
+
+	pb->pb_offset = offset;
+     df7:	66 89 af 80 00 00 00 	mov    %bp,0x80(%edi)
+	ptr = (size_t) mem & PAGE_CACHE_MASK;
+	end = PAGE_CACHE_ALIGN((size_t) mem + len);
+     dfe:	8b 44 24 1c          	mov    0x1c(%esp,1),%eax
+     e02:	8d ac 30 ff 0f 00 00 	lea    0xfff(%eax,%esi,1),%ebp
+     e09:	a1 00 00 00 00       	mov    0x0,%eax
+     e0e:	89 f3                	mov    %esi,%ebx
+     e10:	81 e3 00 f0 ff ff    	and    $0xfffff000,%ebx
+     e16:	81 e5 00 f0 ff ff    	and    $0xfffff000,%ebp
+     e1c:	05 ff ff ff 00       	add    $0xffffff,%eax
+     e21:	25 00 00 80 ff       	and    $0xff800000,%eax
+     e26:	39 c6                	cmp    %eax,%esi
+     e28:	72 08                	jb     e32 <pagebuf_associate_memory+0xaa>
+     e2a:	81 fe ff df ff fd    	cmp    $0xfdffdfff,%esi
+     e30:	76 1e                	jbe    e50 <pagebuf_associate_memory+0xc8>
+     e32:	8d 86 00 00 00 40    	lea    0x40000000(%esi),%eax
+     e38:	c1 e8 0c             	shr    $0xc,%eax
+     e3b:	8d 14 40             	lea    (%eax,%eax,2),%edx
+     e3e:	c1 e2 04             	shl    $0x4,%edx
+     e41:	03 15 00 00 00 00    	add    0x0,%edx
+     e47:	eb 12                	jmp    e5b <pagebuf_associate_memory+0xd3>
+     e49:	8d b4 26 00 00 00 00 	lea    0x0(%esi,1),%esi
+     e50:	56                   	push   %esi
+     e51:	e8 fc ff ff ff       	call   e52 <pagebuf_associate_memory+0xca>
+     e56:	89 c2                	mov    %eax,%edx
+     e58:	83 c4 04             	add    $0x4,%esp
+	end_cur = end;
+	/* set up first page */
+	pb->pb_pages[0] = mem_to_page(mem);
+     e5b:	8b 87 84 00 00 00    	mov    0x84(%edi),%eax
+     e61:	89 10                	mov    %edx,(%eax)
+
+	ptr += PAGE_CACHE_SIZE;
+     e63:	81 c3 00 10 00 00    	add    $0x1000,%ebx
+	pb->pb_page_count = ++i;
+     e69:	be 01 00 00 00       	mov    $0x1,%esi
+     e6e:	66 c7 47 7e 01 00    	movw   $0x1,0x7e(%edi)
+	while (ptr < end) {
+     e74:	39 eb                	cmp    %ebp,%ebx
+     e76:	73 5b                	jae    ed3 <pagebuf_associate_memory+0x14b>
+     e78:	a1 00 00 00 00       	mov    0x0,%eax
+     e7d:	05 ff ff ff 00       	add    $0xffffff,%eax
+     e82:	25 00 00 80 ff       	and    $0xff800000,%eax
+     e87:	39 c3                	cmp    %eax,%ebx
+     e89:	72 08                	jb     e93 <pagebuf_associate_memory+0x10b>
+     e8b:	81 fb ff df ff fd    	cmp    $0xfdffdfff,%ebx
+     e91:	76 1d                	jbe    eb0 <pagebuf_associate_memory+0x128>
+     e93:	8d 83 00 00 00 40    	lea    0x40000000(%ebx),%eax
+     e99:	c1 e8 0c             	shr    $0xc,%eax
+     e9c:	8d 14 40             	lea    (%eax,%eax,2),%edx
+     e9f:	c1 e2 04             	shl    $0x4,%edx
+     ea2:	03 15 00 00 00 00    	add    0x0,%edx
+     ea8:	eb 11                	jmp    ebb <pagebuf_associate_memory+0x133>
+     eaa:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
+     eb0:	53                   	push   %ebx
+     eb1:	e8 fc ff ff ff       	call   eb2 <pagebuf_associate_memory+0x12a>
+     eb6:	89 c2                	mov    %eax,%edx
+     eb8:	83 c4 04             	add    $0x4,%esp
+		pb->pb_pages[i] = mem_to_page((void *)ptr);
+     ebb:	8b 87 84 00 00 00    	mov    0x84(%edi),%eax
+     ec1:	89 14 b0             	mov    %edx,(%eax,%esi,4)
+		pb->pb_page_count = ++i;
+     ec4:	46                   	inc    %esi
+     ec5:	66 89 77 7e          	mov    %si,0x7e(%edi)
+		ptr += PAGE_CACHE_SIZE;
+     ec9:	81 c3 00 10 00 00    	add    $0x1000,%ebx
+	}
+     ecf:	39 eb                	cmp    %ebp,%ebx
+     ed1:	72 a5                	jb     e78 <pagebuf_associate_memory+0xf0>
+	pb->pb_locked = 0;
+     ed3:	c6 87 82 00 00 00 00 	movb   $0x0,0x82(%edi)
+
+	pb->pb_count_desired = pb->pb_buffer_length = len;
+     eda:	8b 44 24 1c          	mov    0x1c(%esp,1),%eax
+     ede:	89 47 2c             	mov    %eax,0x2c(%edi)
+     ee1:	89 47 30             	mov    %eax,0x30(%edi)
+	pb->pb_flags |= PBF_MAPPED | _PBF_PRIVATE_BH;
+     ee4:	81 4f 08 04 00 10 00 	orl    $0x100004,0x8(%edi)
+
+	return 0;
+     eeb:	31 c0                	xor    %eax,%eax
+     eed:	5b                   	pop    %ebx
+     eee:	5e                   	pop    %esi
+     eef:	5f                   	pop    %edi
+     ef0:	5d                   	pop    %ebp
+     ef1:	c3                   	ret    
+}
+     ef2:	89 f6                	mov    %esi,%esi
+
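+/*
+ * pagebuf_get_empty() and pagebuf_associate_memory() combine to
+ * build a buffer over caller-supplied memory: the first makes a
+ * bare pagebuf, the second points it at an existing region,
+ * computing the page count and pb_offset as shown above.  A
+ * hypothetical caller:
+ */
+static page_buf_t *wrap_memory(pb_target_t *target, void *mem, size_t len)
+{
+	page_buf_t	*pb;
+
+	pb = pagebuf_get_empty(target);
+	if (!pb)
+		return NULL;
+	if (pagebuf_associate_memory(pb, mem, len) != 0) {
+		pagebuf_free(pb);
+		return NULL;
+	}
+	return pb;
+}
+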
+0000000000000ef4 <pagebuf_get_no_daddr>:
+
+page_buf_t *
+pagebuf_get_no_daddr(
+	size_t			len,
+	pb_target_t		*target)
+{
+     ef4:	55                   	push   %ebp
+     ef5:	57                   	push   %edi
+     ef6:	56                   	push   %esi
+     ef7:	53                   	push   %ebx
+     ef8:	8b 6c 24 14          	mov    0x14(%esp,1),%ebp
+	int			rval;
+	void			*rmem = NULL;
+     efc:	31 f6                	xor    %esi,%esi
+	int			flags = _PBF_LOCKABLE | PBF_FORCEIO;
+	page_buf_t		*pb;
+	size_t			tlen = 0;
+     efe:	31 ff                	xor    %edi,%edi
+
+	if (len > 0x20000)
+     f00:	81 fd 00 00 02 00    	cmp    $0x20000,%ebp
+     f06:	77 19                	ja     f21 <pagebuf_get_no_daddr+0x2d>
+		return(NULL);
+
+	pb = pagebuf_allocate(flags);
+     f08:	a1 24 00 00 00       	mov    0x24,%eax
+     f0d:	68 f0 01 00 00       	push   $0x1f0
+     f12:	50                   	push   %eax
+     f13:	e8 fc ff ff ff       	call   f14 <pagebuf_get_no_daddr+0x20>
+     f18:	89 c3                	mov    %eax,%ebx
+	if (!pb)
+     f1a:	83 c4 08             	add    $0x8,%esp
+     f1d:	85 db                	test   %ebx,%ebx
+     f1f:	75 07                	jne    f28 <pagebuf_get_no_daddr+0x34>
+		return NULL;
+     f21:	31 c0                	xor    %eax,%eax
+     f23:	e9 98 00 00 00       	jmp    fc0 <pagebuf_get_no_daddr+0xcc>
+
+	_pagebuf_initialize(pb, target, 0, len, flags);
+     f28:	68 00 00 08 08       	push   $0x8080000
+     f2d:	55                   	push   %ebp
+     f2e:	6a 00                	push   $0x0
+     f30:	6a 00                	push   $0x0
+     f32:	8b 44 24 28          	mov    0x28(%esp,1),%eax
+     f36:	50                   	push   %eax
+     f37:	53                   	push   %ebx
+     f38:	e8 43 f2 ff ff       	call   180 <_pagebuf_initialize>
+
+	do {
+     f3d:	83 c4 18             	add    $0x18,%esp
+		if (tlen == 0) {
+     f40:	85 ff                	test   %edi,%edi
+     f42:	75 04                	jne    f48 <pagebuf_get_no_daddr+0x54>
+			tlen = len; /* first time */
+     f44:	89 ef                	mov    %ebp,%edi
+		} else {
+     f46:	eb 0b                	jmp    f53 <pagebuf_get_no_daddr+0x5f>
+			kfree(rmem); /* free the mem from the previous try */
+     f48:	56                   	push   %esi
+     f49:	e8 fc ff ff ff       	call   f4a <pagebuf_get_no_daddr+0x56>
+			tlen <<= 1; /* double the size and try again */
+     f4e:	01 ff                	add    %edi,%edi
+			/*
+			printk(
+			"pb_get_no_daddr NOT block 0x%p mask 0x%p len %d\n",
+				rmem, ((size_t)rmem & (size_t)~SECTOR_MASK),
+				len);
+			*/
+		}
+     f50:	83 c4 04             	add    $0x4,%esp
+		if ((rmem = kmalloc(tlen, GFP_KERNEL)) == 0) {
+     f53:	68 f0 01 00 00       	push   $0x1f0
+     f58:	57                   	push   %edi
+     f59:	e8 fc ff ff ff       	call   f5a <pagebuf_get_no_daddr+0x66>
+     f5e:	89 c6                	mov    %eax,%esi
+     f60:	83 c4 08             	add    $0x8,%esp
+     f63:	85 f6                	test   %esi,%esi
+     f65:	74 2b                	je     f92 <pagebuf_get_no_daddr+0x9e>
+			pagebuf_free(pb);
+			return NULL;
+		}
+	} while ((size_t)rmem != ((size_t)rmem & (size_t)~SECTOR_MASK));
+     f67:	25 00 fe ff ff       	and    $0xfffffe00,%eax
+     f6c:	39 c6                	cmp    %eax,%esi
+     f6e:	75 d0                	jne    f40 <pagebuf_get_no_daddr+0x4c>
+
+	if ((rval = pagebuf_associate_memory(pb, rmem, len)) != 0) {
+     f70:	55                   	push   %ebp
+     f71:	56                   	push   %esi
+     f72:	53                   	push   %ebx
+     f73:	e8 fc ff ff ff       	call   f74 <pagebuf_get_no_daddr+0x80>
+     f78:	83 c4 0c             	add    $0xc,%esp
+     f7b:	85 c0                	test   %eax,%eax
+     f7d:	74 21                	je     fa0 <pagebuf_get_no_daddr+0xac>
+		kfree(rmem);
+     f7f:	56                   	push   %esi
+     f80:	e8 fc ff ff ff       	call   f81 <pagebuf_get_no_daddr+0x8d>
+		pagebuf_free(pb);
+     f85:	53                   	push   %ebx
+     f86:	e8 fc ff ff ff       	call   f87 <pagebuf_get_no_daddr+0x93>
+		return NULL;
+     f8b:	31 c0                	xor    %eax,%eax
+     f8d:	83 c4 08             	add    $0x8,%esp
+     f90:	eb 2e                	jmp    fc0 <pagebuf_get_no_daddr+0xcc>
+     f92:	53                   	push   %ebx
+     f93:	e8 fc ff ff ff       	call   f94 <pagebuf_get_no_daddr+0xa0>
+     f98:	31 c0                	xor    %eax,%eax
+     f9a:	83 c4 04             	add    $0x4,%esp
+     f9d:	eb 21                	jmp    fc0 <pagebuf_get_no_daddr+0xcc>
+     f9f:	90                   	nop    
+	}
+	/* otherwise pagebuf_free just ignores it */
+	pb->pb_flags |= _PBF_MEM_ALLOCATED;
+     fa0:	80 4b 0b 02          	orb    $0x2,0xb(%ebx)
+ * The default case (no contention) will result in NO
+ * jumps for both down() and up().
+ */
+static inline void up(struct semaphore * sem)
+{
+     fa4:	8d 8b 98 00 00 00    	lea    0x98(%ebx),%ecx
+#if WAITQUEUE_DEBUG
+	CHECK_MAGIC(sem->__magic);
+#endif
+	__asm__ __volatile__(
+     faa:	f0 ff 83 98 00 00 00 	lock incl 0x98(%ebx)
+     fb1:	0f 8e 60 1e 00 00    	jle    2e17 <Letext+0x4e>
+	up(&PBP(pb)->pb_sema);	/* Return unlocked pagebuf */
+
+	PB_TRACE(pb, PB_TRACE_REC(no_daddr), rmem);
+
+	return pb;
+     fb7:	89 d8                	mov    %ebx,%eax
+}
+     fb9:	8d b4 26 00 00 00 00 	lea    0x0(%esi,1),%esi
+     fc0:	5b                   	pop    %ebx
+     fc1:	5e                   	pop    %esi
+     fc2:	5f                   	pop    %edi
+     fc3:	5d                   	pop    %ebp
+     fc4:	c3                   	ret    
+     fc5:	8d 76 00             	lea    0x0(%esi),%esi
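+
+/*
+ * The allocation loop above is a retry heuristic: kmalloc() makes no
+ * sector-alignment promise, so the request size is doubled until the
+ * returned block happens to start on a sector boundary.  Standalone
+ * restatement (sketch only; SECTOR_MASK as used in the source):
+ */
+static void *example_sector_aligned_alloc(size_t len)
+{
+	size_t		tlen = len;
+	void		*p;
+
+	while ((p = kmalloc(tlen, GFP_KERNEL)) != NULL &&
+	       ((size_t)p & (size_t)SECTOR_MASK) != 0) {
+		kfree(p);		/* misaligned: double and retry */
+		tlen <<= 1;
+	}
+	return p;			/* NULL if kmalloc failed first */
+}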
+
+0000000000000fc8 <pagebuf_hold>:
+
+
+/*
+ *	pagebuf_hold
+ *
+ *	Increment reference count on buffer, to hold the buffer concurrently
+ *	with another thread which may release (free) the buffer asynchronously.
+ *
+ *	Must hold the buffer already to call this function.
+ */
+void
+pagebuf_hold(
+	page_buf_t		*pb)
+{
+     fc8:	8b 44 24 04          	mov    0x4(%esp,1),%eax
+ * useful range of an atomic_t is only 24 bits.
+ */ 
+static __inline__ void atomic_inc(atomic_t *v)
+{
+	__asm__ __volatile__(
+     fcc:	f0 ff 40 18          	lock incl 0x18(%eax)
+     fd0:	c3                   	ret    
+	atomic_inc(&pb->pb_hold);
+	PB_TRACE(pb, PB_TRACE_REC(hold), 0);
+}
+     fd1:	8d 76 00             	lea    0x0(%esi),%esi
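+
+/*
+ * Sketch of the hold protocol (the consumer function is hypothetical):
+ * a caller that already owns a reference takes another before handing
+ * the buffer to an asynchronous consumer, so a pagebuf_rele() from
+ * that consumer cannot free the buffer underneath us.
+ */
+static void example_hand_off(page_buf_t *pb)	/* caller holds pb */
+{
+	pagebuf_hold(pb);		/* reference for the async side */
+	start_async_consumer(pb);	/* hypothetical; releases pb later */
+}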
+
+0000000000000fd4 <pagebuf_free>:
+
+/*
+ *	pagebuf_free
+ *
+ *	pagebuf_free releases the specified buffer.  The modification
+ *	state of any associated pages is left unchanged.
+ */
+void
+pagebuf_free(
+	page_buf_t		*pb)
+{
+     fd4:	57                   	push   %edi
+     fd5:	56                   	push   %esi
+     fd6:	53                   	push   %ebx
+     fd7:	8b 7c 24 10          	mov    0x10(%esp,1),%edi
+	if (pb->pb_flags & _PBF_LOCKABLE) {
+     fdb:	f6 47 0a 08          	testb  $0x8,0xa(%edi)
+     fdf:	74 44                	je     1025 <pagebuf_free+0x51>
+		pb_hash_t	*h = pb_hash(pb);
+     fe1:	0f b6 87 83 00 00 00 	movzbl 0x83(%edi),%eax
+     fe8:	8d 04 80             	lea    (%eax,%eax,4),%eax
+     feb:	8d 34 85 60 03 00 00 	lea    0x360(,%eax,4),%esi
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+     ff2:	8d 5e 0c             	lea    0xc(%esi),%ebx
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+     ff5:	81 7b 04 ad 4e ad de 	cmpl   $0xdead4ead,0x4(%ebx)
+     ffc:	74 1a                	je     1018 <pagebuf_free+0x44>
+printk("eip: %p\n", &&here);
+     ffe:	68 f5 0f 00 00       	push   $0xff5
+    1003:	68 2b 00 00 00       	push   $0x2b
+    1008:	e8 fc ff ff ff       	call   1009 <pagebuf_free+0x35>
+		BUG();
+    100d:	0f 0b                	ud2a   
+    100f:	85 00                	test   %eax,(%eax)
+    1011:	00 00                	add    %al,(%eax)
+    1013:	00 00                	add    %al,(%eax)
+	}
+    1015:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+    1018:	f0 fe 0b             	lock decb (%ebx)
+    101b:	0f 88 00 1e 00 00    	js     2e21 <Letext+0x58>
+
+		spin_lock(&h->pb_hash_lock);
+		_pagebuf_free_object(h, pb);
+    1021:	57                   	push   %edi
+    1022:	56                   	push   %esi
+	} else {
+    1023:	eb 03                	jmp    1028 <pagebuf_free+0x54>
+		_pagebuf_free_object(NULL, pb);
+    1025:	57                   	push   %edi
+    1026:	6a 00                	push   $0x0
+    1028:	e8 fc ff ff ff       	call   1029 <pagebuf_free+0x55>
+	}
+    102d:	83 c4 08             	add    $0x8,%esp
+    1030:	5b                   	pop    %ebx
+    1031:	5e                   	pop    %esi
+    1032:	5f                   	pop    %edi
+    1033:	c3                   	ret    
+
+0000000000001034 <pagebuf_rele>:
+}
+
+/*
+ *	pagebuf_rele
+ *
+ *	pagebuf_rele releases a hold on the specified buffer.  If the
+ *	hold count is 1, pagebuf_rele calls pagebuf_free.
+ */
+void
+pagebuf_rele(
+	page_buf_t		*pb)
+{
+    1034:	57                   	push   %edi
+    1035:	56                   	push   %esi
+    1036:	53                   	push   %ebx
+    1037:	8b 74 24 10          	mov    0x10(%esp,1),%esi
+	pb_hash_t		*h;
+
+	PB_TRACE(pb, PB_TRACE_REC(rele), pb->pb_relse);
+	if (pb->pb_flags & _PBF_LOCKABLE) {
+    103b:	f6 46 0a 08          	testb  $0x8,0xa(%esi)
+    103f:	74 42                	je     1083 <pagebuf_rele+0x4f>
+		h = pb_hash(pb);
+    1041:	0f b6 86 83 00 00 00 	movzbl 0x83(%esi),%eax
+    1048:	8d 04 80             	lea    (%eax,%eax,4),%eax
+    104b:	8d 1c 85 60 03 00 00 	lea    0x360(,%eax,4),%ebx
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+    1052:	8d 7b 0c             	lea    0xc(%ebx),%edi
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+    1055:	81 7f 04 ad 4e ad de 	cmpl   $0xdead4ead,0x4(%edi)
+    105c:	74 1a                	je     1078 <pagebuf_rele+0x44>
+printk("eip: %p\n", &&here);
+    105e:	68 55 10 00 00       	push   $0x1055
+    1063:	68 2b 00 00 00       	push   $0x2b
+    1068:	e8 fc ff ff ff       	call   1069 <pagebuf_rele+0x35>
+		BUG();
+    106d:	0f 0b                	ud2a   
+    106f:	85 00                	test   %eax,(%eax)
+    1071:	00 00                	add    %al,(%eax)
+    1073:	00 00                	add    %al,(%eax)
+	}
+    1075:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+    1078:	f0 fe 0f             	lock decb (%edi)
+    107b:	0f 88 ac 1d 00 00    	js     2e2d <Letext+0x64>
+		spin_lock(&h->pb_hash_lock);
+	} else {
+    1081:	eb 02                	jmp    1085 <pagebuf_rele+0x51>
+		h = NULL;
+    1083:	31 db                	xor    %ebx,%ebx
+static __inline__ int atomic_dec_and_test(atomic_t *v)
+{
+	unsigned char c;
+
+	__asm__ __volatile__(
+    1085:	f0 ff 4e 18          	lock decl 0x18(%esi)
+    1089:	0f 94 c0             	sete   %al
+	}
+
+	if (atomic_dec_and_test(&pb->pb_hold)) {
+    108c:	84 c0                	test   %al,%al
+    108e:	0f 84 cd 00 00 00    	je     1161 <pagebuf_rele+0x12d>
+		int		do_free = 1;
+    1094:	ba 01 00 00 00       	mov    $0x1,%edx
+
+		if (pb->pb_relse) {
+    1099:	83 7e 50 00          	cmpl   $0x0,0x50(%esi)
+    109d:	74 36                	je     10d5 <pagebuf_rele+0xa1>
+ * useful range of an atomic_t is only 24 bits.
+ */ 
+static __inline__ void atomic_inc(atomic_t *v)
+{
+	__asm__ __volatile__(
+    109f:	f0 ff 46 18          	lock incl 0x18(%esi)
+			atomic_inc(&pb->pb_hold);
+			if (h)
+    10a3:	85 db                	test   %ebx,%ebx
+    10a5:	74 23                	je     10ca <pagebuf_rele+0x96>
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+    10a7:	81 7b 10 ad 4e ad de 	cmpl   $0xdead4ead,0x10(%ebx)
+    10ae:	74 08                	je     10b8 <pagebuf_rele+0x84>
+		BUG();
+    10b0:	0f 0b                	ud2a   
+    10b2:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+    10b8:	8a 43 0c             	mov    0xc(%ebx),%al
+    10bb:	84 c0                	test   %al,%al
+    10bd:	7e 08                	jle    10c7 <pagebuf_rele+0x93>
+		BUG();
+    10bf:	0f 0b                	ud2a   
+    10c1:	6b 00 00             	imul   $0x0,(%eax),%eax
+    10c4:	00 00                	add    %al,(%eax)
+    10c6:	00 86 53 0c 56 8b    	add    %al,0x8b560c53(%esi)
+				spin_unlock(&h->pb_hash_lock);
+			(*(pb->pb_relse)) (pb);
+    10cc:	46                   	inc    %esi
+    10cd:	50                   	push   %eax
+    10ce:	ff d0                	call   *%eax
+			do_free = 0;
+    10d0:	31 d2                	xor    %edx,%edx
+		}
+    10d2:	83 c4 04             	add    $0x4,%esp
+		if (pb->pb_flags & PBF_DELWRI) {
+    10d5:	8b 46 08             	mov    0x8(%esi),%eax
+    10d8:	a8 40                	test   $0x40,%al
+    10da:	74 45                	je     1121 <pagebuf_rele+0xed>
+			pb->pb_flags |= PBF_ASYNC;
+    10dc:	0c 10                	or     $0x10,%al
+    10de:	89 46 08             	mov    %eax,0x8(%esi)
+ * useful range of an atomic_t is only 24 bits.
+ */ 
+static __inline__ void atomic_inc(atomic_t *v)
+{
+	__asm__ __volatile__(
+    10e1:	f0 ff 46 18          	lock incl 0x18(%esi)
+			atomic_inc(&pb->pb_hold);
+			if (h && do_free)
+    10e5:	85 db                	test   %ebx,%ebx
+    10e7:	74 29                	je     1112 <pagebuf_rele+0xde>
+    10e9:	85 d2                	test   %edx,%edx
+    10eb:	74 25                	je     1112 <pagebuf_rele+0xde>
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+    10ed:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+    10ef:	81 7b 10 ad 4e ad de 	cmpl   $0xdead4ead,0x10(%ebx)
+    10f6:	74 08                	je     1100 <pagebuf_rele+0xcc>
+		BUG();
+    10f8:	0f 0b                	ud2a   
+    10fa:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+    1100:	8a 43 0c             	mov    0xc(%ebx),%al
+    1103:	84 c0                	test   %al,%al
+    1105:	7e 08                	jle    110f <pagebuf_rele+0xdb>
+		BUG();
+    1107:	0f 0b                	ud2a   
+    1109:	6b 00 00             	imul   $0x0,(%eax),%eax
+    110c:	00 00                	add    %al,(%eax)
+    110e:	00 86 53 0c 6a 00    	add    %al,0x6a0c53(%esi)
+				spin_unlock(&h->pb_hash_lock);
+			pagebuf_delwri_queue(pb, 0);
+    1114:	56                   	push   %esi
+    1115:	e8 fc ff ff ff       	call   1116 <pagebuf_rele+0xe2>
+			do_free = 0;
+    111a:	31 d2                	xor    %edx,%edx
+		} else if (pb->pb_flags & PBF_FS_MANAGED) {
+    111c:	83 c4 08             	add    $0x8,%esp
+    111f:	eb 30                	jmp    1151 <pagebuf_rele+0x11d>
+    1121:	f6 c4 08             	test   $0x8,%ah
+    1124:	74 2b                	je     1151 <pagebuf_rele+0x11d>
+			if (h)
+    1126:	85 db                	test   %ebx,%ebx
+    1128:	74 25                	je     114f <pagebuf_rele+0x11b>
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+    112a:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+    112c:	81 7b 10 ad 4e ad de 	cmpl   $0xdead4ead,0x10(%ebx)
+    1133:	74 08                	je     113d <pagebuf_rele+0x109>
+		BUG();
+    1135:	0f 0b                	ud2a   
+    1137:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+    113d:	8a 43 0c             	mov    0xc(%ebx),%al
+    1140:	84 c0                	test   %al,%al
+    1142:	7e 08                	jle    114c <pagebuf_rele+0x118>
+		BUG();
+    1144:	0f 0b                	ud2a   
+    1146:	6b 00 00             	imul   $0x0,(%eax),%eax
+    1149:	00 00                	add    %al,(%eax)
+    114b:	00 86 53 0c 31 d2    	add    %al,0xd2310c53(%esi)
+				spin_unlock(&h->pb_hash_lock);
+			do_free = 0;
+		}
+
+		if (do_free) {
+    1151:	85 d2                	test   %edx,%edx
+    1153:	74 35                	je     118a <pagebuf_rele+0x156>
+			_pagebuf_free_object(h, pb);
+    1155:	56                   	push   %esi
+    1156:	53                   	push   %ebx
+    1157:	e8 fc ff ff ff       	call   1158 <pagebuf_rele+0x124>
+		}
+    115c:	83 c4 08             	add    $0x8,%esp
+	} else if (h) {
+    115f:	eb 29                	jmp    118a <pagebuf_rele+0x156>
+    1161:	85 db                	test   %ebx,%ebx
+    1163:	74 25                	je     118a <pagebuf_rele+0x156>
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+    1165:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+    1167:	81 7b 10 ad 4e ad de 	cmpl   $0xdead4ead,0x10(%ebx)
+    116e:	74 08                	je     1178 <pagebuf_rele+0x144>
+		BUG();
+    1170:	0f 0b                	ud2a   
+    1172:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+    1178:	8a 43 0c             	mov    0xc(%ebx),%al
+    117b:	84 c0                	test   %al,%al
+    117d:	7e 08                	jle    1187 <pagebuf_rele+0x153>
+		BUG();
+    117f:	0f 0b                	ud2a   
+    1181:	6b 00 00             	imul   $0x0,(%eax),%eax
+    1184:	00 00                	add    %al,(%eax)
+    1186:	00 86 53 0c 5b 5e    	add    %al,0x5e5b0c53(%esi)
+		spin_unlock(&h->pb_hash_lock);
+	}
+    118c:	5f                   	pop    %edi
+    118d:	c3                   	ret    
+}
+    118e:	89 f6                	mov    %esi,%esi
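+
+/*
+ * One possible shape for a pb_relse callback (sketch; the cleanup is
+ * hypothetical).  When pb_relse is set, pagebuf_rele() above re-takes
+ * a hold and lets the callback decide the buffer's fate instead of
+ * freeing it directly.
+ */
+static void example_relse(page_buf_t *pb)
+{
+	/* tear down private state hanging off pb here, then ... */
+	pb->pb_relse = NULL;		/* don't recurse into ourselves */
+	pagebuf_rele(pb);		/* drop the hold taken for us */
+}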
+
+0000000000001190 <pagebuf_pin>:
+
+
+/*
+ *	Pinning Buffer Storage in Memory
+ */
+
+/*
+ *	pagebuf_pin
+ *
+ *	pagebuf_pin locks down all of the memory represented by a
+ *	buffer.  Multiple calls to pagebuf_pin and pagebuf_unpin, for
+ *	the same or different buffers affecting a given page, will
+ *	properly count the number of outstanding "pin" requests.  The
+ *	buffer may be released after the pagebuf_pin and a different
+ *	buffer used when calling pagebuf_unpin, if desired.
+ *	pagebuf_pin should be used by the file system when it wants to be
+ *	assured that no attempt will be made to force the affected
+ *	memory to disk.	 It does not assure that a given logical page
+ *	will not be moved to a different physical page.
+ */
+void
+pagebuf_pin(
+	page_buf_t		*pb)
+{
+    1190:	8b 44 24 04          	mov    0x4(%esp,1),%eax
+ * useful range of an atomic_t is only 24 bits.
+ */ 
+static __inline__ void atomic_inc(atomic_t *v)
+{
+	__asm__ __volatile__(
+    1194:	f0 ff 80 b8 00 00 00 	lock incl 0xb8(%eax)
+    119b:	c3                   	ret    
+
+000000000000119c <pagebuf_unpin>:
+	atomic_inc(&PBP(pb)->pb_pin_count);
+	PB_TRACE(pb, PB_TRACE_REC(pin), PBP(pb)->pb_pin_count.counter);
+}
+
+/*
+ *	pagebuf_unpin
+ *
+ *	pagebuf_unpin reverses the locking of memory performed by
+ *	pagebuf_pin.  Note that both functions affect the logical
+ *	pages associated with the buffer, not the buffer itself.
+ */
+void
+pagebuf_unpin(
+	page_buf_t		*pb)
+{
+    119c:	8b 54 24 04          	mov    0x4(%esp,1),%edx
+static __inline__ int atomic_dec_and_test(atomic_t *v)
+{
+	unsigned char c;
+
+	__asm__ __volatile__(
+    11a0:	f0 ff 8a b8 00 00 00 	lock decl 0xb8(%edx)
+    11a7:	0f 94 c0             	sete   %al
+	if (atomic_dec_and_test(&PBP(pb)->pb_pin_count)) {
+    11aa:	84 c0                	test   %al,%al
+    11ac:	74 12                	je     11c0 <pagebuf_unpin+0x24>
+		wake_up_all(&PBP(pb)->pb_waiters);
+    11ae:	8d 82 bc 00 00 00    	lea    0xbc(%edx),%eax
+    11b4:	31 c9                	xor    %ecx,%ecx
+    11b6:	ba 03 00 00 00       	mov    $0x3,%edx
+    11bb:	e8 fc ff ff ff       	call   11bc <pagebuf_unpin+0x20>
+	}
+    11c0:	c3                   	ret    
+	PB_TRACE(pb, PB_TRACE_REC(unpin), PBP(pb)->pb_pin_count.counter);
+}
+    11c1:	8d 76 00             	lea    0x0(%esi),%esi
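+
+/*
+ * Sketch of the pin protocol (the update helper is hypothetical):
+ * bracket an in-core modification so writeback cannot force the
+ * affected pages to disk mid-change.  Pins nest across buffers and
+ * callers; the last unpin wakes the pb_waiters queue above.
+ */
+static void example_pinned_update(page_buf_t *pb)
+{
+	pagebuf_pin(pb);		/* may nest with other pinners */
+	apply_in_core_change(pb);	/* hypothetical modification */
+	pagebuf_unpin(pb);		/* count 0 -> wake_up_all() */
+}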
+
+00000000000011c4 <pagebuf_ispin>:
+
+int
+pagebuf_ispin(
+	page_buf_t		*pb)
+{
+    11c4:	8b 44 24 04          	mov    0x4(%esp,1),%eax
+	return atomic_read(&PBP(pb)->pb_pin_count);
+    11c8:	8b 80 b8 00 00 00    	mov    0xb8(%eax),%eax
+    11ce:	c3                   	ret    
+    11cf:	90                   	nop    
+
+00000000000011d0 <pagebuf_queue_task>:
+}
+
+/*
+ *	pagebuf_wait_unpin
+ *
+ *	pagebuf_wait_unpin waits until all of the memory associated
+ *	with the buffer is no longer locked in memory.	 It returns
+ *	immediately if none of the affected pages are locked.
+ */
+static inline void
+_pagebuf_wait_unpin(
+	page_buf_t		*pb)
+{
+	DECLARE_WAITQUEUE	(wait, current);
+
+	if (atomic_read(&PBP(pb)->pb_pin_count) == 0)
+		return;
+
+	add_wait_queue(&PBP(pb)->pb_waiters, &wait);
+	for (;;) {
+		current->state = TASK_UNINTERRUPTIBLE;
+		if (atomic_read(&PBP(pb)->pb_pin_count) == 0) {
+			break;
+		}
+		run_task_queue(&tq_disk);
+		schedule();
+	}
+	remove_wait_queue(&PBP(pb)->pb_waiters, &wait);
+	current->state = TASK_RUNNING;
+}
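+
+/*
+ * Aside: except for the tq_disk kick, the loop above is the stock
+ * 2.4 sleep idiom, roughly equivalent to
+ *
+ *	wait_event(PBP(pb)->pb_waiters,
+ *		   atomic_read(&PBP(pb)->pb_pin_count) == 0);
+ *
+ * The explicit run_task_queue(&tq_disk) matters because the queued
+ * disk I/O is what eventually drops the pin count.
+ */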
+
+void
+pagebuf_queue_task(
+	struct tq_struct	*task)
+{
+    11d0:	55                   	push   %ebp
+    11d1:	57                   	push   %edi
+    11d2:	56                   	push   %esi
+    11d3:	53                   	push   %ebx
+    11d4:	8b 6c 24 14          	mov    0x14(%esp,1),%ebp
+
+static inline struct task_struct * get_current(void)
+{
+	struct task_struct *current;
+	__asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
+    11d8:	b8 00 e0 ff ff       	mov    $0xffffe000,%eax
+    11dd:	21 e0                	and    %esp,%eax
+ * Queue a task on a tq.  Return non-zero if it was successfully
+ * added.
+ */
+static inline int queue_task(struct tq_struct *bh_pointer, task_queue *bh_list)
+{
+    11df:	8b 58 30             	mov    0x30(%eax),%ebx
+    11e2:	c1 e3 03             	shl    $0x3,%ebx
+    11e5:	81 c3 40 00 00 00    	add    $0x40,%ebx
+	int ret = 0;
+    11eb:	31 c0                	xor    %eax,%eax
+static __inline__ int test_and_set_bit(int nr, volatile void * addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__( LOCK_PREFIX
+    11ed:	f0 0f ab 45 08       	lock bts %eax,0x8(%ebp)
+    11f2:	19 c0                	sbb    %eax,%eax
+ */
+static inline int queue_task(struct tq_struct *bh_pointer, task_queue *bh_list)
+{
+	int ret = 0;
+	if (!test_and_set_bit(0,&bh_pointer->sync)) {
+    11f4:	85 c0                	test   %eax,%eax
+    11f6:	75 74                	jne    126c <pagebuf_queue_task+0x9c>
+		unsigned long flags;
+		spin_lock_irqsave(&tqueue_lock, flags);
+    11f8:	9c                   	pushf  
+    11f9:	5f                   	pop    %edi
+    11fa:	fa                   	cli    
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+    11fb:	be 00 00 00 00       	mov    $0x0,%esi
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+    1200:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+    1207:	4e ad de 
+    120a:	74 1a                	je     1226 <pagebuf_queue_task+0x56>
+printk("eip: %p\n", &&here);
+    120c:	68 00 12 00 00       	push   $0x1200
+    1211:	68 2b 00 00 00       	push   $0x2b
+    1216:	e8 fc ff ff ff       	call   1217 <pagebuf_queue_task+0x47>
+		BUG();
+    121b:	0f 0b                	ud2a   
+    121d:	85 00                	test   %eax,(%eax)
+    121f:	00 00                	add    %al,(%eax)
+    1221:	00 00                	add    %al,(%eax)
+	}
+    1223:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+    1226:	f0 fe 0e             	lock decb (%esi)
+    1229:	0f 88 0a 1c 00 00    	js     2e39 <Letext+0x70>
+ */
+static __inline__ void __list_add(struct list_head * new,
+	struct list_head * prev,
+	struct list_head * next)
+{
+    122f:	8b 43 04             	mov    0x4(%ebx),%eax
+	next->prev = new;
+    1232:	89 6b 04             	mov    %ebp,0x4(%ebx)
+	new->next = next;
+    1235:	89 5d 00             	mov    %ebx,0x0(%ebp)
+	new->prev = prev;
+    1238:	89 45 04             	mov    %eax,0x4(%ebp)
+	prev->next = new;
+    123b:	89 28                	mov    %ebp,(%eax)
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+    123d:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+    123f:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+    1246:	4e ad de 
+    1249:	74 08                	je     1253 <pagebuf_queue_task+0x83>
+		BUG();
+    124b:	0f 0b                	ud2a   
+    124d:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+    1253:	a0 00 00 00 00       	mov    0x0,%al
+    1258:	84 c0                	test   %al,%al
+    125a:	7e 08                	jle    1264 <pagebuf_queue_task+0x94>
+		BUG();
+    125c:	0f 0b                	ud2a   
+    125e:	6b 00 00             	imul   $0x0,(%eax),%eax
+    1261:	00 00                	add    %al,(%eax)
+    1263:	00 86 15 00 00 00    	add    %al,0x15(%esi)
+#endif
+	__asm__ __volatile__(
+    1269:	00 57 9d             	add    %dl,0xffffff9d(%edi)
+
+static inline struct task_struct * get_current(void)
+{
+	struct task_struct *current;
+	__asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
+    126c:	b8 00 e0 ff ff       	mov    $0xffffe000,%eax
+    1271:	21 e0                	and    %esp,%eax
+	queue_task(task, &pagebuf_iodone_tq[smp_processor_id()]);
+	wake_up(&pagebuf_iodone_wait[smp_processor_id()]);
+    1273:	8b 40 30             	mov    0x30(%eax),%eax
+    1276:	c1 e0 04             	shl    $0x4,%eax
+    1279:	05 40 01 00 00       	add    $0x140,%eax
+    127e:	b9 01 00 00 00       	mov    $0x1,%ecx
+    1283:	ba 03 00 00 00       	mov    $0x3,%edx
+    1288:	e8 fc ff ff ff       	call   1289 <pagebuf_queue_task+0xb9>
+    128d:	5b                   	pop    %ebx
+    128e:	5e                   	pop    %esi
+    128f:	5f                   	pop    %edi
+    1290:	5d                   	pop    %ebp
+    1291:	c3                   	ret    
+}
+    1292:	89 f6                	mov    %esi,%esi
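+
+/*
+ * Sketch: deferring work to the per-CPU pagebuf daemon through the
+ * interface above (the deferred routine is hypothetical; INIT_TQUEUE
+ * is the stock 2.4 <linux/tqueue.h> initializer).
+ */
+static void example_deferred(void *data)
+{
+	page_buf_t	*pb = data;
+
+	/* runs later, in pagebuf daemon context */
+	pagebuf_rele(pb);
+}
+
+static void example_defer(page_buf_t *pb)
+{
+	pagebuf_hold(pb);		/* keep pb alive until it runs */
+	INIT_TQUEUE(&pb->pb_iodone_sched, example_deferred, pb);
+	pagebuf_queue_task(&pb->pb_iodone_sched);
+}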
+
+0000000000001294 <pagebuf_iodone_sched>:
+
+
+/*
+ *	Buffer Utility Routines
+ */
+
+/*
+ *	pagebuf_iodone
+ *
+ *	pagebuf_iodone marks a buffer for which I/O is in progress
+ *	done with respect to that I/O.	The pb_iodone routine, if
+ *	present, will be called as a side-effect.
+ */
+void
+pagebuf_iodone_sched(
+	void			*v)
+{
+    1294:	53                   	push   %ebx
+    1295:	8b 5c 24 08          	mov    0x8(%esp,1),%ebx
+	page_buf_t		*pb = (page_buf_t *)v;
+
+	if (pb->pb_iodone) {
+    1299:	8b 43 4c             	mov    0x4c(%ebx),%eax
+    129c:	85 c0                	test   %eax,%eax
+    129e:	74 05                	je     12a5 <pagebuf_iodone_sched+0x11>
+		(*(pb->pb_iodone)) (pb);
+    12a0:	53                   	push   %ebx
+    12a1:	ff d0                	call   *%eax
+		return;
+    12a3:	eb 23                	jmp    12c8 <pagebuf_iodone_sched+0x34>
+	}
+
+	if (pb->pb_flags & PBF_ASYNC) {
+    12a5:	8b 43 08             	mov    0x8(%ebx),%eax
+    12a8:	a8 10                	test   $0x10,%al
+    12aa:	74 1f                	je     12cb <pagebuf_iodone_sched+0x37>
+		if ((pb->pb_flags & _PBF_LOCKABLE) && !pb->pb_relse)
+    12ac:	a9 00 00 08 00       	test   $0x80000,%eax
+    12b1:	74 0f                	je     12c2 <pagebuf_iodone_sched+0x2e>
+    12b3:	83 7b 50 00          	cmpl   $0x0,0x50(%ebx)
+    12b7:	75 09                	jne    12c2 <pagebuf_iodone_sched+0x2e>
+			pagebuf_unlock(pb);
+    12b9:	53                   	push   %ebx
+    12ba:	e8 fc ff ff ff       	call   12bb <pagebuf_iodone_sched+0x27>
+    12bf:	83 c4 04             	add    $0x4,%esp
+		pagebuf_rele(pb);
+    12c2:	53                   	push   %ebx
+    12c3:	e8 fc ff ff ff       	call   12c4 <pagebuf_iodone_sched+0x30>
+	}
+    12c8:	83 c4 04             	add    $0x4,%esp
+}
+    12cb:	5b                   	pop    %ebx
+    12cc:	c3                   	ret    
+    12cd:	8d 76 00             	lea    0x0(%esi),%esi
+
+00000000000012d0 <pagebuf_iodone>:
+
+void
+pagebuf_iodone(
+	page_buf_t		*pb)
+{
+    12d0:	55                   	push   %ebp
+    12d1:	57                   	push   %edi
+    12d2:	56                   	push   %esi
+    12d3:	53                   	push   %ebx
+    12d4:	8b 7c 24 14          	mov    0x14(%esp,1),%edi
+	pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
+    12d8:	8b 47 08             	mov    0x8(%edi),%eax
+    12db:	89 c2                	mov    %eax,%edx
+    12dd:	83 e2 fc             	and    $0xfffffffc,%edx
+    12e0:	89 57 08             	mov    %edx,0x8(%edi)
+	if (pb->pb_error == 0) {
+    12e3:	66 83 7f 7c 00       	cmpw   $0x0,0x7c(%edi)
+    12e8:	75 05                	jne    12ef <pagebuf_iodone+0x1f>
+		pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
+    12ea:	24 d4                	and    $0xd4,%al
+    12ec:	89 47 08             	mov    %eax,0x8(%edi)
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(done), pb->pb_iodone);
+
+	if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
+    12ef:	83 7f 4c 00          	cmpl   $0x0,0x4c(%edi)
+    12f3:	75 0b                	jne    1300 <pagebuf_iodone+0x30>
+    12f5:	f6 47 08 10          	testb  $0x10,0x8(%edi)
+    12f9:	0f 84 d3 00 00 00    	je     13d2 <pagebuf_iodone+0x102>
+    12ff:	90                   	nop    
+		INIT_TQUEUE(&pb->pb_iodone_sched,
+    1300:	8d 47 38             	lea    0x38(%edi),%eax
+    1303:	89 47 38             	mov    %eax,0x38(%edi)
+    1306:	89 47 3c             	mov    %eax,0x3c(%edi)
+    1309:	c7 47 40 00 00 00 00 	movl   $0x0,0x40(%edi)
+    1310:	c7 47 44 00 00 00 00 	movl   $0x0,0x44(%edi)
+			pagebuf_iodone_sched, (void *)pb);
+    1317:	89 7f 48             	mov    %edi,0x48(%edi)
+
+static inline struct task_struct * get_current(void)
+{
+	struct task_struct *current;
+	__asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
+    131a:	ba 00 e0 ff ff       	mov    $0xffffe000,%edx
+    131f:	21 e2                	and    %esp,%edx
+ * Queue a task on a tq.  Return non-zero if it was successfully
+ * added.
+ */
+static inline int queue_task(struct tq_struct *bh_pointer, task_queue *bh_list)
+{
+    1321:	8b 5a 30             	mov    0x30(%edx),%ebx
+    1324:	89 c6                	mov    %eax,%esi
+    1326:	c1 e3 03             	shl    $0x3,%ebx
+    1329:	81 c3 40 00 00 00    	add    $0x40,%ebx
+	int ret = 0;
+    132f:	31 c0                	xor    %eax,%eax
+static __inline__ int test_and_set_bit(int nr, volatile void * addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__( LOCK_PREFIX
+    1331:	f0 0f ab 47 40       	lock bts %eax,0x40(%edi)
+    1336:	19 c0                	sbb    %eax,%eax
+ */
+static inline int queue_task(struct tq_struct *bh_pointer, task_queue *bh_list)
+{
+	int ret = 0;
+	if (!test_and_set_bit(0,&bh_pointer->sync)) {
+    1338:	85 c0                	test   %eax,%eax
+    133a:	75 73                	jne    13af <pagebuf_iodone+0xdf>
+		unsigned long flags;
+		spin_lock_irqsave(&tqueue_lock, flags);
+    133c:	9c                   	pushf  
+    133d:	5d                   	pop    %ebp
+    133e:	fa                   	cli    
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+    133f:	bf 00 00 00 00       	mov    $0x0,%edi
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+    1344:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+    134b:	4e ad de 
+    134e:	74 1a                	je     136a <pagebuf_iodone+0x9a>
+printk("eip: %p\n", &&here);
+    1350:	68 44 13 00 00       	push   $0x1344
+    1355:	68 2b 00 00 00       	push   $0x2b
+    135a:	e8 fc ff ff ff       	call   135b <pagebuf_iodone+0x8b>
+		BUG();
+    135f:	0f 0b                	ud2a   
+    1361:	85 00                	test   %eax,(%eax)
+    1363:	00 00                	add    %al,(%eax)
+    1365:	00 00                	add    %al,(%eax)
+	}
+    1367:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+    136a:	f0 fe 0f             	lock decb (%edi)
+    136d:	0f 88 d2 1a 00 00    	js     2e45 <Letext+0x7c>
+ */
+static __inline__ void __list_add(struct list_head * new,
+	struct list_head * prev,
+	struct list_head * next)
+{
+    1373:	8b 43 04             	mov    0x4(%ebx),%eax
+	next->prev = new;
+    1376:	89 73 04             	mov    %esi,0x4(%ebx)
+	new->next = next;
+    1379:	89 1e                	mov    %ebx,(%esi)
+	new->prev = prev;
+    137b:	89 46 04             	mov    %eax,0x4(%esi)
+	prev->next = new;
+    137e:	89 30                	mov    %esi,(%eax)
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+    1380:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+    1382:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+    1389:	4e ad de 
+    138c:	74 08                	je     1396 <pagebuf_iodone+0xc6>
+		BUG();
+    138e:	0f 0b                	ud2a   
+    1390:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+    1396:	a0 00 00 00 00       	mov    0x0,%al
+    139b:	84 c0                	test   %al,%al
+    139d:	7e 08                	jle    13a7 <pagebuf_iodone+0xd7>
+		BUG();
+    139f:	0f 0b                	ud2a   
+    13a1:	6b 00 00             	imul   $0x0,(%eax),%eax
+    13a4:	00 00                	add    %al,(%eax)
+    13a6:	00 86 15 00 00 00    	add    %al,0x15(%esi)
+#endif
+	__asm__ __volatile__(
+    13ac:	00 55 9d             	add    %dl,0xffffff9d(%ebp)
+
+static inline struct task_struct * get_current(void)
+{
+	struct task_struct *current;
+	__asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
+    13af:	b8 00 e0 ff ff       	mov    $0xffffe000,%eax
+    13b4:	21 e0                	and    %esp,%eax
+
+		queue_task(&pb->pb_iodone_sched,
+				&pagebuf_iodone_tq[smp_processor_id()]);
+		wake_up(&pagebuf_iodone_wait[smp_processor_id()]);
+    13b6:	8b 40 30             	mov    0x30(%eax),%eax
+    13b9:	c1 e0 04             	shl    $0x4,%eax
+    13bc:	05 40 01 00 00       	add    $0x140,%eax
+    13c1:	b9 01 00 00 00       	mov    $0x1,%ecx
+    13c6:	ba 03 00 00 00       	mov    $0x3,%edx
+    13cb:	e8 fc ff ff ff       	call   13cc <pagebuf_iodone+0xfc>
+	} else {
+    13d0:	eb 0f                	jmp    13e1 <pagebuf_iodone+0x111>
+ * The default case (no contention) will result in NO
+ * jumps for both down() and up().
+ */
+static inline void up(struct semaphore * sem)
+{
+    13d2:	8d 47 58             	lea    0x58(%edi),%eax
+#if WAITQUEUE_DEBUG
+	CHECK_MAGIC(sem->__magic);
+#endif
+	__asm__ __volatile__(
+    13d5:	89 c1                	mov    %eax,%ecx
+    13d7:	f0 ff 47 58          	lock incl 0x58(%edi)
+    13db:	0f 8e 70 1a 00 00    	jle    2e51 <Letext+0x88>
+		up(&pb->pb_iodonesema);
+	}
+    13e1:	5b                   	pop    %ebx
+    13e2:	5e                   	pop    %esi
+    13e3:	5f                   	pop    %edi
+    13e4:	5d                   	pop    %ebp
+    13e5:	c3                   	ret    
+}
+    13e6:	89 f6                	mov    %esi,%esi
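+
+/*
+ * Sketch: an asynchronous write whose completion runs a callback
+ * (the callback body is hypothetical).  As pagebuf_iodone() shows
+ * above, pb_iodone is scheduled through the task queue, while
+ * synchronous waiters are released via pb_iodonesema instead.
+ */
+static void example_write_done(page_buf_t *pb)
+{
+	if (pb->pb_error)
+		printk("pagebuf write failed: %d\n", pb->pb_error);
+	pagebuf_rele(pb);
+}
+
+static int example_async_write(page_buf_t *pb)
+{
+	pb->pb_iodone = example_write_done;
+	return pagebuf_iostart(pb, PBF_WRITE | PBF_ASYNC);
+}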
+
+00000000000013e8 <pagebuf_ioerror>:
+
+/*
+ *	pagebuf_ioerror
+ *
+ *	pagebuf_ioerror sets the error code for a buffer.
+ */
+void
+pagebuf_ioerror(			/* mark/clear buffer error flag */
+	page_buf_t		*pb,	/* buffer to mark		*/
+	unsigned int		error)	/* error to store (0 if none)	*/
+{
+    13e8:	8b 54 24 04          	mov    0x4(%esp,1),%edx
+    13ec:	8b 44 24 08          	mov    0x8(%esp,1),%eax
+	pb->pb_error = error;
+    13f0:	66 89 42 7c          	mov    %ax,0x7c(%edx)
+    13f4:	c3                   	ret    
+	PB_TRACE(pb, PB_TRACE_REC(ioerror), error);
+    13f5:	8d 76 00             	lea    0x0(%esi),%esi
+
+00000000000013f8 <pagebuf_iostart>:
+}
+
+/*
+ *	pagebuf_iostart
+ *
+ *	pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
+ *	If necessary, it will arrange for any disk space allocation required,
+ *	and it will break up the request if the block mappings require it.
+ *	A pb_iodone routine in the buffer supplied will only be called
+ *	when all of the subsidiary I/O requests, if any, have been completed.
+ *	pagebuf_iostart calls the pagebuf_ioinitiate routine, or
+ *	pagebuf_iorequest if the former is not defined, to start
+ *	the I/O on a given low-level request.
+ */
+int
+pagebuf_iostart(			/* start I/O on a buffer	  */
+	page_buf_t		*pb,	/* buffer to start		  */
+	page_buf_flags_t	flags)	/* PBF_LOCK, PBF_ASYNC, PBF_READ, */
+					/* PBF_WRITE, PBF_ALLOCATE,	  */
+					/* PBF_DELWRI, 			  */
+					/* PBF_SYNC, PBF_DONT_BLOCK	  */
+					/* PBF_RELEASE			  */
+{
+    13f8:	56                   	push   %esi
+    13f9:	53                   	push   %ebx
+    13fa:	8b 74 24 0c          	mov    0xc(%esp,1),%esi
+    13fe:	8b 5c 24 10          	mov    0x10(%esp,1),%ebx
+	int			status = 0;
+
+	PB_TRACE(pb, PB_TRACE_REC(iostart), flags);
+
+	if (flags & PBF_DELWRI) {
+    1402:	f6 c3 40             	test   $0x40,%bl
+    1405:	74 1f                	je     1426 <pagebuf_iostart+0x2e>
+		pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
+    1407:	8b 46 08             	mov    0x8(%esi),%eax
+    140a:	24 ec                	and    $0xec,%al
+		pb->pb_flags |= flags &
+    140c:	81 e3 50 01 00 00    	and    $0x150,%ebx
+    1412:	09 d8                	or     %ebx,%eax
+    1414:	89 46 08             	mov    %eax,0x8(%esi)
+				(PBF_DELWRI | PBF_ASYNC | PBF_SYNC);
+		pagebuf_delwri_queue(pb, 1);
+    1417:	6a 01                	push   $0x1
+    1419:	56                   	push   %esi
+    141a:	e8 fc ff ff ff       	call   141b <pagebuf_iostart+0x23>
+		return status;
+    141f:	31 c0                	xor    %eax,%eax
+    1421:	83 c4 08             	add    $0x8,%esp
+    1424:	eb 55                	jmp    147b <pagebuf_iostart+0x83>
+	}
+
+	pb->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI|PBF_READ_AHEAD);
+    1426:	8b 56 08             	mov    0x8(%esi),%edx
+    1429:	81 e2 ac ff ff df    	and    $0xdfffffac,%edx
+	pb->pb_flags |= flags & (PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_READ_AHEAD);
+    142f:	89 d8                	mov    %ebx,%eax
+    1431:	25 13 01 00 20       	and    $0x20000113,%eax
+    1436:	09 c2                	or     %eax,%edx
+    1438:	89 56 08             	mov    %edx,0x8(%esi)
+
+	if (pb->pb_bn == PAGE_BUF_DADDR_NULL) {
+    143b:	83 7e 1c ff          	cmpl   $0xffffffff,0x1c(%esi)
+    143f:	75 0e                	jne    144f <pagebuf_iostart+0x57>
+    1441:	83 7e 20 ff          	cmpl   $0xffffffff,0x20(%esi)
+    1445:	75 08                	jne    144f <pagebuf_iostart+0x57>
+		BUG();
+    1447:	0f 0b                	ud2a   
+    1449:	63 05 34 00 00 00    	arpl   %ax,0x34
+	}
+
+	/* For writes call internal function which checks for
+	 * filesystem specific callout function and execute it.
+	 */
+	if (flags & PBF_WRITE) {
+    144f:	f6 c3 02             	test   $0x2,%bl
+    1452:	74 0c                	je     1460 <pagebuf_iostart+0x68>
+extern void pagebuf_terminate(void);
+
+static __inline__ int __pagebuf_iorequest(page_buf_t *pb)
+{
+	if (pb->pb_strat)
+    1454:	8b 46 54             	mov    0x54(%esi),%eax
+    1457:	85 c0                	test   %eax,%eax
+    1459:	74 05                	je     1460 <pagebuf_iostart+0x68>
+		return pb->pb_strat(pb);
+    145b:	56                   	push   %esi
+    145c:	ff d0                	call   *%eax
+    145e:	eb 06                	jmp    1466 <pagebuf_iostart+0x6e>
+		status = __pagebuf_iorequest(pb);
+	} else {
+		status = pagebuf_iorequest(pb);
+    1460:	56                   	push   %esi
+    1461:	e8 fc ff ff ff       	call   1462 <pagebuf_iostart+0x6a>
+	}
+    1466:	83 c4 04             	add    $0x4,%esp
+
+	/* Wait for I/O if we are not an async request */
+	if ((status == 0) && (flags & PBF_ASYNC) == 0) {
+    1469:	85 c0                	test   %eax,%eax
+    146b:	75 0e                	jne    147b <pagebuf_iostart+0x83>
+    146d:	f6 c3 10             	test   $0x10,%bl
+    1470:	75 09                	jne    147b <pagebuf_iostart+0x83>
+		status = pagebuf_iowait(pb);
+    1472:	56                   	push   %esi
+    1473:	e8 fc ff ff ff       	call   1474 <pagebuf_iostart+0x7c>
+	}
+    1478:	83 c4 04             	add    $0x4,%esp
+
+	return status;
+}
+    147b:	5b                   	pop    %ebx
+    147c:	5e                   	pop    %esi
+    147d:	c3                   	ret    
+    147e:	89 f6                	mov    %esi,%esi
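+
+/*
+ * Sketch: a synchronous read through the flag handling above.  With
+ * PBF_ASYNC clear, pagebuf_iostart() waits in pagebuf_iowait() and
+ * returns the final I/O status.
+ */
+static int example_sync_read(page_buf_t *pb)
+{
+	int		status = pagebuf_iostart(pb, PBF_READ);
+
+	if (status != 0)
+		printk("pagebuf read error %d\n", status);
+	return status;
+}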
+
+0000000000001480 <_end_pagebuf_page_io>:
+
+
+/*
+ * Helper routines for pagebuf_iorequest (pagebuf I/O completion)
+ * 
+ * (different routines for locked/unlocked, and single/multi-bh pagebufs)
+ */
+
+STATIC inline void
+_pb_io_done(
+	page_buf_t		*pb)
+{
+	if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) == 1) {
+		pb->pb_locked = 0;
+		pagebuf_iodone(pb);
+	}
+}
+
+STATIC void
+_end_pagebuf_page_io(
+	struct buffer_head	*bh,
+	int			uptodate,
+	int			locked)
+{
+    1480:	83 ec 04             	sub    $0x4,%esp
+    1483:	55                   	push   %ebp
+    1484:	57                   	push   %edi
+    1485:	56                   	push   %esi
+    1486:	53                   	push   %ebx
+    1487:	8b 5c 24 18          	mov    0x18(%esp,1),%ebx
+    148b:	8b 44 24 1c          	mov    0x1c(%esp,1),%eax
+	struct page		*page;
+	page_buf_t		*pb = (page_buf_t *) bh->b_private;
+    148f:	8b 6b 40             	mov    0x40(%ebx),%ebp
+ * This is called by bh->b_end_io() handlers when I/O has completed.
+ */
+static inline void mark_buffer_uptodate(struct buffer_head * bh, int on)
+{
+	if (on)
+    1492:	85 c0                	test   %eax,%eax
+    1494:	74 0a                	je     14a0 <_end_pagebuf_page_io+0x20>
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+    1496:	31 c0                	xor    %eax,%eax
+	__asm__ __volatile__( LOCK_PREFIX
+    1498:	f0 0f ab 43 18       	lock bts %eax,0x18(%ebx)
+ */
+static inline void mark_buffer_uptodate(struct buffer_head * bh, int on)
+{
+	if (on)
+		set_bit(BH_Uptodate, &bh->b_state);
+    149d:	eb 06                	jmp    14a5 <_end_pagebuf_page_io+0x25>
+    149f:	90                   	nop    
+ * in order to ensure changes are visible on other processors.
+ */
+static __inline__ void clear_bit(int nr, volatile void * addr)
+{
+	__asm__ __volatile__( LOCK_PREFIX
+    14a0:	f0 0f b3 43 18       	lock btr %eax,0x18(%ebx)
+ * useful range of an atomic_t is only 24 bits.
+ */ 
+static __inline__ void atomic_dec(atomic_t *v)
+{
+	__asm__ __volatile__(
+    14a5:	f0 ff 4b 10          	lock decl 0x10(%ebx)
+
+	mark_buffer_uptodate(bh, uptodate);
+	atomic_dec(&bh->b_count);
+
+	page = bh->b_page;
+    14a9:	8b 43 38             	mov    0x38(%ebx),%eax
+    14ac:	89 44 24 10          	mov    %eax,0x10(%esp,1)
+#endif
+
+static __inline__ int constant_test_bit(int nr, const volatile void * addr)
+{
+	return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
+    14b0:	8b 43 18             	mov    0x18(%ebx),%eax
+	if (!test_bit(BH_Uptodate, &bh->b_state)) {
+    14b3:	a8 01                	test   $0x1,%al
+    14b5:	75 14                	jne    14cb <_end_pagebuf_page_io+0x4b>
+ * restricted to acting on a single-word quantity.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+	__asm__ __volatile__( LOCK_PREFIX
+    14b7:	8b 54 24 10          	mov    0x10(%esp,1),%edx
+    14bb:	b8 01 00 00 00       	mov    $0x1,%eax
+    14c0:	f0 0f ab 42 18       	lock bts %eax,0x18(%edx)
+		set_bit(PG_error, &page->flags);
+		pb->pb_error = EIO;
+    14c5:	66 c7 45 7c 05 00    	movw   $0x5,0x7c(%ebp)
+	}
+
+	unlock_buffer(bh);
+    14cb:	89 d8                	mov    %ebx,%eax
+    14cd:	e8 fc ff ff ff       	call   14ce <_end_pagebuf_page_io+0x4e>
+    14d2:	83 3d 00 00 00 00 40 	cmpl   $0x40,0x0
+    14d9:	75 15                	jne    14f0 <_end_pagebuf_page_io+0x70>
+    14db:	a1 00 00 00 00       	mov    0x0,%eax
+    14e0:	53                   	push   %ebx
+    14e1:	50                   	push   %eax
+    14e2:	e8 fc ff ff ff       	call   14e3 <_end_pagebuf_page_io+0x63>
+    14e7:	83 c4 08             	add    $0x8,%esp
+    14ea:	e9 a1 00 00 00       	jmp    1590 <_end_pagebuf_page_io+0x110>
+    14ef:	90                   	nop    
+    14f0:	9c                   	pushf  
+    14f1:	5f                   	pop    %edi
+    14f2:	fa                   	cli    
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+    14f3:	be 00 00 00 00       	mov    $0x0,%esi
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+    14f8:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+    14ff:	4e ad de 
+    1502:	74 1a                	je     151e <_end_pagebuf_page_io+0x9e>
+printk("eip: %p\n", &&here);
+    1504:	68 f8 14 00 00       	push   $0x14f8
+    1509:	68 2b 00 00 00       	push   $0x2b
+    150e:	e8 fc ff ff ff       	call   150f <_end_pagebuf_page_io+0x8f>
+		BUG();
+    1513:	0f 0b                	ud2a   
+    1515:	85 00                	test   %eax,(%eax)
+    1517:	00 00                	add    %al,(%eax)
+    1519:	00 00                	add    %al,(%eax)
+	}
+    151b:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+    151e:	f0 fe 0e             	lock decb (%esi)
+    1521:	0f 88 34 19 00 00    	js     2e5b <Letext+0x92>
+    1527:	c7 43 30 00 00 00 00 	movl   $0x0,0x30(%ebx)
+    152e:	a1 00 00 00 00       	mov    0x0,%eax
+    1533:	89 03                	mov    %eax,(%ebx)
+    1535:	89 1d 00 00 00 00    	mov    %ebx,0x0
+    153b:	ff 05 00 00 00 00    	incl   0x0
+    1541:	81 3d 48 03 00 00 48 	cmpl   $0x348,0x348
+    1548:	03 00 00 
+    154b:	74 14                	je     1561 <_end_pagebuf_page_io+0xe1>
+    154d:	b9 01 00 00 00       	mov    $0x1,%ecx
+    1552:	ba 03 00 00 00       	mov    $0x3,%edx
+    1557:	b8 40 03 00 00       	mov    $0x340,%eax
+    155c:	e8 fc ff ff ff       	call   155d <_end_pagebuf_page_io+0xdd>
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+    1561:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+    1563:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+    156a:	4e ad de 
+    156d:	74 08                	je     1577 <_end_pagebuf_page_io+0xf7>
+		BUG();
+    156f:	0f 0b                	ud2a   
+    1571:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+    1577:	a0 00 00 00 00       	mov    0x0,%al
+    157c:	84 c0                	test   %al,%al
+    157e:	7e 08                	jle    1588 <_end_pagebuf_page_io+0x108>
+		BUG();
+    1580:	0f 0b                	ud2a   
+    1582:	6b 00 00             	imul   $0x0,(%eax),%eax
+    1585:	00 00                	add    %al,(%eax)
+    1587:	00 86 15 00 00 00    	add    %al,0x15(%esi)
+#endif
+	__asm__ __volatile__(
+    158d:	00 57 9d             	add    %dl,0xffffff9d(%edi)
+ * restricted to acting on a single-word quantity.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+	__asm__ __volatile__( LOCK_PREFIX
+    1590:	8b 54 24 10          	mov    0x10(%esp,1),%edx
+    1594:	b8 03 00 00 00       	mov    $0x3,%eax
+    1599:	f0 0f ab 42 18       	lock bts %eax,0x18(%edx)
+	_pagebuf_free_bh(bh);
+
+	SetPageUptodate(page);
+	if (locked)
+    159e:	83 7c 24 20 00       	cmpl   $0x0,0x20(%esp,1)
+    15a3:	74 09                	je     15ae <_end_pagebuf_page_io+0x12e>
+		unlock_page(page);
+    15a5:	8b 44 24 10          	mov    0x10(%esp,1),%eax
+    15a9:	e8 fc ff ff ff       	call   15aa <_end_pagebuf_page_io+0x12a>
+static __inline__ int atomic_dec_and_test(atomic_t *v)
+{
+	unsigned char c;
+
+	__asm__ __volatile__(
+    15ae:	f0 ff 8d b4 00 00 00 	lock decl 0xb4(%ebp)
+    15b5:	0f 94 c0             	sete   %al
+    15b8:	84 c0                	test   %al,%al
+    15ba:	74 10                	je     15cc <_end_pagebuf_page_io+0x14c>
+    15bc:	c6 85 82 00 00 00 00 	movb   $0x0,0x82(%ebp)
+    15c3:	55                   	push   %ebp
+    15c4:	e8 fc ff ff ff       	call   15c5 <_end_pagebuf_page_io+0x145>
+    15c9:	83 c4 04             	add    $0x4,%esp
+    15cc:	5b                   	pop    %ebx
+    15cd:	5e                   	pop    %esi
+    15ce:	5f                   	pop    %edi
+    15cf:	5d                   	pop    %ebp
+    15d0:	59                   	pop    %ecx
+    15d1:	c3                   	ret    
+	_pb_io_done(pb);
+}
+    15d2:	89 f6                	mov    %esi,%esi
+
+00000000000015d4 <_end_io_locked>:
+
+STATIC void
+_end_io_locked(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+    15d4:	8b 54 24 04          	mov    0x4(%esp,1),%edx
+    15d8:	8b 44 24 08          	mov    0x8(%esp,1),%eax
+	_end_pagebuf_page_io(bh, uptodate, 1);
+    15dc:	6a 01                	push   $0x1
+    15de:	50                   	push   %eax
+    15df:	52                   	push   %edx
+    15e0:	e8 9b fe ff ff       	call   1480 <_end_pagebuf_page_io>
+}
+    15e5:	83 c4 0c             	add    $0xc,%esp
+    15e8:	c3                   	ret    
+    15e9:	8d 76 00             	lea    0x0(%esi),%esi
+
+00000000000015ec <_end_io_nolock>:
+
+STATIC void
+_end_io_nolock(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+    15ec:	8b 54 24 04          	mov    0x4(%esp,1),%edx
+    15f0:	8b 44 24 08          	mov    0x8(%esp,1),%eax
+	_end_pagebuf_page_io(bh, uptodate, 0);
+    15f4:	6a 00                	push   $0x0
+    15f6:	50                   	push   %eax
+    15f7:	52                   	push   %edx
+    15f8:	e8 83 fe ff ff       	call   1480 <_end_pagebuf_page_io>
+}
+    15fd:	83 c4 0c             	add    $0xc,%esp
+    1600:	c3                   	ret    
+    1601:	8d 76 00             	lea    0x0(%esi),%esi
+
+0000000000001604 <_end_pagebuf_page_io_multi>:
+
+typedef struct {
+	page_buf_t	*pb;		/* pointer to pagebuf page is within */
+	int		locking;	/* are pages locked? */
+	atomic_t	remain;		/* count of remaining I/O requests */
+} pagesync_t;
+
+STATIC void
+_end_pagebuf_page_io_multi(
+	struct buffer_head	*bh,
+	int			uptodate,
+	int			fullpage)
+{
+    1604:	83 ec 08             	sub    $0x8,%esp
+    1607:	55                   	push   %ebp
+    1608:	57                   	push   %edi
+    1609:	56                   	push   %esi
+    160a:	53                   	push   %ebx
+    160b:	8b 5c 24 1c          	mov    0x1c(%esp,1),%ebx
+	pagesync_t		*psync = (pagesync_t *) bh->b_private;
+    160f:	8b 6b 40             	mov    0x40(%ebx),%ebp
+    1612:	8b 44 24 20          	mov    0x20(%esp,1),%eax
+	page_buf_t		*pb = psync->pb;
+    1616:	8b 55 00             	mov    0x0(%ebp),%edx
+    1619:	89 54 24 14          	mov    %edx,0x14(%esp,1)
+ * This is called by bh->b_end_io() handlers when I/O has completed.
+ */
+static inline void mark_buffer_uptodate(struct buffer_head * bh, int on)
+{
+	if (on)
+    161d:	85 c0                	test   %eax,%eax
+    161f:	74 0f                	je     1630 <_end_pagebuf_page_io_multi+0x2c>
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+    1621:	31 c0                	xor    %eax,%eax
+	__asm__ __volatile__( LOCK_PREFIX
+    1623:	f0 0f ab 43 18       	lock bts %eax,0x18(%ebx)
+ */
+static inline void mark_buffer_uptodate(struct buffer_head * bh, int on)
+{
+	if (on)
+		set_bit(BH_Uptodate, &bh->b_state);
+    1628:	eb 0b                	jmp    1635 <_end_pagebuf_page_io_multi+0x31>
+    162a:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
+ * in order to ensure changes are visible on other processors.
+ */
+static __inline__ void clear_bit(int nr, volatile void * addr)
+{
+	__asm__ __volatile__( LOCK_PREFIX
+    1630:	f0 0f b3 43 18       	lock btr %eax,0x18(%ebx)
+ * useful range of an atomic_t is only 24 bits.
+ */ 
+static __inline__ void atomic_dec(atomic_t *v)
+{
+	__asm__ __volatile__(
+    1635:	f0 ff 4b 10          	lock decl 0x10(%ebx)
+	struct page		*page;
+
+	mark_buffer_uptodate(bh, uptodate);
+	put_bh(bh);
+
+	page = bh->b_page;
+    1639:	8b 43 38             	mov    0x38(%ebx),%eax
+    163c:	89 44 24 10          	mov    %eax,0x10(%esp,1)
+#endif
+
+static __inline__ int constant_test_bit(int nr, const volatile void * addr)
+{
+	return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
+    1640:	8b 43 18             	mov    0x18(%ebx),%eax
+	if (!test_bit(BH_Uptodate, &bh->b_state)) {
+    1643:	a8 01                	test   $0x1,%al
+    1645:	75 18                	jne    165f <_end_pagebuf_page_io_multi+0x5b>
+ * restricted to acting on a single-word quantity.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+	__asm__ __volatile__( LOCK_PREFIX
+    1647:	8b 54 24 10          	mov    0x10(%esp,1),%edx
+    164b:	b8 01 00 00 00       	mov    $0x1,%eax
+    1650:	f0 0f ab 42 18       	lock bts %eax,0x18(%edx)
+		set_bit(PG_error, &page->flags);
+		pb->pb_error = EIO;
+    1655:	8b 44 24 14          	mov    0x14(%esp,1),%eax
+    1659:	66 c7 40 7c 05 00    	movw   $0x5,0x7c(%eax)
+	}
+
+	unlock_buffer(bh);
+    165f:	89 d8                	mov    %ebx,%eax
+    1661:	e8 fc ff ff ff       	call   1662 <_end_pagebuf_page_io_multi+0x5e>
+	if (fullpage)
+    1666:	83 7c 24 24 00       	cmpl   $0x0,0x24(%esp,1)
+    166b:	0f 84 bf 00 00 00    	je     1730 <_end_pagebuf_page_io_multi+0x12c>
+    1671:	83 3d 00 00 00 00 40 	cmpl   $0x40,0x0
+    1678:	75 16                	jne    1690 <_end_pagebuf_page_io_multi+0x8c>
+    167a:	a1 00 00 00 00       	mov    0x0,%eax
+    167f:	53                   	push   %ebx
+    1680:	50                   	push   %eax
+    1681:	e8 fc ff ff ff       	call   1682 <_end_pagebuf_page_io_multi+0x7e>
+    1686:	83 c4 08             	add    $0x8,%esp
+    1689:	e9 a2 00 00 00       	jmp    1730 <_end_pagebuf_page_io_multi+0x12c>
+    168e:	89 f6                	mov    %esi,%esi
+    1690:	9c                   	pushf  
+    1691:	5f                   	pop    %edi
+    1692:	fa                   	cli    
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+    1693:	be 00 00 00 00       	mov    $0x0,%esi
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+    1698:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+    169f:	4e ad de 
+    16a2:	74 1a                	je     16be <_end_pagebuf_page_io_multi+0xba>
+printk("eip: %p\n", &&here);
+    16a4:	68 98 16 00 00       	push   $0x1698
+    16a9:	68 2b 00 00 00       	push   $0x2b
+    16ae:	e8 fc ff ff ff       	call   16af <_end_pagebuf_page_io_multi+0xab>
+		BUG();
+    16b3:	0f 0b                	ud2a   
+    16b5:	85 00                	test   %eax,(%eax)
+    16b7:	00 00                	add    %al,(%eax)
+    16b9:	00 00                	add    %al,(%eax)
+	}
+    16bb:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+    16be:	f0 fe 0e             	lock decb (%esi)
+    16c1:	0f 88 a0 17 00 00    	js     2e67 <Letext+0x9e>
+    16c7:	c7 43 30 00 00 00 00 	movl   $0x0,0x30(%ebx)
+    16ce:	a1 00 00 00 00       	mov    0x0,%eax
+    16d3:	89 03                	mov    %eax,(%ebx)
+    16d5:	89 1d 00 00 00 00    	mov    %ebx,0x0
+    16db:	ff 05 00 00 00 00    	incl   0x0
+    16e1:	81 3d 48 03 00 00 48 	cmpl   $0x348,0x348
+    16e8:	03 00 00 
+    16eb:	74 14                	je     1701 <_end_pagebuf_page_io_multi+0xfd>
+    16ed:	b9 01 00 00 00       	mov    $0x1,%ecx
+    16f2:	ba 03 00 00 00       	mov    $0x3,%edx
+    16f7:	b8 40 03 00 00       	mov    $0x340,%eax
+    16fc:	e8 fc ff ff ff       	call   16fd <_end_pagebuf_page_io_multi+0xf9>
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+    1701:	b2 01                	mov    $0x1,%dl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+    1703:	81 3d 04 00 00 00 ad 	cmpl   $0xdead4ead,0x4
+    170a:	4e ad de 
+    170d:	74 08                	je     1717 <_end_pagebuf_page_io_multi+0x113>
+		BUG();
+    170f:	0f 0b                	ud2a   
+    1711:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+    1717:	a0 00 00 00 00       	mov    0x0,%al
+    171c:	84 c0                	test   %al,%al
+    171e:	7e 08                	jle    1728 <_end_pagebuf_page_io_multi+0x124>
+		BUG();
+    1720:	0f 0b                	ud2a   
+    1722:	6b 00 00             	imul   $0x0,(%eax),%eax
+    1725:	00 00                	add    %al,(%eax)
+    1727:	00 86 15 00 00 00    	add    %al,0x15(%esi)
+#endif
+	__asm__ __volatile__(
+    172d:	00 57 9d             	add    %dl,0xffffff9d(%edi)
+static __inline__ int atomic_dec_and_test(atomic_t *v)
+{
+	unsigned char c;
+
+	__asm__ __volatile__(
+    1730:	f0 ff 4d 08          	lock decl 0x8(%ebp)
+    1734:	0f 94 c0             	sete   %al
+		_pagebuf_free_bh(bh);
+
+	if (atomic_dec_and_test(&psync->remain) == 1) {
+    1737:	84 c0                	test   %al,%al
+    1739:	74 4f                	je     178a <_end_pagebuf_page_io_multi+0x186>
+		if (fullpage)
+    173b:	83 7c 24 24 00       	cmpl   $0x0,0x24(%esp,1)
+    1740:	74 0e                	je     1750 <_end_pagebuf_page_io_multi+0x14c>
+ * restricted to acting on a single-word quantity.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+	__asm__ __volatile__( LOCK_PREFIX
+    1742:	8b 54 24 10          	mov    0x10(%esp,1),%edx
+    1746:	b8 03 00 00 00       	mov    $0x3,%eax
+    174b:	f0 0f ab 42 18       	lock bts %eax,0x18(%edx)
+			SetPageUptodate(page);
+		if (psync->locking)
+    1750:	83 7d 04 00          	cmpl   $0x0,0x4(%ebp)
+    1754:	74 09                	je     175f <_end_pagebuf_page_io_multi+0x15b>
+			unlock_page(page);
+    1756:	8b 44 24 10          	mov    0x10(%esp,1),%eax
+    175a:	e8 fc ff ff ff       	call   175b <_end_pagebuf_page_io_multi+0x157>
+		kfree(psync);
+    175f:	55                   	push   %ebp
+    1760:	e8 fc ff ff ff       	call   1761 <_end_pagebuf_page_io_multi+0x15d>
+    1765:	83 c4 04             	add    $0x4,%esp
+static __inline__ int atomic_dec_and_test(atomic_t *v)
+{
+	unsigned char c;
+
+	__asm__ __volatile__(
+    1768:	8b 54 24 14          	mov    0x14(%esp,1),%edx
+    176c:	f0 ff 8a b4 00 00 00 	lock decl 0xb4(%edx)
+    1773:	0f 94 c0             	sete   %al
+    1776:	84 c0                	test   %al,%al
+    1778:	74 10                	je     178a <_end_pagebuf_page_io_multi+0x186>
+    177a:	c6 82 82 00 00 00 00 	movb   $0x0,0x82(%edx)
+    1781:	52                   	push   %edx
+    1782:	e8 fc ff ff ff       	call   1783 <_end_pagebuf_page_io_multi+0x17f>
+    1787:	83 c4 04             	add    $0x4,%esp
+		_pb_io_done(pb);
+	}
+    178a:	5b                   	pop    %ebx
+    178b:	5e                   	pop    %esi
+    178c:	5f                   	pop    %edi
+    178d:	5d                   	pop    %ebp
+    178e:	59                   	pop    %ecx
+    178f:	5a                   	pop    %edx
+    1790:	c3                   	ret    
+}
+    1791:	8d 76 00             	lea    0x0(%esi),%esi
+
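Stripped of the kernel plumbing, the completion path in _end_pagebuf_page_io_multi above is a shared-counter idiom: every sub-I/O completion drops psync->remain, and only the last one to finish performs the per-page bookkeeping (SetPageUptodate, unlock_page) and frees the tracker. A minimal userspace sketch of just that idiom, assuming C11 atomics and hypothetical names (struct pagesync, sub_io_done) rather than the kernel API:

	#include <stdatomic.h>
	#include <stdlib.h>
	#include <stdio.h>

	struct pagesync {		/* analogous to pagesync_t above */
		atomic_int remain;	/* sub-I/Os still outstanding */
	};

	/* Called once per completed sub-I/O; the last caller cleans up. */
	static void sub_io_done(struct pagesync *psync)
	{
		if (atomic_fetch_sub(&psync->remain, 1) == 1) {
			/* old value 1 means we just hit zero: we are last */
			printf("all sub-I/Os done, releasing tracker\n");
			free(psync);
		}
	}

	int main(void)
	{
		struct pagesync *psync = malloc(sizeof(*psync));

		atomic_init(&psync->remain, 3);	/* three sub-I/Os issued */
		for (int i = 0; i < 3; i++)
			sub_io_done(psync);	/* in the kernel these run
						 * from interrupt context,
						 * in any order */
		return 0;
	}

The _end_io_multi_full and _end_io_multi_part wrappers that follow merely bind the fullpage flag before calling into this path.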
+0000000000001794 <_end_io_multi_full>:
+
+STATIC void
+_end_io_multi_full(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+    1794:	8b 54 24 04          	mov    0x4(%esp,1),%edx
+    1798:	8b 44 24 08          	mov    0x8(%esp,1),%eax
+	_end_pagebuf_page_io_multi(bh, uptodate, 1);
+    179c:	6a 01                	push   $0x1
+    179e:	50                   	push   %eax
+    179f:	52                   	push   %edx
+    17a0:	e8 5f fe ff ff       	call   1604 <_end_pagebuf_page_io_multi>
+}
+    17a5:	83 c4 0c             	add    $0xc,%esp
+    17a8:	c3                   	ret    
+    17a9:	8d 76 00             	lea    0x0(%esi),%esi
+
+00000000000017ac <_end_io_multi_part>:
+
+STATIC void
+_end_io_multi_part(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+    17ac:	8b 54 24 04          	mov    0x4(%esp,1),%edx
+    17b0:	8b 44 24 08          	mov    0x8(%esp,1),%eax
+	_end_pagebuf_page_io_multi(bh, uptodate, 0);
+    17b4:	6a 00                	push   $0x0
+    17b6:	50                   	push   %eax
+    17b7:	52                   	push   %edx
+    17b8:	e8 47 fe ff ff       	call   1604 <_end_pagebuf_page_io_multi>
+}
+    17bd:	83 c4 0c             	add    $0xc,%esp
+    17c0:	c3                   	ret    
+    17c1:	8d 76 00             	lea    0x0(%esi),%esi
+
+00000000000017c4 <_pagebuf_page_io>:
+
+
+/*
+ * Initiate I/O on part of a page we are interested in
+ */
+STATIC int
+_pagebuf_page_io(
+	struct page		*page,	/* Page structure we are dealing with */
+	page_buf_t		*pb,	/* pagebuf holding it, can be NULL */
+	page_buf_daddr_t	bn,	/* starting block number */
+	kdev_t			dev,	/* device for I/O */
+	size_t			blocksize,	/* filesystem block size */
+	off_t			pg_offset,	/* starting offset in page */
+	size_t			pg_length,	/* count of data to process */
+	int			locking,	/* page locking in use */
+	int			rw,	/* read/write operation */
+	int			flush)
+{
+    17c4:	83 ec 44             	sub    $0x44,%esp
+    17c7:	55                   	push   %ebp
+    17c8:	57                   	push   %edi
+    17c9:	56                   	push   %esi
+    17ca:	53                   	push   %ebx
+    17cb:	8b 44 24 68          	mov    0x68(%esp,1),%eax
+    17cf:	66 89 44 24 32       	mov    %ax,0x32(%esp,1)
+	size_t			sector;
+	size_t			blk_length = 0;
+	struct buffer_head	*bh, *head, *bufferlist[MAX_BUF_PER_PAGE];
+	int			multi_ok;
+	int			i = 0, cnt = 0, err = 0;
+    17d4:	c7 44 24 24 00 00 00 	movl   $0x0,0x24(%esp,1)
+    17db:	00 
+    17dc:	c7 44 24 20 00 00 00 	movl   $0x0,0x20(%esp,1)
+    17e3:	00 
+	int			public_bh = 0;
+    17e4:	c7 44 24 1c 00 00 00 	movl   $0x0,0x1c(%esp,1)
+    17eb:	00 
+
+	if ((blocksize < PAGE_CACHE_SIZE) &&
+    17ec:	81 7c 24 6c ff 0f 00 	cmpl   $0xfff,0x6c(%esp,1)
+    17f3:	00 
+    17f4:	0f 87 69 01 00 00    	ja     1963 <_pagebuf_page_io+0x19f>
+    17fa:	8b 54 24 5c          	mov    0x5c(%esp,1),%edx
+    17fe:	8b 42 08             	mov    0x8(%edx),%eax
+    1801:	a9 00 00 10 00       	test   $0x100000,%eax
+    1806:	0f 85 57 01 00 00    	jne    1963 <_pagebuf_page_io+0x19f>
+	    !(pb->pb_flags & _PBF_PRIVATE_BH)) {
+		int		cache_ok;
+
+		cache_ok = !((pb->pb_flags & PBF_FORCEIO) || (rw == WRITE));
+    180c:	c7 44 24 18 00 00 00 	movl   $0x0,0x18(%esp,1)
+    1813:	00 
+    1814:	a9 00 00 00 08       	test   $0x8000000,%eax
+    1819:	75 1a                	jne    1835 <_pagebuf_page_io+0x71>
+    181b:	c7 44 24 18 01 00 00 	movl   $0x1,0x18(%esp,1)
+    1822:	00 
+    1823:	8b 4c 24 18          	mov    0x18(%esp,1),%ecx
+    1827:	83 7c 24 7c 01       	cmpl   $0x1,0x7c(%esp,1)
+    182c:	0f 44 4c 24 24       	cmove  0x24(%esp,1),%ecx
+    1831:	89 4c 24 18          	mov    %ecx,0x18(%esp,1)
+		public_bh = multi_ok = 1;
+
+		if (!page_has_buffers(page)) {
+    1835:	8b 7c 24 58          	mov    0x58(%esp,1),%edi
+    1839:	c7 44 24 28 01 00 00 	movl   $0x1,0x28(%esp,1)
+    1840:	00 
+    1841:	c7 44 24 1c 01 00 00 	movl   $0x1,0x1c(%esp,1)
+    1848:	00 
+    1849:	83 7f 28 00          	cmpl   $0x0,0x28(%edi)
+    184d:	75 4b                	jne    189a <_pagebuf_page_io+0xd6>
+			if (!locking) {
+    184f:	83 7c 24 78 00       	cmpl   $0x0,0x78(%esp,1)
+    1854:	75 2c                	jne    1882 <_pagebuf_page_io+0xbe>
+				lock_page(page);
+    1856:	89 f8                	mov    %edi,%eax
+    1858:	e8 fc ff ff ff       	call   1859 <_pagebuf_page_io+0x95>
+				if (!page_has_buffers(page)) {
+    185d:	83 7f 28 00          	cmpl   $0x0,0x28(%edi)
+    1861:	75 14                	jne    1877 <_pagebuf_page_io+0xb3>
+					create_empty_buffers(page, dev,
+    1863:	68 00 02 00 00       	push   $0x200
+    1868:	0f b7 44 24 36       	movzwl 0x36(%esp,1),%eax
+    186d:	50                   	push   %eax
+    186e:	57                   	push   %edi
+    186f:	e8 fc ff ff ff       	call   1870 <_pagebuf_page_io+0xac>
+							SECTOR_SIZE);
+				}
+    1874:	83 c4 0c             	add    $0xc,%esp
+				unlock_page(page);
+    1877:	8b 44 24 58          	mov    0x58(%esp,1),%eax
+    187b:	e8 fc ff ff ff       	call   187c <_pagebuf_page_io+0xb8>
+			} else {
+    1880:	eb 18                	jmp    189a <_pagebuf_page_io+0xd6>
+				create_empty_buffers(page, dev, SECTOR_SIZE);
+    1882:	68 00 02 00 00       	push   $0x200
+    1887:	0f b7 44 24 36       	movzwl 0x36(%esp,1),%eax
+    188c:	50                   	push   %eax
+    188d:	8b 44 24 60          	mov    0x60(%esp,1),%eax
+    1891:	50                   	push   %eax
+    1892:	e8 fc ff ff ff       	call   1893 <_pagebuf_page_io+0xcf>
+			}
+    1897:	83 c4 0c             	add    $0xc,%esp
+		}
+
+		/* Find buffer_heads belonging to just this pagebuf */
+		bh = head = page_buffers(page);
+    189a:	8b 54 24 58          	mov    0x58(%esp,1),%edx
+    189e:	8b 52 28             	mov    0x28(%edx),%edx
+    18a1:	89 54 24 10          	mov    %edx,0x10(%esp,1)
+    18a5:	89 54 24 2c          	mov    %edx,0x2c(%esp,1)
+    18a9:	8d b4 26 00 00 00 00 	lea    0x0(%esi,1),%esi
+		do {
+			if (buffer_uptodate(bh) && cache_ok)
+    18b0:	8b 4c 24 10          	mov    0x10(%esp,1),%ecx
+    18b4:	f6 41 18 01          	testb  $0x1,0x18(%ecx)
+    18b8:	74 0b                	je     18c5 <_pagebuf_page_io+0x101>
+    18ba:	83 7c 24 18 00       	cmpl   $0x0,0x18(%esp,1)
+    18bf:	0f 85 7e 00 00 00    	jne    1943 <_pagebuf_page_io+0x17f>
+				continue;
+			blk_length = i << SECTOR_SHIFT;
+    18c5:	8b 5c 24 24          	mov    0x24(%esp,1),%ebx
+    18c9:	c1 e3 09             	shl    $0x9,%ebx
+			if (blk_length < pg_offset)
+    18cc:	3b 5c 24 70          	cmp    0x70(%esp,1),%ebx
+    18d0:	72 71                	jb     1943 <_pagebuf_page_io+0x17f>
+				continue;
+			if (blk_length >= pg_offset + pg_length)
+    18d2:	8b 44 24 70          	mov    0x70(%esp,1),%eax
+    18d6:	03 44 24 74          	add    0x74(%esp,1),%eax
+    18da:	39 c3                	cmp    %eax,%ebx
+    18dc:	0f 83 f0 01 00 00    	jae    1ad2 <_pagebuf_page_io+0x30e>
+}
+
+static inline void lock_buffer(struct buffer_head * bh)
+{
+	while (test_and_set_bit(BH_Lock, &bh->b_state))
+    18e2:	8b 5c 24 70          	mov    0x70(%esp,1),%ebx
+    18e6:	8b 7c 24 20          	mov    0x20(%esp,1),%edi
+    18ea:	8b 74 24 20          	mov    0x20(%esp,1),%esi
+    18ee:	c1 fb 09             	sar    $0x9,%ebx
+    18f1:	c1 e7 02             	shl    $0x2,%edi
+    18f4:	89 7c 24 14          	mov    %edi,0x14(%esp,1)
+    18f8:	8d 6c 24 34          	lea    0x34(%esp,1),%ebp
+    18fc:	46                   	inc    %esi
+    18fd:	eb 0e                	jmp    190d <_pagebuf_page_io+0x149>
+    18ff:	90                   	nop    
+		__wait_on_buffer(bh);
+    1900:	8b 44 24 10          	mov    0x10(%esp,1),%eax
+    1904:	50                   	push   %eax
+    1905:	e8 fc ff ff ff       	call   1906 <_pagebuf_page_io+0x142>
+    190a:	83 c4 04             	add    $0x4,%esp
+static __inline__ int test_and_set_bit(int nr, volatile void * addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__( LOCK_PREFIX
+    190d:	8b 54 24 10          	mov    0x10(%esp,1),%edx
+    1911:	b8 02 00 00 00       	mov    $0x2,%eax
+    1916:	f0 0f ab 42 18       	lock bts %eax,0x18(%edx)
+    191b:	19 c0                	sbb    %eax,%eax
+}
+
+static inline void lock_buffer(struct buffer_head * bh)
+{
+	while (test_and_set_bit(BH_Lock, &bh->b_state))
+    191d:	85 c0                	test   %eax,%eax
+    191f:	75 df                	jne    1900 <_pagebuf_page_io+0x13c>
+ * useful range of an atomic_t is only 24 bits.
+ */ 
+static __inline__ void atomic_inc(atomic_t *v)
+{
+	__asm__ __volatile__(
+    1921:	f0 ff 42 10          	lock incl 0x10(%edx)
+				break;
+
+			lock_buffer(bh);
+			get_bh(bh);
+			assert(!waitqueue_active(&bh->b_wait));
+
+			bh->b_size = SECTOR_SIZE;
+    1925:	66 c7 42 08 00 02    	movw   $0x200,0x8(%edx)
+			bh->b_blocknr = bn + (i - (pg_offset >> SECTOR_SHIFT));
+    192b:	8b 44 24 24          	mov    0x24(%esp,1),%eax
+    192f:	29 d8                	sub    %ebx,%eax
+    1931:	03 44 24 60          	add    0x60(%esp,1),%eax
+    1935:	89 42 04             	mov    %eax,0x4(%edx)
+			bufferlist[cnt++] = bh;
+    1938:	8b 4c 24 14          	mov    0x14(%esp,1),%ecx
+    193c:	89 14 29             	mov    %edx,(%ecx,%ebp,1)
+    193f:	89 74 24 20          	mov    %esi,0x20(%esp,1)
+		} while (i++, (bh = bh->b_this_page) != head);
+    1943:	8b 7c 24 10          	mov    0x10(%esp,1),%edi
+    1947:	8b 44 24 2c          	mov    0x2c(%esp,1),%eax
+    194b:	ff 44 24 24          	incl   0x24(%esp,1)
+    194f:	8b 7f 28             	mov    0x28(%edi),%edi
+    1952:	89 7c 24 10          	mov    %edi,0x10(%esp,1)
+    1956:	39 c7                	cmp    %eax,%edi
+    1958:	0f 85 52 ff ff ff    	jne    18b0 <_pagebuf_page_io+0xec>
+
+		goto request;
+    195e:	e9 6f 01 00 00       	jmp    1ad2 <_pagebuf_page_io+0x30e>
+	}
+
+	/* Calculate the block offsets and length we will be using */
+	if (pg_offset) {
+    1963:	83 7c 24 70 00       	cmpl   $0x0,0x70(%esp,1)
+    1968:	74 1d                	je     1987 <_pagebuf_page_io+0x1c3>
+		size_t		block_offset;
+
+		block_offset = pg_offset >> SECTOR_SHIFT;
+		block_offset = pg_offset - (block_offset << SECTOR_SHIFT);
+    196a:	8b 54 24 70          	mov    0x70(%esp,1),%edx
+    196e:	8b 44 24 70          	mov    0x70(%esp,1),%eax
+    1972:	81 e2 00 fe ff ff    	and    $0xfffffe00,%edx
+    1978:	29 d0                	sub    %edx,%eax
+		blk_length = (pg_length + block_offset + SECTOR_MASK) >>
+    197a:	8b 54 24 74          	mov    0x74(%esp,1),%edx
+    197e:	8d 9c 10 ff 01 00 00 	lea    0x1ff(%eax,%edx,1),%ebx
+								SECTOR_SHIFT;
+	} else {
+    1985:	eb 0a                	jmp    1991 <_pagebuf_page_io+0x1cd>
+		blk_length = (pg_length + SECTOR_MASK) >> SECTOR_SHIFT;
+    1987:	8b 5c 24 74          	mov    0x74(%esp,1),%ebx
+    198b:	81 c3 ff 01 00 00    	add    $0x1ff,%ebx
+    1991:	c1 eb 09             	shr    $0x9,%ebx
+	}
+
+	/* This will attempt to make a request bigger than the sector
+	 * size if we are well aligned.
+	 */
+	switch (pb->pb_target->pbr_flags) {
+    1994:	8b 4c 24 5c          	mov    0x5c(%esp,1),%ecx
+    1998:	8b 41 14             	mov    0x14(%ecx),%eax
+    199b:	8b 00                	mov    (%eax),%eax
+    199d:	83 f8 01             	cmp    $0x1,%eax
+    19a0:	74 3e                	je     19e0 <_pagebuf_page_io+0x21c>
+    19a2:	7f 0c                	jg     19b0 <_pagebuf_page_io+0x1ec>
+    19a4:	85 c0                	test   %eax,%eax
+    19a6:	74 2a                	je     19d2 <_pagebuf_page_io+0x20e>
+    19a8:	eb 36                	jmp    19e0 <_pagebuf_page_io+0x21c>
+    19aa:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
+    19b0:	83 f8 02             	cmp    $0x2,%eax
+    19b3:	75 2b                	jne    19e0 <_pagebuf_page_io+0x21c>
+	case 0:
+		sector = blk_length << SECTOR_SHIFT;
+		blk_length = 1;
+		break;
+	case PBR_ALIGNED_ONLY:
+		if ((pg_offset == 0) && (pg_length == PAGE_CACHE_SIZE) &&
+    19b5:	83 7c 24 70 00       	cmpl   $0x0,0x70(%esp,1)
+    19ba:	75 24                	jne    19e0 <_pagebuf_page_io+0x21c>
+    19bc:	81 7c 24 74 00 10 00 	cmpl   $0x1000,0x74(%esp,1)
+    19c3:	00 
+    19c4:	75 1a                	jne    19e0 <_pagebuf_page_io+0x21c>
+    19c6:	8b 7c 24 60          	mov    0x60(%esp,1),%edi
+    19ca:	f7 c7 07 00 00 00    	test   $0x7,%edi
+    19d0:	75 0e                	jne    19e0 <_pagebuf_page_io+0x21c>
+		    (((unsigned int) bn) & BN_ALIGN_MASK) == 0) {
+			sector = blk_length << SECTOR_SHIFT;
+    19d2:	89 de                	mov    %ebx,%esi
+    19d4:	c1 e6 09             	shl    $0x9,%esi
+			blk_length = 1;
+    19d7:	bb 01 00 00 00       	mov    $0x1,%ebx
+			break;
+    19dc:	eb 0c                	jmp    19ea <_pagebuf_page_io+0x226>
+    19de:	89 f6                	mov    %esi,%esi
+		}
+	case PBR_SECTOR_ONLY:
+		/* Fallthrough, same as default */
+	default:
+		sector = SECTOR_SIZE;
+    19e0:	be 00 02 00 00       	mov    $0x200,%esi
+	}
+
+	/* The b_size field of struct buffer_head is an unsigned short
+	 * ... we may need to split this request up.  [64K is too big]
+	 */
+	assert(sizeof(bh->b_size) == 2);
+    19e5:	eb 03                	jmp    19ea <_pagebuf_page_io+0x226>
+	while (sector > 0xffff) {
+		sector >>= 1;
+    19e7:	d1 ee                	shr    %esi
+		blk_length++;
+    19e9:	43                   	inc    %ebx
+	}
+    19ea:	81 fe ff ff 00 00    	cmp    $0xffff,%esi
+    19f0:	77 f5                	ja     19e7 <_pagebuf_page_io+0x223>
+
+	multi_ok = (blk_length != 1);
+    19f2:	83 fb 01             	cmp    $0x1,%ebx
+    19f5:	0f 95 c0             	setne  %al
+    19f8:	0f b6 c0             	movzbl %al,%eax
+    19fb:	89 44 24 28          	mov    %eax,0x28(%esp,1)
+
+	for (; blk_length > 0; blk_length--, pg_offset += sector) {
+    19ff:	85 db                	test   %ebx,%ebx
+    1a01:	0f 84 cb 00 00 00    	je     1ad2 <_pagebuf_page_io+0x30e>
+    1a07:	31 ed                	xor    %ebp,%ebp
+    1a09:	8d b4 26 00 00 00 00 	lea    0x0(%esi,1),%esi
+		bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS);
+    1a10:	a1 00 00 00 00       	mov    0x0,%eax
+    1a15:	68 f0 00 00 00       	push   $0xf0
+    1a1a:	50                   	push   %eax
+    1a1b:	e8 fc ff ff ff       	call   1a1c <_pagebuf_page_io+0x258>
+    1a20:	89 44 24 18          	mov    %eax,0x18(%esp,1)
+		if (!bh) {
+    1a24:	83 c4 08             	add    $0x8,%esp
+    1a27:	85 c0                	test   %eax,%eax
+    1a29:	75 11                	jne    1a3c <_pagebuf_page_io+0x278>
+			bh = _pagebuf_get_prealloc_bh();
+    1a2b:	e8 90 ed ff ff       	call   7c0 <_pagebuf_get_prealloc_bh>
+    1a30:	89 44 24 10          	mov    %eax,0x10(%esp,1)
+			if (!bh) {
+    1a34:	85 c0                	test   %eax,%eax
+    1a36:	0f 84 d4 01 00 00    	je     1c10 <_pagebuf_page_io+0x44c>
+	: "memory")
+{
+	int d0, d1;
+	switch (count % 4) {
+		case 0: COMMON(""); return s;
+    1a3c:	8b 7c 24 10          	mov    0x10(%esp,1),%edi
+    1a40:	b9 19 00 00 00       	mov    $0x19,%ecx
+    1a45:	89 e8                	mov    %ebp,%eax
+    1a47:	f3 ab                	repz stos %eax,%es:(%edi)
+				/* This should never happen */
+				err = -ENOMEM;
+				goto error;
+			}
+		}
+		memset(bh, 0, sizeof(*bh));
+		bh->b_size = sector;
+    1a49:	8b 44 24 10          	mov    0x10(%esp,1),%eax
+    1a4d:	66 89 70 08          	mov    %si,0x8(%eax)
+		bh->b_blocknr = bn++;
+    1a51:	8b 54 24 60          	mov    0x60(%esp,1),%edx
+    1a55:	89 50 04             	mov    %edx,0x4(%eax)
+		bh->b_dev = dev;
+    1a58:	0f b7 4c 24 32       	movzwl 0x32(%esp,1),%ecx
+    1a5d:	83 44 24 60 01       	addl   $0x1,0x60(%esp,1)
+    1a62:	83 54 24 64 00       	adcl   $0x0,0x64(%esp,1)
+    1a67:	66 89 48 0c          	mov    %cx,0xc(%eax)
+ * restricted to acting on a single-word quantity.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+	__asm__ __volatile__( LOCK_PREFIX
+    1a6b:	8b 7c 24 10          	mov    0x10(%esp,1),%edi
+    1a6f:	b8 02 00 00 00       	mov    $0x2,%eax
+    1a74:	f0 0f ab 47 18       	lock bts %eax,0x18(%edi)
+		set_bit(BH_Lock, &bh->b_state);
+		set_bh_page(bh, page, pg_offset);
+    1a79:	8b 44 24 70          	mov    0x70(%esp,1),%eax
+    1a7d:	50                   	push   %eax
+    1a7e:	8b 54 24 5c          	mov    0x5c(%esp,1),%edx
+    1a82:	52                   	push   %edx
+    1a83:	57                   	push   %edi
+    1a84:	e8 fc ff ff ff       	call   1a85 <_pagebuf_page_io+0x2c1>
+#define DECLARE_WAIT_QUEUE_HEAD(name) \
+	wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
+
+static inline void init_waitqueue_head(wait_queue_head_t *q)
+{
+    1a89:	89 f9                	mov    %edi,%ecx
+    1a8b:	83 c4 0c             	add    $0xc,%esp
+#if WAITQUEUE_DEBUG
+	if (!q)
+		WQ_BUG();
+#endif
+	q->lock = WAITQUEUE_RW_LOCK_UNLOCKED;
+    1a8e:	8b 7c 24 10          	mov    0x10(%esp,1),%edi
+    1a92:	83 c1 48             	add    $0x48,%ecx
+    1a95:	b8 01 00 00 00       	mov    $0x1,%eax
+    1a9a:	ba ad 4e ad de       	mov    $0xdead4ead,%edx
+    1a9f:	89 47 48             	mov    %eax,0x48(%edi)
+    1aa2:	89 57 4c             	mov    %edx,0x4c(%edi)
+	INIT_LIST_HEAD(&q->task_list);
+    1aa5:	89 f8                	mov    %edi,%eax
+    1aa7:	83 c0 50             	add    $0x50,%eax
+    1aaa:	89 41 08             	mov    %eax,0x8(%ecx)
+    1aad:	89 41 0c             	mov    %eax,0xc(%ecx)
+		init_waitqueue_head(&bh->b_wait);
+		atomic_set(&bh->b_count, 1);
+    1ab0:	c7 47 10 01 00 00 00 	movl   $0x1,0x10(%edi)
+		bufferlist[cnt++] = bh;
+    1ab7:	8b 54 24 20          	mov    0x20(%esp,1),%edx
+    1abb:	8d 44 24 34          	lea    0x34(%esp,1),%eax
+    1abf:	89 3c 90             	mov    %edi,(%eax,%edx,4)
+    1ac2:	42                   	inc    %edx
+    1ac3:	89 54 24 20          	mov    %edx,0x20(%esp,1)
+    1ac7:	01 74 24 70          	add    %esi,0x70(%esp,1)
+    1acb:	4b                   	dec    %ebx
+    1acc:	0f 85 3e ff ff ff    	jne    1a10 <_pagebuf_page_io+0x24c>
+	}
+
+request:
+	if (cnt) {
+    1ad2:	83 7c 24 20 00       	cmpl   $0x0,0x20(%esp,1)
+    1ad7:	0f 84 1a 01 00 00    	je     1bf7 <_pagebuf_page_io+0x433>
+		pagesync_t	*psync = NULL;
+    1add:	31 db                	xor    %ebx,%ebx
+		void		(*callback)(struct buffer_head *, int);
+
+		if (multi_ok) {
+    1adf:	83 7c 24 28 00       	cmpl   $0x0,0x28(%esp,1)
+    1ae4:	74 4a                	je     1b30 <_pagebuf_page_io+0x36c>
+			size_t	size = sizeof(pagesync_t);
+
+			psync = (pagesync_t *) kmalloc(size, GFP_NOFS);
+    1ae6:	68 f0 00 00 00       	push   $0xf0
+    1aeb:	6a 0c                	push   $0xc
+    1aed:	e8 fc ff ff ff       	call   1aee <_pagebuf_page_io+0x32a>
+    1af2:	89 c3                	mov    %eax,%ebx
+			if (!psync)
+    1af4:	83 c4 08             	add    $0x8,%esp
+    1af7:	85 db                	test   %ebx,%ebx
+    1af9:	75 08                	jne    1b03 <_pagebuf_page_io+0x33f>
+				BUG();	/* Ugh - out of memory condition here */
+    1afb:	0f 0b                	ud2a   
+    1afd:	78 06                	js     1b05 <_pagebuf_page_io+0x341>
+    1aff:	34 00                	xor    $0x0,%al
+    1b01:	00 00                	add    %al,(%eax)
+			psync->pb = pb;
+    1b03:	8b 4c 24 5c          	mov    0x5c(%esp,1),%ecx
+    1b07:	89 0b                	mov    %ecx,(%ebx)
+			psync->locking = locking;
+    1b09:	8b 7c 24 78          	mov    0x78(%esp,1),%edi
+    1b0d:	89 7b 04             	mov    %edi,0x4(%ebx)
+			atomic_set(&psync->remain, 0);
+    1b10:	c7 43 08 00 00 00 00 	movl   $0x0,0x8(%ebx)
+
+			callback = public_bh ?
+    1b17:	be 94 17 00 00       	mov    $0x1794,%esi
+    1b1c:	b8 ac 17 00 00       	mov    $0x17ac,%eax
+    1b21:	83 7c 24 1c 00       	cmpl   $0x0,0x1c(%esp,1)
+    1b26:	0f 45 f0             	cmovne %eax,%esi
+				   _end_io_multi_part : _end_io_multi_full;
+		} else {
+    1b29:	eb 17                	jmp    1b42 <_pagebuf_page_io+0x37e>
+    1b2b:	90                   	nop    
+    1b2c:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+			callback = locking ? _end_io_locked : _end_io_nolock;
+    1b30:	be ec 15 00 00       	mov    $0x15ec,%esi
+    1b35:	b8 d4 15 00 00       	mov    $0x15d4,%eax
+    1b3a:	83 7c 24 78 00       	cmpl   $0x0,0x78(%esp,1)
+    1b3f:	0f 45 f0             	cmovne %eax,%esi
+ * useful range of an atomic_t is only 24 bits.
+ */ 
+static __inline__ void atomic_inc(atomic_t *v)
+{
+	__asm__ __volatile__(
+    1b42:	8b 44 24 5c          	mov    0x5c(%esp,1),%eax
+    1b46:	f0 ff 80 b4 00 00 00 	lock incl 0xb4(%eax)
+		}
+
+		/* Indicate that there is another page in progress */
+		atomic_inc(&PBP(pb)->pb_io_remaining);
+
+#ifdef RQ_WRITE_ORDERED
+		if (flush)
+			set_bit(BH_Ordered_Flush, &bufferlist[cnt-1]->b_state);
+#endif
+
+		for (i = 0; i < cnt; i++) {
+    1b4d:	8b 54 24 20          	mov    0x20(%esp,1),%edx
+    1b51:	c7 44 24 24 00 00 00 	movl   $0x0,0x24(%esp,1)
+    1b58:	00 
+    1b59:	39 54 24 24          	cmp    %edx,0x24(%esp,1)
+    1b5d:	0f 8d a4 00 00 00    	jge    1c07 <_pagebuf_page_io+0x443>
+			bh = bufferlist[i];
+    1b63:	8b 4c 24 24          	mov    0x24(%esp,1),%ecx
+    1b67:	8d 44 24 34          	lea    0x34(%esp,1),%eax
+    1b6b:	8b 04 88             	mov    (%eax,%ecx,4),%eax
+    1b6e:	89 44 24 10          	mov    %eax,0x10(%esp,1)
+
+			/* Complete the buffer_head, then submit the IO */
+			if (psync) {
+    1b72:	85 db                	test   %ebx,%ebx
+    1b74:	74 11                	je     1b87 <_pagebuf_page_io+0x3c3>
+				init_buffer(bh, callback, psync);
+    1b76:	53                   	push   %ebx
+    1b77:	56                   	push   %esi
+    1b78:	50                   	push   %eax
+    1b79:	e8 fc ff ff ff       	call   1b7a <_pagebuf_page_io+0x3b6>
+ * Atomically increments @v by 1.  Note that the guaranteed
+ * useful range of an atomic_t is only 24 bits.
+ */ 
+static __inline__ void atomic_inc(atomic_t *v)
+{
+    1b7e:	83 c4 0c             	add    $0xc,%esp
+	__asm__ __volatile__(
+    1b81:	f0 ff 43 08          	lock incl 0x8(%ebx)
+				atomic_inc(&psync->remain);
+			} else {
+    1b85:	eb 13                	jmp    1b9a <_pagebuf_page_io+0x3d6>
+				init_buffer(bh, callback, pb);
+    1b87:	8b 7c 24 5c          	mov    0x5c(%esp,1),%edi
+    1b8b:	57                   	push   %edi
+    1b8c:	56                   	push   %esi
+    1b8d:	8b 44 24 18          	mov    0x18(%esp,1),%eax
+    1b91:	50                   	push   %eax
+    1b92:	e8 fc ff ff ff       	call   1b93 <_pagebuf_page_io+0x3cf>
+			}
+    1b97:	83 c4 0c             	add    $0xc,%esp
+
+			bh->b_rdev = bh->b_dev;
+    1b9a:	8b 54 24 10          	mov    0x10(%esp,1),%edx
+    1b9e:	0f b7 42 0c          	movzwl 0xc(%edx),%eax
+    1ba2:	66 89 42 14          	mov    %ax,0x14(%edx)
+			bh->b_rsector = bh->b_blocknr;
+    1ba6:	8b 42 04             	mov    0x4(%edx),%eax
+    1ba9:	89 42 44             	mov    %eax,0x44(%edx)
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+    1bac:	b8 04 00 00 00       	mov    $0x4,%eax
+	__asm__ __volatile__( LOCK_PREFIX
+    1bb1:	f0 0f ab 42 18       	lock bts %eax,0x18(%edx)
+    1bb6:	b8 03 00 00 00       	mov    $0x3,%eax
+    1bbb:	f0 0f ab 42 18       	lock bts %eax,0x18(%edx)
+			set_bit(BH_Mapped, &bh->b_state);
+			set_bit(BH_Req, &bh->b_state);
+
+			if (rw == WRITE) {
+    1bc0:	83 7c 24 7c 01       	cmpl   $0x1,0x7c(%esp,1)
+    1bc5:	75 07                	jne    1bce <_pagebuf_page_io+0x40a>
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+    1bc7:	31 c0                	xor    %eax,%eax
+	__asm__ __volatile__( LOCK_PREFIX
+    1bc9:	f0 0f ab 42 18       	lock bts %eax,0x18(%edx)
+				set_bit(BH_Uptodate, &bh->b_state);
+			}
+			generic_make_request(rw, bh);
+    1bce:	8b 4c 24 10          	mov    0x10(%esp,1),%ecx
+    1bd2:	51                   	push   %ecx
+    1bd3:	8b bc 24 80 00 00 00 	mov    0x80(%esp,1),%edi
+    1bda:	57                   	push   %edi
+    1bdb:	e8 fc ff ff ff       	call   1bdc <_pagebuf_page_io+0x418>
+    1be0:	83 c4 08             	add    $0x8,%esp
+    1be3:	8b 44 24 20          	mov    0x20(%esp,1),%eax
+    1be7:	ff 44 24 24          	incl   0x24(%esp,1)
+    1beb:	39 44 24 24          	cmp    %eax,0x24(%esp,1)
+    1bef:	0f 8c 6e ff ff ff    	jl     1b63 <_pagebuf_page_io+0x39f>
+		}
+	} else {
+    1bf5:	eb 10                	jmp    1c07 <_pagebuf_page_io+0x443>
+		if (locking)
+    1bf7:	83 7c 24 78 00       	cmpl   $0x0,0x78(%esp,1)
+    1bfc:	74 09                	je     1c07 <_pagebuf_page_io+0x443>
+			unlock_page(page);
+    1bfe:	8b 44 24 58          	mov    0x58(%esp,1),%eax
+    1c02:	e8 fc ff ff ff       	call   1c03 <_pagebuf_page_io+0x43f>
+	}
+
+	return err;
+    1c07:	31 c0                	xor    %eax,%eax
+    1c09:	eb 43                	jmp    1c4e <_pagebuf_page_io+0x48a>
+    1c0b:	90                   	nop    
+    1c0c:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+error:
+	/* If we ever do get here then clean up what we already did */
+	for (i = 0; i < cnt; i++) {
+    1c10:	83 7c 24 20 00       	cmpl   $0x0,0x20(%esp,1)
+    1c15:	7e 32                	jle    1c49 <_pagebuf_page_io+0x485>
+    1c17:	8b 54 24 20          	mov    0x20(%esp,1),%edx
+    1c1b:	be 01 00 00 00       	mov    $0x1,%esi
+    1c20:	31 db                	xor    %ebx,%ebx
+    1c22:	89 54 24 24          	mov    %edx,0x24(%esp,1)
+ * This operation is atomic and cannot be reordered.  
+ * It also implies a memory barrier.
+ */
+static __inline__ int test_and_clear_bit(int nr, volatile void * addr)
+{
+    1c26:	8b 44 1c 34          	mov    0x34(%esp,%ebx,1),%eax
+	int oldbit;
+
+	__asm__ __volatile__( LOCK_PREFIX
+    1c2a:	f0 0f b3 70 18       	lock btr %esi,0x18(%eax)
+    1c2f:	19 d2                	sbb    %edx,%edx
+		atomic_set_buffer_clean(bufferlist[i]);
+		bufferlist[i]->b_end_io(bufferlist[i], 0);
+    1c31:	8b 44 1c 34          	mov    0x34(%esp,%ebx,1),%eax
+    1c35:	6a 00                	push   $0x0
+    1c37:	50                   	push   %eax
+    1c38:	8b 40 3c             	mov    0x3c(%eax),%eax
+    1c3b:	ff d0                	call   *%eax
+    1c3d:	83 c4 08             	add    $0x8,%esp
+    1c40:	83 c3 04             	add    $0x4,%ebx
+    1c43:	ff 4c 24 24          	decl   0x24(%esp,1)
+    1c47:	75 dd                	jne    1c26 <_pagebuf_page_io+0x462>
+	}
+	return err;
+    1c49:	b8 f4 ff ff ff       	mov    $0xfffffff4,%eax
+    1c4e:	5b                   	pop    %ebx
+    1c4f:	5e                   	pop    %esi
+    1c50:	5f                   	pop    %edi
+    1c51:	5d                   	pop    %ebp
+    1c52:	83 c4 44             	add    $0x44,%esp
+    1c55:	c3                   	ret    
+}
+    1c56:	89 f6                	mov    %esi,%esi
+
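The block arithmetic near the top of _pagebuf_page_io just rounds the byte range [pg_offset, pg_offset + pg_length) out to whole 512-byte sectors. A standalone sketch of that calculation, reusing SECTOR_SHIFT and SECTOR_MASK from the source; sectors_needed and the sample values are illustrative only:

	#include <stdio.h>
	#include <stddef.h>

	#define SECTOR_SHIFT	9
	#define SECTOR_SIZE	(1 << SECTOR_SHIFT)
	#define SECTOR_MASK	(SECTOR_SIZE - 1)

	/* Number of 512-byte sectors needed to cover pg_length bytes
	 * starting at byte pg_offset within the page, mirroring the
	 * blk_length calculation in _pagebuf_page_io above. */
	static size_t sectors_needed(size_t pg_offset, size_t pg_length)
	{
		if (pg_offset) {
			/* byte offset within the first sector */
			size_t block_offset = pg_offset & SECTOR_MASK;

			return (pg_length + block_offset + SECTOR_MASK)
							>> SECTOR_SHIFT;
		}
		return (pg_length + SECTOR_MASK) >> SECTOR_SHIFT;
	}

	int main(void)
	{
		/* 1000 bytes starting 300 bytes into the page touch
		 * sectors 0, 1 and 2, so three buffer_heads are needed */
		printf("%zu\n", sectors_needed(300, 1000));	/* 3 */
		return 0;
	}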
+0000000000001c58 <_page_buf_page_apply>:
+
+STATIC int
+_page_buf_page_apply(
+	page_buf_t		*pb,
+	loff_t			offset,
+	struct page		*page,
+	size_t			pg_offset,
+	size_t			pg_length,
+	int			last)
+{
+    1c58:	83 ec 0c             	sub    $0xc,%esp
+    1c5b:	55                   	push   %ebp
+    1c5c:	57                   	push   %edi
+    1c5d:	56                   	push   %esi
+    1c5e:	53                   	push   %ebx
+    1c5f:	8b 6c 24 20          	mov    0x20(%esp,1),%ebp
+	page_buf_daddr_t	bn = pb->pb_bn;
+	kdev_t			dev = pb->pb_target->pbr_kdev;
+    1c63:	8b 45 14             	mov    0x14(%ebp),%eax
+    1c66:	8b 75 1c             	mov    0x1c(%ebp),%esi
+    1c69:	8b 7d 20             	mov    0x20(%ebp),%edi
+    1c6c:	0f b7 50 06          	movzwl 0x6(%eax),%edx
+    1c70:	66 89 54 24 1a       	mov    %dx,0x1a(%esp,1)
+	size_t			blocksize = pb->pb_target->pbr_blocksize;
+    1c75:	8b 40 10             	mov    0x10(%eax),%eax
+    1c78:	89 44 24 14          	mov    %eax,0x14(%esp,1)
+	loff_t			pb_offset;
+	size_t			ret_len = pg_length;
+    1c7c:	8b 44 24 34          	mov    0x34(%esp,1),%eax
+    1c80:	89 44 24 10          	mov    %eax,0x10(%esp,1)
+
+	assert(page);
+
+	if ((blocksize == PAGE_CACHE_SIZE) &&
+    1c84:	81 7c 24 14 00 10 00 	cmpl   $0x1000,0x14(%esp,1)
+    1c8b:	00 
+    1c8c:	75 42                	jne    1cd0 <_page_buf_page_apply+0x78>
+    1c8e:	81 7d 2c ff 0f 00 00 	cmpl   $0xfff,0x2c(%ebp)
+    1c95:	77 39                	ja     1cd0 <_page_buf_page_apply+0x78>
+    1c97:	8b 45 08             	mov    0x8(%ebp),%eax
+    1c9a:	89 c3                	mov    %eax,%ebx
+    1c9c:	f6 c3 01             	test   $0x1,%bl
+    1c9f:	74 2f                	je     1cd0 <_page_buf_page_apply+0x78>
+    1ca1:	80 bd 82 00 00 00 00 	cmpb   $0x0,0x82(%ebp)
+    1ca8:	74 26                	je     1cd0 <_page_buf_page_apply+0x78>
+	    (pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
+	    (pb->pb_flags & PBF_READ) && pb->pb_locked) {
+		bn -= (pb->pb_offset >> SECTOR_SHIFT);
+    1caa:	0f b7 8d 80 00 00 00 	movzwl 0x80(%ebp),%ecx
+		pg_offset = 0;
+		pg_length = PAGE_CACHE_SIZE;
+    1cb1:	8b 54 24 14          	mov    0x14(%esp,1),%edx
+    1cb5:	66 c1 e9 09          	shr    $0x9,%cx
+    1cb9:	0f b7 c1             	movzwl %cx,%eax
+    1cbc:	29 c6                	sub    %eax,%esi
+    1cbe:	83 df 00             	sbb    $0x0,%edi
+    1cc1:	c7 44 24 30 00 00 00 	movl   $0x0,0x30(%esp,1)
+    1cc8:	00 
+    1cc9:	89 54 24 34          	mov    %edx,0x34(%esp,1)
+	} else {
+    1ccd:	eb 2c                	jmp    1cfb <_page_buf_page_apply+0xa3>
+    1ccf:	90                   	nop    
+		pb_offset = offset - pb->pb_file_offset;
+		if (pb_offset) {
+    1cd0:	8b 5d 08             	mov    0x8(%ebp),%ebx
+    1cd3:	8b 54 24 24          	mov    0x24(%esp,1),%edx
+    1cd7:	8b 4c 24 28          	mov    0x28(%esp,1),%ecx
+    1cdb:	2b 55 24             	sub    0x24(%ebp),%edx
+    1cde:	1b 4d 28             	sbb    0x28(%ebp),%ecx
+    1ce1:	89 d0                	mov    %edx,%eax
+    1ce3:	09 c8                	or     %ecx,%eax
+    1ce5:	74 14                	je     1cfb <_page_buf_page_apply+0xa3>
+			bn += (pb_offset + SECTOR_MASK) >> SECTOR_SHIFT;
+    1ce7:	81 c2 ff 01 00 00    	add    $0x1ff,%edx
+    1ced:	83 d1 00             	adc    $0x0,%ecx
+    1cf0:	0f ac ca 09          	shrd   $0x9,%ecx,%edx
+    1cf4:	c1 f9 09             	sar    $0x9,%ecx
+    1cf7:	01 d6                	add    %edx,%esi
+    1cf9:	11 cf                	adc    %ecx,%edi
+		}
+	}
+
+	if (pb->pb_flags & PBF_READ) {
+    1cfb:	f6 c3 01             	test   $0x1,%bl
+    1cfe:	74 10                	je     1d10 <_page_buf_page_apply+0xb8>
+		_pagebuf_page_io(page, pb, bn, dev, blocksize,
+    1d00:	6a 00                	push   $0x0
+    1d02:	6a 00                	push   $0x0
+    1d04:	0f b6 85 82 00 00 00 	movzbl 0x82(%ebp),%eax
+    1d0b:	50                   	push   %eax
+			(off_t)pg_offset, pg_length, pb->pb_locked, READ, 0);
+	} else if (pb->pb_flags & PBF_WRITE) {
+    1d0c:	eb 44                	jmp    1d52 <_page_buf_page_apply+0xfa>
+    1d0e:	89 f6                	mov    %esi,%esi
+    1d10:	f6 c3 02             	test   $0x2,%bl
+    1d13:	74 62                	je     1d77 <_page_buf_page_apply+0x11f>
+		int locking = (pb->pb_flags & _PBF_LOCKABLE) == 0;
+    1d15:	c1 eb 13             	shr    $0x13,%ebx
+    1d18:	83 f3 01             	xor    $0x1,%ebx
+    1d1b:	83 e3 01             	and    $0x1,%ebx
+
+		/* Check whether we need to lock pages */
+		if (locking && (pb->pb_locked == 0))
+    1d1e:	74 12                	je     1d32 <_page_buf_page_apply+0xda>
+    1d20:	80 bd 82 00 00 00 00 	cmpb   $0x0,0x82(%ebp)
+    1d27:	75 09                	jne    1d32 <_page_buf_page_apply+0xda>
+			lock_page(page);
+    1d29:	8b 44 24 2c          	mov    0x2c(%esp,1),%eax
+    1d2d:	e8 fc ff ff ff       	call   1d2e <_page_buf_page_apply+0xd6>
+		_pagebuf_page_io(page, pb, bn, dev, blocksize,
+    1d32:	31 c9                	xor    %ecx,%ecx
+    1d34:	83 7c 24 38 00       	cmpl   $0x0,0x38(%esp,1)
+    1d39:	74 13                	je     1d4e <_page_buf_page_apply+0xf6>
+    1d3b:	8b 55 08             	mov    0x8(%ebp),%edx
+    1d3e:	81 e2 00 00 00 10    	and    $0x10000000,%edx
+    1d44:	b8 01 00 00 00       	mov    $0x1,%eax
+    1d49:	85 d2                	test   %edx,%edx
+    1d4b:	0f 45 c8             	cmovne %eax,%ecx
+    1d4e:	51                   	push   %ecx
+    1d4f:	6a 01                	push   $0x1
+    1d51:	53                   	push   %ebx
+    1d52:	8b 44 24 40          	mov    0x40(%esp,1),%eax
+    1d56:	50                   	push   %eax
+    1d57:	8b 54 24 40          	mov    0x40(%esp,1),%edx
+    1d5b:	52                   	push   %edx
+    1d5c:	8b 44 24 28          	mov    0x28(%esp,1),%eax
+    1d60:	50                   	push   %eax
+    1d61:	0f b7 44 24 32       	movzwl 0x32(%esp,1),%eax
+    1d66:	50                   	push   %eax
+    1d67:	57                   	push   %edi
+    1d68:	56                   	push   %esi
+    1d69:	55                   	push   %ebp
+    1d6a:	8b 54 24 54          	mov    0x54(%esp,1),%edx
+    1d6e:	52                   	push   %edx
+    1d6f:	e8 50 fa ff ff       	call   17c4 <_pagebuf_page_io>
+			(off_t)pg_offset, pg_length, locking, WRITE,
+			last && (pb->pb_flags & PBF_FLUSH));
+	}
+    1d74:	83 c4 2c             	add    $0x2c,%esp
+
+	return ret_len;
+    1d77:	8b 44 24 10          	mov    0x10(%esp,1),%eax
+    1d7b:	5b                   	pop    %ebx
+    1d7c:	5e                   	pop    %esi
+    1d7d:	5f                   	pop    %edi
+    1d7e:	5d                   	pop    %ebp
+    1d7f:	83 c4 0c             	add    $0xc,%esp
+    1d82:	c3                   	ret    
+}
+    1d83:	90                   	nop    
+
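The bn adjustment in _page_buf_page_apply is plain 64-bit arithmetic: the distance of this page from the pagebuf's file offset, rounded up to whole sectors, is added to the starting block number. A sketch under the assumption that loff_t and page_buf_daddr_t can be modelled as fixed-width integers (page_start_block is a hypothetical name):

	#include <stdio.h>
	#include <stdint.h>

	#define SECTOR_SHIFT	9
	#define SECTOR_MASK	((1 << SECTOR_SHIFT) - 1)

	/* Starting device block for a page that sits at byte 'offset'
	 * of a file, inside a pagebuf whose data begins at file offset
	 * 'pb_file_offset' and device block 'pb_bn'; the same
	 * adjustment the code above performs. */
	static uint64_t page_start_block(uint64_t pb_bn,
					 int64_t pb_file_offset,
					 int64_t offset)
	{
		int64_t pb_offset = offset - pb_file_offset;

		if (pb_offset)
			pb_bn += (pb_offset + SECTOR_MASK) >> SECTOR_SHIFT;
		return pb_bn;
	}

	int main(void)
	{
		/* the second 4K page of a buffer starting at block 100
		 * lies 4096 bytes == 8 sectors further into the device */
		printf("%llu\n", (unsigned long long)
				page_start_block(100, 0, 4096)); /* 108 */
		return 0;
	}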
+0000000000001d84 <pagebuf_iorequest>:
+
+/*
+ *	pagebuf_iorequest
+ *
+ *	pagebuf_iorequest is the core I/O request routine.
+ *	It assumes that the buffer is well-formed, mapped, and
+ *	ready for physical I/O, unlike pagebuf_iostart() and
+ *	pagebuf_iophysio().  Those routines call the
+ *	pagebuf_ioinitiate routine to start I/O if one is
+ *	present, and fall back to calling pagebuf_iorequest()
+ *	directly otherwise.
+ *
+ *	This function is responsible for restricting access to the
+ *	pages while I/O is in progress: for locking pagebufs the
+ *	pagebuf lock is the mediator, while for non-locking pagebufs
+ *	the pages themselves are locked.  In the locking case the
+ *	pagebuf lock must be used because multiple meta-data buffers
+ *	may reference the same page.
+ */
+int
+pagebuf_iorequest(			/* start real I/O		*/
+	page_buf_t		*pb)	/* buffer to convey to device	*/
+{
+    1d84:	83 ec 20             	sub    $0x20,%esp
+    1d87:	57                   	push   %edi
+    1d88:	56                   	push   %esi
+    1d89:	53                   	push   %ebx
+    1d8a:	8b 74 24 30          	mov    0x30(%esp,1),%esi
+	int			status = 0;
+
+	PB_TRACE(pb, PB_TRACE_REC(ioreq), 0);
+
+	if (pb->pb_flags & PBF_DELWRI) {
+    1d8e:	8b 46 08             	mov    0x8(%esi),%eax
+    1d91:	a8 40                	test   $0x40,%al
+    1d93:	74 12                	je     1da7 <pagebuf_iorequest+0x23>
+		pagebuf_delwri_queue(pb, 1);
+    1d95:	6a 01                	push   $0x1
+    1d97:	56                   	push   %esi
+    1d98:	e8 fc ff ff ff       	call   1d99 <pagebuf_iorequest+0x15>
+		return status;
+    1d9d:	31 c0                	xor    %eax,%eax
+    1d9f:	83 c4 08             	add    $0x8,%esp
+    1da2:	e9 0f 01 00 00       	jmp    1eb6 <pagebuf_iorequest+0x132>
+	}
+
+	if (pb->pb_flags & PBF_WRITE) {
+    1da7:	a8 02                	test   $0x2,%al
+    1da9:	0f 84 ac 00 00 00    	je     1e5b <pagebuf_iorequest+0xd7>
+
+static inline struct task_struct * get_current(void)
+{
+	struct task_struct *current;
+	__asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
+    1daf:	b8 00 e0 ff ff       	mov    $0xffffe000,%eax
+    1db4:	21 e0                	and    %esp,%eax
+    1db6:	c7 44 24 0c 00 00 00 	movl   $0x0,0xc(%esp,1)
+    1dbd:	00 
+    1dbe:	c7 44 24 10 00 00 00 	movl   $0x0,0x10(%esp,1)
+    1dc5:	00 
+    1dc6:	c7 44 24 14 00 00 00 	movl   $0x0,0x14(%esp,1)
+    1dcd:	00 
+    1dce:	c7 44 24 18 00 00 00 	movl   $0x0,0x18(%esp,1)
+    1dd5:	00 
+    1dd6:	89 44 24 10          	mov    %eax,0x10(%esp,1)
+    1dda:	c7 44 24 1c 00 00 00 	movl   $0x0,0x1c(%esp,1)
+    1de1:	00 
+    1de2:	89 44 24 20          	mov    %eax,0x20(%esp,1)
+    1de6:	c7 44 24 24 00 00 00 	movl   $0x0,0x24(%esp,1)
+    1ded:	00 
+    1dee:	c7 44 24 28 00 00 00 	movl   $0x0,0x28(%esp,1)
+    1df5:	00 
+    1df6:	8b 86 b8 00 00 00    	mov    0xb8(%esi),%eax
+    1dfc:	85 c0                	test   %eax,%eax
+    1dfe:	74 5b                	je     1e5b <pagebuf_iorequest+0xd7>
+    1e00:	8d 9e bc 00 00 00    	lea    0xbc(%esi),%ebx
+    1e06:	8d 54 24 1c          	lea    0x1c(%esp,1),%edx
+    1e0a:	89 d8                	mov    %ebx,%eax
+    1e0c:	e8 fc ff ff ff       	call   1e0d <pagebuf_iorequest+0x89>
+    1e11:	89 df                	mov    %ebx,%edi
+    1e13:	eb 1e                	jmp    1e33 <pagebuf_iorequest+0xaf>
+extern void __run_task_queue(task_queue *list);
+
+static inline void run_task_queue(task_queue *list)
+{
+	if (TQ_ACTIVE(*list))
+    1e15:	81 3d 00 00 00 00 00 	cmpl   $0x0,0x0
+    1e1c:	00 00 00 
+    1e1f:	74 0d                	je     1e2e <pagebuf_iorequest+0xaa>
+		__run_task_queue(list);
+    1e21:	68 00 00 00 00       	push   $0x0
+    1e26:	e8 fc ff ff ff       	call   1e27 <pagebuf_iorequest+0xa3>
+    1e2b:	83 c4 04             	add    $0x4,%esp
+    1e2e:	e8 fc ff ff ff       	call   1e2f <pagebuf_iorequest+0xab>
+
+static inline struct task_struct * get_current(void)
+{
+	struct task_struct *current;
+	__asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
+    1e33:	bb 00 e0 ff ff       	mov    $0xffffe000,%ebx
+    1e38:	21 e3                	and    %esp,%ebx
+    1e3a:	c7 03 02 00 00 00    	movl   $0x2,(%ebx)
+    1e40:	8b 86 b8 00 00 00    	mov    0xb8(%esi),%eax
+    1e46:	85 c0                	test   %eax,%eax
+    1e48:	75 cb                	jne    1e15 <pagebuf_iorequest+0x91>
+    1e4a:	8d 54 24 1c          	lea    0x1c(%esp,1),%edx
+    1e4e:	89 f8                	mov    %edi,%eax
+    1e50:	e8 fc ff ff ff       	call   1e51 <pagebuf_iorequest+0xcd>
+    1e55:	c7 03 00 00 00 00    	movl   $0x0,(%ebx)
+		_pagebuf_wait_unpin(pb);
+	}
+
+	/* Set the count to 1 initially; this stops an I/O
+	 * completion callout that arrives before we have started
+	 * all the I/O from calling iodone too early.
+	 */
+	atomic_set(&PBP(pb)->pb_io_remaining, 1);
+    1e5b:	c7 86 b4 00 00 00 01 	movl   $0x1,0xb4(%esi)
+    1e62:	00 00 00 
+	status = _pagebuf_segment_apply(pb);
+    1e65:	56                   	push   %esi
+    1e66:	e8 95 02 00 00       	call   2100 <_pagebuf_segment_apply>
+    1e6b:	89 c3                	mov    %eax,%ebx
+ * cases.  Note that the guaranteed
+ * useful range of an atomic_t is only 24 bits.
+ */ 
+static __inline__ int atomic_dec_and_test(atomic_t *v)
+{
+    1e6d:	83 c4 04             	add    $0x4,%esp
+	unsigned char c;
+
+	__asm__ __volatile__(
+    1e70:	f0 ff 8e b4 00 00 00 	lock decl 0xb4(%esi)
+    1e77:	0f 94 c0             	sete   %al
+
+	/* Drop our count and if everything worked we are done */
+	if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) == 1) {
+    1e7a:	84 c0                	test   %al,%al
+    1e7c:	74 08                	je     1e86 <pagebuf_iorequest+0x102>
+		pagebuf_iodone(pb);
+    1e7e:	56                   	push   %esi
+    1e7f:	e8 fc ff ff ff       	call   1e80 <pagebuf_iorequest+0xfc>
+	} else if ((pb->pb_flags & (PBF_SYNC|PBF_ASYNC)) == PBF_SYNC)  {
+    1e84:	eb 25                	jmp    1eab <pagebuf_iorequest+0x127>
+    1e86:	8b 46 08             	mov    0x8(%esi),%eax
+    1e89:	25 10 01 00 00       	and    $0x110,%eax
+    1e8e:	3d 00 01 00 00       	cmp    $0x100,%eax
+    1e93:	75 19                	jne    1eae <pagebuf_iorequest+0x12a>
+extern void __run_task_queue(task_queue *list);
+
+static inline void run_task_queue(task_queue *list)
+{
+	if (TQ_ACTIVE(*list))
+    1e95:	81 3d 00 00 00 00 00 	cmpl   $0x0,0x0
+    1e9c:	00 00 00 
+    1e9f:	74 0d                	je     1eae <pagebuf_iorequest+0x12a>
+		__run_task_queue(list);
+    1ea1:	68 00 00 00 00       	push   $0x0
+    1ea6:	e8 fc ff ff ff       	call   1ea7 <pagebuf_iorequest+0x123>
+    1eab:	83 c4 04             	add    $0x4,%esp
+		run_task_queue(&tq_disk);
+	}
+
+	return status < 0 ? status : 0;
+    1eae:	31 c0                	xor    %eax,%eax
+    1eb0:	83 fb 01             	cmp    $0x1,%ebx
+    1eb3:	0f 4c c3             	cmovl  %ebx,%eax
+}
+    1eb6:	5b                   	pop    %ebx
+    1eb7:	5e                   	pop    %esi
+    1eb8:	5f                   	pop    %edi
+    1eb9:	83 c4 20             	add    $0x20,%esp
+    1ebc:	c3                   	ret    
+    1ebd:	8d 76 00             	lea    0x0(%esi),%esi
+
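The comment about setting pb_io_remaining to 1 describes a reference-count bias: the initiator holds one artificial reference for the whole submission phase, so a completion racing with submission can never drive the count to zero early. A userspace sketch of the same guard, assuming C11 atomics (io_remaining, io_complete and iodone are hypothetical stand-ins, not the kernel names):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int io_remaining;

	static void iodone(void) { printf("iodone\n"); }

	/* Each completion drops one reference; last one fires iodone. */
	static void io_complete(void)
	{
		if (atomic_fetch_sub(&io_remaining, 1) == 1)
			iodone();
	}

	int main(void)
	{
		/* bias the count by 1 before submitting anything, as
		 * pagebuf_iorequest does above */
		atomic_init(&io_remaining, 1);

		atomic_fetch_add(&io_remaining, 1);	/* submit I/O #1 */
		io_complete();				/* ...it completes */
		atomic_fetch_add(&io_remaining, 1);	/* submit I/O #2 */
		io_complete();

		io_complete();	/* drop the bias: only now iodone fires */
		return 0;
	}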
+0000000000001ec0 <pagebuf_iowait>:
+
+/*
+ *	pagebuf_iowait
+ *
+ *	pagebuf_iowait waits for I/O to complete on the buffer supplied.
+ *	It returns immediately if no I/O is pending.  In any case, it returns
+ *	the error code, if any, or 0 if there is no error.
+ */
+int
+pagebuf_iowait(
+	page_buf_t		*pb)
+{
+    1ec0:	53                   	push   %ebx
+    1ec1:	8b 5c 24 08          	mov    0x8(%esp,1),%ebx
+extern void __run_task_queue(task_queue *list);
+
+static inline void run_task_queue(task_queue *list)
+{
+	if (TQ_ACTIVE(*list))
+    1ec5:	81 3d 00 00 00 00 00 	cmpl   $0x0,0x0
+    1ecc:	00 00 00 
+    1ecf:	74 0f                	je     1ee0 <pagebuf_iowait+0x20>
+		__run_task_queue(list);
+    1ed1:	68 00 00 00 00       	push   $0x0
+    1ed6:	e8 fc ff ff ff       	call   1ed7 <pagebuf_iowait+0x17>
+    1edb:	83 c4 04             	add    $0x4,%esp
+}
+    1ede:	89 f6                	mov    %esi,%esi
+ * "__down_failed" is a special asm handler that calls the C
+ * routine that actually waits. See arch/i386/kernel/semaphore.c
+ */
+static inline void down(struct semaphore * sem)
+{
+    1ee0:	8d 4b 58             	lea    0x58(%ebx),%ecx
+#if WAITQUEUE_DEBUG
+	CHECK_MAGIC(sem->__magic);
+#endif
+
+	__asm__ __volatile__(
+    1ee3:	f0 ff 4b 58          	lock decl 0x58(%ebx)
+    1ee7:	0f 88 86 0f 00 00    	js     2e73 <Letext+0xaa>
+	PB_TRACE(pb, PB_TRACE_REC(iowait), 0);
+	run_task_queue(&tq_disk);
+	down(&pb->pb_iodonesema);
+	PB_TRACE(pb, PB_TRACE_REC(iowaited), (int)pb->pb_error);
+	return pb->pb_error;
+    1eed:	0f b7 43 7c          	movzwl 0x7c(%ebx),%eax
+    1ef1:	5b                   	pop    %ebx
+    1ef2:	c3                   	ret    
+}
+    1ef3:	90                   	nop    
+
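The wait itself is a plain semaphore handshake: the completion side does up(&pb->pb_iodonesema) and pagebuf_iowait blocks in down() until it is posted. A POSIX sketch under that assumption, with sem_* standing in for the kernel primitives and completion_side a hypothetical helper:

	#include <semaphore.h>
	#include <stdio.h>

	static sem_t iodone_sema;	/* plays pb->pb_iodonesema */
	static int io_error;		/* plays pb->pb_error */

	static void completion_side(void)
	{
		io_error = 0;			/* record the I/O result */
		sem_post(&iodone_sema);		/* like up() */
	}

	int main(void)
	{
		sem_init(&iodone_sema, 0, 0);
		completion_side();	/* normally runs at I/O completion */
		sem_wait(&iodone_sema);	/* like down() in pagebuf_iowait */
		printf("error = %d\n", io_error);
		return 0;
	}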
+0000000000001ef4 <pagebuf_mapout_locked>:
+
+STATIC void *
+pagebuf_mapout_locked(
+	page_buf_t		*pb)
+{
+    1ef4:	53                   	push   %ebx
+    1ef5:	8b 4c 24 08          	mov    0x8(%esp,1),%ecx
+	void			*old_addr = NULL;
+    1ef9:	31 c0                	xor    %eax,%eax
+
+	if (pb->pb_flags & PBF_MAPPED) {
+    1efb:	8b 51 08             	mov    0x8(%ecx),%edx
+    1efe:	f6 c2 04             	test   $0x4,%dl
+    1f01:	74 26                	je     1f29 <pagebuf_mapout_locked+0x35>
+		if (pb->pb_flags & _PBF_ADDR_ALLOCATED)
+    1f03:	f7 c2 00 00 00 01    	test   $0x1000000,%edx
+    1f09:	74 0e                	je     1f19 <pagebuf_mapout_locked+0x25>
+			old_addr = pb->pb_addr - pb->pb_offset;
+    1f0b:	8b 59 34             	mov    0x34(%ecx),%ebx
+    1f0e:	0f b7 81 80 00 00 00 	movzwl 0x80(%ecx),%eax
+    1f15:	29 c3                	sub    %eax,%ebx
+    1f17:	89 d8                	mov    %ebx,%eax
+		pb->pb_addr = NULL;
+    1f19:	c7 41 34 00 00 00 00 	movl   $0x0,0x34(%ecx)
+		pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED);
+    1f20:	81 e2 fb ff ff fe    	and    $0xfefffffb,%edx
+    1f26:	89 51 08             	mov    %edx,0x8(%ecx)
+	}
+
+	return old_addr;	/* Caller must free the address space,
+    1f29:	5b                   	pop    %ebx
+    1f2a:	c3                   	ret    
+				 * we are under a spin lock, probably
+				 * not safe to do vfree here
+				 */
+}
+    1f2b:	90                   	nop    
+
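The trailing comment spells out the contract: the mapping is detached while the spin lock is held, but the expensive vfree() must be done by the caller after dropping the lock. A hypothetical userspace analogue, with a pthread mutex standing in for the spin lock and free() for vfree():

	#include <pthread.h>
	#include <stdlib.h>

	static pthread_mutex_t pb_lock = PTHREAD_MUTEX_INITIALIZER;

	struct buf { void *addr; };

	/* Detach the mapping under the lock, defer the actual free,
	 * as pagebuf_mapout_locked does above. */
	static void *mapout_locked(struct buf *b)
	{
		void *old = b->addr;

		b->addr = NULL;
		return old;
	}

	static void teardown(struct buf *b)
	{
		void *old;

		pthread_mutex_lock(&pb_lock);
		old = mapout_locked(b);		/* cheap, lock held */
		pthread_mutex_unlock(&pb_lock);

		free(old);	/* expensive release, outside the lock */
	}

	int main(void)
	{
		struct buf b = { .addr = malloc(64) };

		teardown(&b);
		return 0;
	}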
+0000000000001f2c <pagebuf_offset>:
+
+caddr_t
+pagebuf_offset(
+	page_buf_t		*pb,
+	off_t			offset)
+{
+    1f2c:	8b 44 24 04          	mov    0x4(%esp,1),%eax
+	struct page		*page;
+
+	offset += pb->pb_offset;
+    1f30:	0f b7 90 80 00 00 00 	movzwl 0x80(%eax),%edx
+
+	page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
+    1f37:	8b 80 84 00 00 00    	mov    0x84(%eax),%eax
+    1f3d:	03 54 24 08          	add    0x8(%esp,1),%edx
+    1f41:	89 d1                	mov    %edx,%ecx
+    1f43:	c1 f9 0c             	sar    $0xc,%ecx
+    1f46:	8b 04 88             	mov    (%eax,%ecx,4),%eax
+	return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
+    1f49:	81 e2 ff 0f 00 00    	and    $0xfff,%edx
+    1f4f:	03 50 2c             	add    0x2c(%eax),%edx
+    1f52:	89 d0                	mov    %edx,%eax
+    1f54:	c3                   	ret    
+}
+    1f55:	8d 76 00             	lea    0x0(%esi),%esi
+
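pagebuf_offset is a two-part address decomposition: the high bits of the byte offset index the pb_pages[] array, the low bits locate the byte within that page. A trivial standalone demonstration of the same split, assuming the 4K pages implied by the 0xfff mask in the disassembly:

	#include <stdio.h>

	#define PAGE_CACHE_SHIFT	12
	#define PAGE_CACHE_SIZE		(1 << PAGE_CACHE_SHIFT)

	int main(void)
	{
		unsigned long offset = 0x2345;

		/* index into the page array, then within the page */
		printf("page %lu, offset 0x%lx\n",
		       offset >> PAGE_CACHE_SHIFT,	/* page 2 */
		       offset & (PAGE_CACHE_SIZE - 1));	/* 0x345 */
		return 0;
	}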
+0000000000001f58 <pagebuf_segment>:
+
+/*
+ *	pagebuf_segment
+ *
+ *	pagebuf_segment is used to retrieve the various contiguous
+ *	segments of a buffer.  The variable addressed by the
+ *	loff_t * should be initialized to 0, and successive
+ *	calls will update it to point to the segment following
+ *	the one returned.
+ */
+STATIC void
+pagebuf_segment(
+	page_buf_t		*pb,	/* buffer to examine		*/
+	loff_t			*boff_p,/* offset in buffer of next	*/
+					/* next segment (updated)	*/
+	struct page		**spage_p, /* page (updated)		*/
+					/* (NULL if not in page array)	*/
+	size_t			*soff_p,/* offset in page (updated)	*/
+	size_t			*ssize_p) /* segment length (updated)	*/
+{
+	loff_t			kpboff;	/* offset in pagebuf		*/
+	int			kpi;	/* page index in pagebuf	*/
+	size_t			slen;	/* segment length		*/
+
+	kpboff = *boff_p;
+    1f58:	83 ec 08             	sub    $0x8,%esp
+    1f5b:	55                   	push   %ebp
+    1f5c:	57                   	push   %edi
+    1f5d:	56                   	push   %esi
+    1f5e:	53                   	push   %ebx
+    1f5f:	8b 44 24 20          	mov    0x20(%esp,1),%eax
+
+	kpi = page_buf_btoct(kpboff + pb->pb_offset);
+    1f63:	8b 4c 24 1c          	mov    0x1c(%esp,1),%ecx
+    1f67:	8b 30                	mov    (%eax),%esi
+    1f69:	8b 78 04             	mov    0x4(%eax),%edi
+    1f6c:	0f b7 81 80 00 00 00 	movzwl 0x80(%ecx),%eax
+    1f73:	89 f1                	mov    %esi,%ecx
+    1f75:	89 fb                	mov    %edi,%ebx
+    1f77:	01 c1                	add    %eax,%ecx
+    1f79:	83 d3 00             	adc    $0x0,%ebx
+    1f7c:	89 da                	mov    %ebx,%edx
+
+	*spage_p = pb->pb_pages[kpi];
+    1f7e:	8b 5c 24 1c          	mov    0x1c(%esp,1),%ebx
+    1f82:	89 c8                	mov    %ecx,%eax
+    1f84:	0f ac d0 0c          	shrd   $0xc,%edx,%eax
+    1f88:	c1 fa 0c             	sar    $0xc,%edx
+    1f8b:	89 c2                	mov    %eax,%edx
+    1f8d:	8b 83 84 00 00 00    	mov    0x84(%ebx),%eax
+
+	*soff_p = page_buf_poff(kpboff + pb->pb_offset);
+	slen = PAGE_CACHE_SIZE - *soff_p;
+    1f93:	bd 00 10 00 00       	mov    $0x1000,%ebp
+    1f98:	8b 04 90             	mov    (%eax,%edx,4),%eax
+    1f9b:	8b 54 24 24          	mov    0x24(%esp,1),%edx
+    1f9f:	89 02                	mov    %eax,(%edx)
+    1fa1:	8b 4c 24 28          	mov    0x28(%esp,1),%ecx
+    1fa5:	0f b7 83 80 00 00 00 	movzwl 0x80(%ebx),%eax
+    1fac:	01 f0                	add    %esi,%eax
+    1fae:	25 ff 0f 00 00       	and    $0xfff,%eax
+    1fb3:	89 01                	mov    %eax,(%ecx)
+	if (slen > (pb->pb_count_desired - kpboff))
+    1fb5:	8b 54 24 1c          	mov    0x1c(%esp,1),%edx
+    1fb9:	29 c5                	sub    %eax,%ebp
+    1fbb:	89 e9                	mov    %ebp,%ecx
+    1fbd:	31 db                	xor    %ebx,%ebx
+    1fbf:	8b 42 30             	mov    0x30(%edx),%eax
+    1fc2:	31 d2                	xor    %edx,%edx
+    1fc4:	89 44 24 10          	mov    %eax,0x10(%esp,1)
+    1fc8:	89 54 24 14          	mov    %edx,0x14(%esp,1)
+    1fcc:	8b 44 24 10          	mov    0x10(%esp,1),%eax
+    1fd0:	8b 54 24 14          	mov    0x14(%esp,1),%edx
+    1fd4:	29 f0                	sub    %esi,%eax
+    1fd6:	19 fa                	sbb    %edi,%edx
+    1fd8:	39 d3                	cmp    %edx,%ebx
+    1fda:	7f 06                	jg     1fe2 <pagebuf_segment+0x8a>
+    1fdc:	75 0d                	jne    1feb <pagebuf_segment+0x93>
+    1fde:	39 c1                	cmp    %eax,%ecx
+    1fe0:	76 09                	jbe    1feb <pagebuf_segment+0x93>
+		slen = (pb->pb_count_desired - kpboff);
+    1fe2:	8b 4c 24 1c          	mov    0x1c(%esp,1),%ecx
+    1fe6:	8b 69 30             	mov    0x30(%ecx),%ebp
+    1fe9:	29 f5                	sub    %esi,%ebp
+	*ssize_p = slen;
+    1feb:	8b 44 24 2c          	mov    0x2c(%esp,1),%eax
+    1fef:	89 28                	mov    %ebp,(%eax)
+
+	*boff_p = *boff_p + slen;
+    1ff1:	8b 5c 24 20          	mov    0x20(%esp,1),%ebx
+    1ff5:	01 2b                	add    %ebp,(%ebx)
+    1ff7:	83 53 04 00          	adcl   $0x0,0x4(%ebx)
+    1ffb:	5b                   	pop    %ebx
+    1ffc:	5e                   	pop    %esi
+    1ffd:	5f                   	pop    %edi
+    1ffe:	5d                   	pop    %ebp
+    1fff:	59                   	pop    %ecx
+    2000:	5a                   	pop    %edx
+    2001:	c3                   	ret    
+}
+    2002:	89 f6                	mov    %esi,%esi
+
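The calling convention described in the comment (initialize the loff_t cursor to 0, then call repeatedly until it passes the end of the range) is exactly the loop pagebuf_iomove runs below. A sketch of that protocol against a stub segment function; the stub's fixed 1024-byte segments are purely illustrative:

	#include <stdio.h>
	#include <stddef.h>

	/* Stub with the same shape as pagebuf_segment above: report
	 * one 1024-byte segment per call and advance *boff. */
	static void segment(long long *boff, size_t *soff, size_t *ssize)
	{
		*soff = (size_t)(*boff & 0xfff);	/* offset in page */
		*ssize = 1024;				/* segment length */
		*boff += *ssize;			/* cursor updated */
	}

	int main(void)
	{
		long long boff = 0;	/* cursor must start at 0 */
		long long total = 4096;
		size_t soff, ssize;

		while (boff < total) {	/* same loop as pagebuf_iomove */
			segment(&boff, &soff, &ssize);
			printf("segment at page offset %zu, %zu bytes\n",
			       soff, ssize);
		}
		return 0;
	}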
+0000000000002004 <pagebuf_iomove>:
+
+/*
+ *	pagebuf_iomove
+ *
+ *	Move data into or out of a buffer.
+ */
+void
+pagebuf_iomove(
+	page_buf_t		*pb,	/* buffer to process		*/
+	off_t			boff,	/* starting buffer offset	*/
+	size_t			bsize,	/* length to copy		*/
+	caddr_t			data,	/* data address			*/
+	page_buf_rw_t		mode)	/* read/write flag		*/
+{
+    2004:	55                   	push   %ebp
+    2005:	89 e5                	mov    %esp,%ebp
+    2007:	83 ec 14             	sub    $0x14,%esp
+    200a:	57                   	push   %edi
+    200b:	56                   	push   %esi
+    200c:	53                   	push   %ebx
+    200d:	8b 5d 14             	mov    0x14(%ebp),%ebx
+	loff_t			cboff;
+	size_t			cpoff;
+	size_t			csize;
+	struct page		*page;
+
+	cboff = boff;
+    2010:	8b 45 0c             	mov    0xc(%ebp),%eax
+    2013:	99                   	cltd   
+    2014:	89 55 fc             	mov    %edx,0xfffffffc(%ebp)
+	boff += bsize; /* last */
+    2017:	8b 55 10             	mov    0x10(%ebp),%edx
+    201a:	89 45 f8             	mov    %eax,0xfffffff8(%ebp)
+    201d:	01 d0                	add    %edx,%eax
+    201f:	89 45 0c             	mov    %eax,0xc(%ebp)
+
+	while (cboff < boff) {
+    2022:	89 c2                	mov    %eax,%edx
+    2024:	89 c1                	mov    %eax,%ecx
+    2026:	8d 45 f8             	lea    0xfffffff8(%ebp),%eax
+    2029:	8b 70 04             	mov    0x4(%eax),%esi
+    202c:	c1 f9 1f             	sar    $0x1f,%ecx
+    202f:	39 f1                	cmp    %esi,%ecx
+    2031:	7f 0f                	jg     2042 <pagebuf_iomove+0x3e>
+    2033:	0f 85 ba 00 00 00    	jne    20f3 <pagebuf_iomove+0xef>
+    2039:	3b 55 f8             	cmp    0xfffffff8(%ebp),%edx
+    203c:	0f 86 b1 00 00 00    	jbe    20f3 <pagebuf_iomove+0xef>
+		pagebuf_segment(pb, &cboff, &page, &cpoff, &csize);
+    2042:	8d 45 f4             	lea    0xfffffff4(%ebp),%eax
+    2045:	50                   	push   %eax
+    2046:	8d 45 f0             	lea    0xfffffff0(%ebp),%eax
+    2049:	50                   	push   %eax
+    204a:	8d 45 ec             	lea    0xffffffec(%ebp),%eax
+    204d:	50                   	push   %eax
+    204e:	8d 45 f8             	lea    0xfffffff8(%ebp),%eax
+    2051:	50                   	push   %eax
+    2052:	8b 55 08             	mov    0x8(%ebp),%edx
+    2055:	52                   	push   %edx
+    2056:	e8 fd fe ff ff       	call   1f58 <pagebuf_segment>
+		assert(((csize + cpoff) <= PAGE_CACHE_SIZE));
+    205b:	83 c4 14             	add    $0x14,%esp
+
+		switch (mode) {
+    205e:	83 7d 18 02          	cmpl   $0x2,0x18(%ebp)
+    2062:	74 4c                	je     20b0 <pagebuf_iomove+0xac>
+    2064:	77 0a                	ja     2070 <pagebuf_iomove+0x6c>
+    2066:	83 7d 18 01          	cmpl   $0x1,0x18(%ebp)
+    206a:	74 34                	je     20a0 <pagebuf_iomove+0x9c>
+    206c:	eb 62                	jmp    20d0 <pagebuf_iomove+0xcc>
+    206e:	89 f6                	mov    %esi,%esi
+    2070:	83 7d 18 03          	cmpl   $0x3,0x18(%ebp)
+    2074:	75 5a                	jne    20d0 <pagebuf_iomove+0xcc>
+		case PBRW_ZERO:
+			memset(page_address(page) + cpoff, 0, csize);
+    2076:	8b 45 ec             	mov    0xffffffec(%ebp),%eax
+ * things 32 bits at a time even when we don't know the size of the
+			break;
+		case PBRW_READ:
+			memcpy(data, page_address(page) + cpoff, csize);
+			break;
+		case PBRW_WRITE:
+			memcpy(page_address(page) + cpoff, data, csize);
+		}
+
+		data += csize;
+	}
+}
+
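+/*
+ * The switch above moves one page-sized chunk (csize) per iteration:
+ * PBRW_ZERO zero-fills the page range, PBRW_READ copies from the page
+ * cache out to "data", and PBRW_WRITE copies from "data" into the
+ * page cache.
+ */
+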
+/*
+ *	_pagebuf_segment_apply
+ *
+ *	Applies _page_buf_page_apply to each segment of the page_buf_t.
+ */
+STATIC int
+_pagebuf_segment_apply(			/* apply function to segments	*/
+	page_buf_t		*pb)	/* buffer to examine		*/
+{
+	int			buf_index, sval, status = 0;
+	loff_t			buffer_offset = pb->pb_file_offset;
+	size_t			buffer_len = pb->pb_count_desired;
+	size_t			page_offset, len, total = 0;
+	size_t			cur_offset, cur_len;
+
+	pagebuf_hold(pb);
+
+	cur_offset = pb->pb_offset;
+	cur_len = buffer_len;
+
+	for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) {
+		if (cur_len == 0)
+			break;
+		if (cur_offset >= PAGE_CACHE_SIZE) {
+			cur_offset -= PAGE_CACHE_SIZE;
+			continue;
+		}
+
+		page_offset = cur_offset;
+		cur_offset = 0;
+
+		len = PAGE_CACHE_SIZE - page_offset;
+		if (len > cur_len)
+			len = cur_len;
+		cur_len -= len;
+
+		sval = _page_buf_page_apply(pb, buffer_offset,
+				pb->pb_pages[buf_index], page_offset, len,
+				buf_index+1 == pb->pb_page_count);
+		if (sval <= 0) {
+			status = sval;
+			break;
+		} else {
+			len = sval;
+			total += len;
+		}
+
+		buffer_offset += len;
+		buffer_len -= len;
+	}
+
+	pagebuf_rele(pb);
+
+	if (!status)
+		status = total;
+
+	return (status);
+}
+
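+/*
+ * Example (illustrative values): with PAGE_CACHE_SIZE == 4096, a
+ * buffer with pb_offset == 512 and pb_count_desired == 6000 spanning
+ * two pages is applied as two segments:
+ *
+ *	buf_index 0:	page_offset = 512, len = 4096 - 512 = 3584
+ *	buf_index 1:	page_offset = 0,   len = 6000 - 3584 = 2416
+ *
+ * and _page_buf_page_apply() sees its "last" argument set only on the
+ * second call.
+ */
+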
+/*
+ * Pagebuf delayed write buffer handling
+ */
+
+void
+pagebuf_delwri_queue(
+	page_buf_t		*pb,
+	int			unlock)
+{
+	PB_TRACE(pb, PB_TRACE_REC(delwri_q), unlock);
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	/* If already in the queue, dequeue and place at tail */
+	if (!list_empty(&pb->pb_list)) {
+		if (unlock) {
+			atomic_dec(&pb->pb_hold);
+		}
+		list_del(&pb->pb_list);
+	} else {
+		pb_daemon->pb_delwri_cnt++;
+	}
+	list_add_tail(&pb->pb_list, &pb_daemon->pb_delwrite_l);
+	PBP(pb)->pb_flushtime = jiffies + pb_params.p_un.age_buffer;
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+	if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) {
+		pagebuf_unlock(pb);
+	}
+}
+
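+/*
+ * A typical caller (hypothetical sketch, not code from this patch)
+ * marks a dirty buffer delayed-write and queues it instead of issuing
+ * the write immediately:
+ *
+ *	pb->pb_flags |= PBF_DELWRI;
+ *	pagebuf_delwri_queue(pb, 1);
+ *
+ * Passing unlock == 1 drops the caller's lock (for _PBF_LOCKABLE
+ * buffers) once the buffer is on the daemon's list; pagebuf_daemon()
+ * then writes it out after pb_params.p_un.age_buffer jiffies.
+ */
+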
+void
+pagebuf_delwri_dequeue(
+	page_buf_t		*pb)
+{
+	PB_TRACE(pb, PB_TRACE_REC(delwri_uq), 0);
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	list_del_init(&pb->pb_list);
+	pb->pb_flags &= ~PBF_DELWRI;
+	pb_daemon->pb_delwri_cnt--;
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+}
+
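+/*
+ * Dequeueing undoes only the queue-side state: the buffer is unlinked,
+ * PBF_DELWRI is cleared and pb_delwri_cnt is dropped.
+ */
+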
+/*
+ * The pagebuf iodone daemon
+ */
+
+STATIC int pb_daemons[NR_CPUS];
+
+STATIC int
+pagebuf_iodone_daemon(
+	void			*__bind_cpu)
+{
+	int			bind_cpu = (int) (long) __bind_cpu;
+	int			cpu = cpu_logical_map(bind_cpu);
+	DECLARE_WAITQUEUE	(wait, current);
+
+	/*  Set up the thread  */
+	daemonize();
+
+	/* Avoid signals */
+	spin_lock_irq(&current->sigmask_lock);
+	sigfillset(&current->blocked);
+	recalc_sigpending(current);
+	spin_unlock_irq(&current->sigmask_lock);
+
+	/* Migrate to the right CPU */
+	current->cpus_allowed = 1UL << cpu;
+	while (smp_processor_id() != cpu)
+		schedule();
+
+	sprintf(current->comm, "pagebuf_io_CPU%d", bind_cpu);
+	INIT_LIST_HEAD(&pagebuf_iodone_tq[cpu]);
+	init_waitqueue_head(&pagebuf_iodone_wait[cpu]);
+	__set_current_state(TASK_INTERRUPTIBLE);
+	mb();
+
+	pb_daemons[cpu] = 1;
+
+	for (;;) {
+		add_wait_queue(&pagebuf_iodone_wait[cpu],
+				&wait);
+
+		if (TQ_ACTIVE(pagebuf_iodone_tq[cpu]))
+			__set_task_state(current, TASK_RUNNING);
+		schedule();
+		remove_wait_queue(&pagebuf_iodone_wait[cpu],
+				&wait);
+		run_task_queue(&pagebuf_iodone_tq[cpu]);
+		if (pb_daemons[cpu] == 0)
+			break;
+		__set_current_state(TASK_INTERRUPTIBLE);
+	}
+
+	pb_daemons[cpu] = -1;
+	wake_up_interruptible(&pagebuf_iodone_wait[cpu]);
+	return 0;
+}
+
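+/*
+ * Design note: one daemon is bound to each CPU (via cpus_allowed) so
+ * that completion work queued on a CPU is also run there.  A completion
+ * path would hand work to the daemon roughly like this (hypothetical
+ * sketch; "task" is an assumed tq_struct, not a field from this patch):
+ *
+ *	int cpu = smp_processor_id();
+ *	queue_task(&task, &pagebuf_iodone_tq[cpu]);
+ *	wake_up_interruptible(&pagebuf_iodone_wait[cpu]);
+ */
+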
+/* Defines for pagebuf daemon */
+DECLARE_WAIT_QUEUE_HEAD(pbd_waitq);
+STATIC int force_flush;
+
+STATIC void
+pagebuf_daemon_wakeup(
+	int			flag)
+{
+	force_flush = flag;
+	if (waitqueue_active(&pbd_waitq)) {
+		wake_up_interruptible(&pbd_waitq);
+	}
+}
+
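+/*
+ * Called from the daemon's periodic timer (flag == 0, age-based
+ * writeback only) and, presumably, by callers needing an immediate
+ * flush (flag != 0), in which case force_flush makes pagebuf_daemon()
+ * ignore pb_flushtime.
+ */
+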
+typedef void (*timeout_fn)(unsigned long);
+
+STATIC int
+pagebuf_daemon(
+	void			*data)
+{
+	int			count;
+	page_buf_t		*pb;
+	struct list_head	*curr, *next, tmp;
+	struct timer_list	pb_daemon_timer =
+		{ {NULL, NULL}, 0, 0, (timeout_fn)pagebuf_daemon_wakeup };
+
+	/*  Set up the thread  */
+	daemonize();
+
+	/* Avoid signals */
+	spin_lock_irq(&current->sigmask_lock);
+	sigfillset(&current->blocked);
+	recalc_sigpending(current);
+	spin_unlock_irq(&current->sigmask_lock);
+
+	strcpy(current->comm, "pagebufd");
+	current->flags |= PF_MEMALLOC;
+
+	INIT_LIST_HEAD(&tmp);
+	do {
+		if (pb_daemon->active == 1) {
+			del_timer(&pb_daemon_timer);
+			pb_daemon_timer.expires = jiffies +
+					pb_params.p_un.flush_interval;
+			add_timer(&pb_daemon_timer);
+			interruptible_sleep_on(&pbd_waitq);
+		}
+
+		if (pb_daemon->active == 0) {
+			del_timer(&pb_daemon_timer);
+		}
+
+		spin_lock(&pb_daemon->pb_delwrite_lock);
+
+		count = 0;
+		list_for_each_safe(curr, next, &pb_daemon->pb_delwrite_l) {
+			pb = list_entry(curr, page_buf_t, pb_list);
+
+			PB_TRACE(pb, PB_TRACE_REC(walkq1), pagebuf_ispin(pb));
+
+			if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) &&
+			    (((pb->pb_flags & _PBF_LOCKABLE) == 0) ||
+			     !pagebuf_cond_lock(pb))) {
+
+				if (!force_flush && time_before(jiffies,
+						PBP(pb)->pb_flushtime)) {
+					pagebuf_unlock(pb);
+					break;
+				}
+
+				list_del(&pb->pb_list);
+				list_add(&pb->pb_list, &tmp);
+
+				count++;
+			}
+		}
+
+		spin_unlock(&pb_daemon->pb_delwrite_lock);
+		while (!list_empty(&tmp)) {
+			pb = list_entry(tmp.next,
+							page_buf_t, pb_list);
+			list_del_init(&pb->pb_list);
+			pb->pb_flags &= ~PBF_DELWRI;
+			pb->pb_flags |= PBF_WRITE;
+
+			__pagebuf_iorequest(pb);
+		}
+
+		if (count)
+			run_task_queue(&tq_disk);
+		if (as_list_len > 0)
+			purge_addresses();
+
+		force_flush = 0;
+	} while (pb_daemon->active == 1);
+
+	pb_daemon->active = -1;
+	wake_up_interruptible(&pbd_waitq);
+
+	return 0;
+}
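+
+/*
+ * The scan above can stop at the first buffer that is still too young
+ * (unless force_flush is set) because pagebuf_delwri_queue() adds at
+ * the tail, so the list is ordered oldest-first by pb_flushtime.
+ */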
+
+void
+pagebuf_delwri_flush(
+	pb_target_t		*target,
+	u_long			flags,
+	int			*pinptr)
+{
+	page_buf_t		*pb;
+	struct list_head	*curr, *next, tmp;
+	int			pincount = 0;
+    287c:	83 ec 10             	sub    $0x10,%esp
+    287f:	55                   	push   %ebp
+    2880:	57                   	push   %edi
+    2881:	56                   	push   %esi
+    2882:	53                   	push   %ebx
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+    2883:	8b 1d 28 00 00 00    	mov    0x28,%ebx
+    2889:	31 ed                	xor    %ebp,%ebp
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+    288b:	83 c3 08             	add    $0x8,%ebx
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+    288e:	81 7b 04 ad 4e ad de 	cmpl   $0xdead4ead,0x4(%ebx)
+    2895:	74 1a                	je     28b1 <pagebuf_delwri_flush+0x35>
+printk("eip: %p\n", &&here);
+    2897:	68 8e 28 00 00       	push   $0x288e
+    289c:	68 2b 00 00 00       	push   $0x2b
+    28a1:	e8 fc ff ff ff       	call   28a2 <pagebuf_delwri_flush+0x26>
+		BUG();
+    28a6:	0f 0b                	ud2a   
+    28a8:	85 00                	test   %eax,(%eax)
+    28aa:	00 00                	add    %al,(%eax)
+    28ac:	00 00                	add    %al,(%eax)
+	}
+    28ae:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+    28b1:	f0 fe 0b             	lock decb (%ebx)
+    28b4:	0f 88 ff 05 00 00    	js     2eb9 <Letext+0xf0>
+
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	INIT_LIST_HEAD(&tmp);
+    28ba:	8d 44 24 18          	lea    0x18(%esp,1),%eax
+    28be:	89 44 24 18          	mov    %eax,0x18(%esp,1)
+    28c2:	89 40 04             	mov    %eax,0x4(%eax)
+
+	list_for_each_safe(curr, next, &pb_daemon->pb_delwrite_l) {
+    28c5:	8b 15 28 00 00 00    	mov    0x28,%edx
+    28cb:	8b 72 10             	mov    0x10(%edx),%esi
+    28ce:	8b 44 24 28          	mov    0x28(%esp,1),%eax
+    28d2:	83 c2 10             	add    $0x10,%edx
+    28d5:	8b 0e                	mov    (%esi),%ecx
+    28d7:	89 4c 24 14          	mov    %ecx,0x14(%esp,1)
+    28db:	89 44 24 10          	mov    %eax,0x10(%esp,1)
+    28df:	83 64 24 10 01       	andl   $0x1,0x10(%esp,1)
+    28e4:	39 d6                	cmp    %edx,%esi
+    28e6:	0f 84 2c 01 00 00    	je     2a18 <pagebuf_delwri_flush+0x19c>
+    28ec:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+		pb = list_entry(curr, page_buf_t, pb_list);
+    28f0:	89 f3                	mov    %esi,%ebx
+
+		/*
+		 * Skip other targets, markers and in progress buffers
+		 */
+
+		if ((pb->pb_flags == 0) || (pb->pb_target != target) ||
+    28f2:	8b 43 08             	mov    0x8(%ebx),%eax
+    28f5:	85 c0                	test   %eax,%eax
+    28f7:	0f 84 01 01 00 00    	je     29fe <pagebuf_delwri_flush+0x182>
+    28fd:	8b 54 24 24          	mov    0x24(%esp,1),%edx
+    2901:	39 53 14             	cmp    %edx,0x14(%ebx)
+    2904:	0f 85 f4 00 00 00    	jne    29fe <pagebuf_delwri_flush+0x182>
+    290a:	a8 40                	test   $0x40,%al
+    290c:	0f 84 ec 00 00 00    	je     29fe <pagebuf_delwri_flush+0x182>
+		    !(pb->pb_flags & PBF_DELWRI)) {
+			continue;
+		}
+
+		PB_TRACE(pb, PB_TRACE_REC(walkq2), pagebuf_ispin(pb));
+		if (pagebuf_ispin(pb)) {
+    2912:	53                   	push   %ebx
+    2913:	e8 fc ff ff ff       	call   2914 <pagebuf_delwri_flush+0x98>
+    2918:	83 c4 04             	add    $0x4,%esp
+    291b:	85 c0                	test   %eax,%eax
+    291d:	75 16                	jne    2935 <pagebuf_delwri_flush+0xb9>
+			pincount++;
+			continue;
+		}
+
+		if (flags & PBDF_TRYLOCK) {
+    291f:	8b 7c 24 28          	mov    0x28(%esp,1),%edi
+    2923:	83 e7 02             	and    $0x2,%edi
+    2926:	74 18                	je     2940 <pagebuf_delwri_flush+0xc4>
+			if (!pagebuf_cond_lock(pb)) {
+    2928:	53                   	push   %ebx
+    2929:	e8 fc ff ff ff       	call   292a <pagebuf_delwri_flush+0xae>
+    292e:	83 c4 04             	add    $0x4,%esp
+    2931:	85 c0                	test   %eax,%eax
+    2933:	75 0b                	jne    2940 <pagebuf_delwri_flush+0xc4>
+				pincount++;
+    2935:	45                   	inc    %ebp
+				continue;
+    2936:	e9 c3 00 00 00       	jmp    29fe <pagebuf_delwri_flush+0x182>
+    293b:	90                   	nop    
+    293c:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+ * the prev/next entries already!
+ */
+static __inline__ void __list_del(struct list_head * prev,
+				  struct list_head * next)
+{
+    2940:	8b 56 04             	mov    0x4(%esi),%edx
+    2943:	8b 06                	mov    (%esi),%eax
+	next->prev = prev;
+    2945:	89 50 04             	mov    %edx,0x4(%eax)
+	prev->next = next;
+    2948:	89 02                	mov    %eax,(%edx)
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is in an undefined state.
+ */
+static __inline__ void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static __inline__ void list_del_init(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	INIT_LIST_HEAD(entry); 
+    294a:	89 36                	mov    %esi,(%esi)
+    294c:	89 76 04             	mov    %esi,0x4(%esi)
+			}
+		}
+
+		list_del_init(&pb->pb_list);
+		if (flags & PBDF_WAIT) {
+    294f:	83 7c 24 10 00       	cmpl   $0x0,0x10(%esp,1)
+    2954:	74 1a                	je     2970 <pagebuf_delwri_flush+0xf4>
+ */
+static __inline__ void __list_add(struct list_head * new,
+	struct list_head * prev,
+	struct list_head * next)
+{
+    2956:	8b 44 24 18          	mov    0x18(%esp,1),%eax
+	next->prev = new;
+    295a:	89 70 04             	mov    %esi,0x4(%eax)
+	new->next = next;
+    295d:	89 06                	mov    %eax,(%esi)
+	new->prev = prev;
+    295f:	8d 4c 24 18          	lea    0x18(%esp,1),%ecx
+    2963:	89 4e 04             	mov    %ecx,0x4(%esi)
+	prev->next = new;
+    2966:	89 74 24 18          	mov    %esi,0x18(%esp,1)
+			list_add(&pb->pb_list, &tmp);
+			pb->pb_flags &= ~PBF_ASYNC;
+    296a:	80 66 08 ef          	andb   $0xef,0x8(%esi)
+    296e:	89 f6                	mov    %esi,%esi
+		:"=q" (oldval), "=m" (lock->lock) \
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+    2970:	8b 15 28 00 00 00    	mov    0x28,%edx
+	char oldval = 1;
+    2976:	b1 01                	mov    $0x1,%cl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+    2978:	81 7a 0c ad 4e ad de 	cmpl   $0xdead4ead,0xc(%edx)
+    297f:	74 08                	je     2989 <pagebuf_delwri_flush+0x10d>
+		BUG();
+    2981:	0f 0b                	ud2a   
+    2983:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+    2989:	8a 42 08             	mov    0x8(%edx),%al
+    298c:	84 c0                	test   %al,%al
+    298e:	7e 08                	jle    2998 <pagebuf_delwri_flush+0x11c>
+		BUG();
+    2990:	0f 0b                	ud2a   
+    2992:	6b 00 00             	imul   $0x0,(%eax),%eax
+    2995:	00 00                	add    %al,(%eax)
+    2997:	00 86 4a 08 85 ff    	add    %al,0xff85084a(%esi)
+		}
+
+		spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+		if ((flags & PBDF_TRYLOCK) == 0) {
+    299d:	75 09                	jne    29a8 <pagebuf_delwri_flush+0x12c>
+			pagebuf_lock(pb);
+    299f:	56                   	push   %esi
+    29a0:	e8 fc ff ff ff       	call   29a1 <pagebuf_delwri_flush+0x125>
+		}
+    29a5:	83 c4 04             	add    $0x4,%esp
+
+		pb->pb_flags &= ~PBF_DELWRI;
+    29a8:	8b 46 08             	mov    0x8(%esi),%eax
+    29ab:	24 bf                	and    $0xbf,%al
+		pb->pb_flags |= PBF_WRITE;
+    29ad:	0c 02                	or     $0x2,%al
+    29af:	89 46 08             	mov    %eax,0x8(%esi)
+extern void pagebuf_terminate(void);
+
+static __inline__ int __pagebuf_iorequest(page_buf_t *pb)
+{
+	if (pb->pb_strat)
+    29b2:	83 7e 54 00          	cmpl   $0x0,0x54(%esi)
+    29b6:	74 08                	je     29c0 <pagebuf_delwri_flush+0x144>
+		return pb->pb_strat(pb);
+    29b8:	56                   	push   %esi
+    29b9:	8b 46 54             	mov    0x54(%esi),%eax
+    29bc:	ff d0                	call   *%eax
+    29be:	eb 06                	jmp    29c6 <pagebuf_delwri_flush+0x14a>
+	return pagebuf_iorequest(pb);
+    29c0:	56                   	push   %esi
+    29c1:	e8 fc ff ff ff       	call   29c2 <pagebuf_delwri_flush+0x146>
+    29c6:	83 c4 04             	add    $0x4,%esp
+	return oldval > 0;
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+    29c9:	8b 1d 28 00 00 00    	mov    0x28,%ebx
+    29cf:	83 c3 08             	add    $0x8,%ebx
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+    29d2:	81 7b 04 ad 4e ad de 	cmpl   $0xdead4ead,0x4(%ebx)
+    29d9:	74 1a                	je     29f5 <pagebuf_delwri_flush+0x179>
+printk("eip: %p\n", &&here);
+    29db:	68 d2 29 00 00       	push   $0x29d2
+    29e0:	68 2b 00 00 00       	push   $0x2b
+    29e5:	e8 fc ff ff ff       	call   29e6 <pagebuf_delwri_flush+0x16a>
+		BUG();
+    29ea:	0f 0b                	ud2a   
+    29ec:	85 00                	test   %eax,(%eax)
+    29ee:	00 00                	add    %al,(%eax)
+    29f0:	00 00                	add    %al,(%eax)
+	}
+    29f2:	83 c4 08             	add    $0x8,%esp
+#endif
+	__asm__ __volatile__(
+    29f5:	f0 fe 0b             	lock decb (%ebx)
+    29f8:	0f 88 c7 04 00 00    	js     2ec5 <Letext+0xfc>
+    29fe:	8b 74 24 14          	mov    0x14(%esp,1),%esi
+    2a02:	8b 06                	mov    (%esi),%eax
+    2a04:	89 44 24 14          	mov    %eax,0x14(%esp,1)
+    2a08:	a1 28 00 00 00       	mov    0x28,%eax
+    2a0d:	83 c0 10             	add    $0x10,%eax
+    2a10:	39 c6                	cmp    %eax,%esi
+    2a12:	0f 85 d8 fe ff ff    	jne    28f0 <pagebuf_delwri_flush+0x74>
+		:"=q" (oldval), "=m" (lock->lock) \
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+    2a18:	8b 15 28 00 00 00    	mov    0x28,%edx
+	char oldval = 1;
+    2a1e:	b1 01                	mov    $0x1,%cl
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+    2a20:	81 7a 0c ad 4e ad de 	cmpl   $0xdead4ead,0xc(%edx)
+    2a27:	74 08                	je     2a31 <pagebuf_delwri_flush+0x1b5>
+		BUG();
+    2a29:	0f 0b                	ud2a   
+    2a2b:	69 00 00 00 00 00    	imul   $0x0,(%eax),%eax
+	if (!spin_is_locked(lock))
+    2a31:	8a 42 08             	mov    0x8(%edx),%al
+    2a34:	84 c0                	test   %al,%al
+    2a36:	7e 08                	jle    2a40 <pagebuf_delwri_flush+0x1c4>
+		BUG();
+    2a38:	0f 0b                	ud2a   
+    2a3a:	6b 00 00             	imul   $0x0,(%eax),%eax
+    2a3d:	00 00                	add    %al,(%eax)
+    2a3f:	00 86 4a 08 81 3d    	add    %al,0x3d81084a(%esi)
+	...
+extern void __run_task_queue(task_queue *list);
+
+static inline void run_task_queue(task_queue *list)
+{
+	if (TQ_ACTIVE(*list))
+    2a4d:	74 0d                	je     2a5c <pagebuf_delwri_flush+0x1e0>
+		__run_task_queue(list);
+    2a4f:	68 00 00 00 00       	push   $0x0
+    2a54:	e8 fc ff ff ff       	call   2a55 <pagebuf_delwri_flush+0x1d9>
+    2a59:	83 c4 04             	add    $0x4,%esp
+
+		__pagebuf_iorequest(pb);
+
+		spin_lock(&pb_daemon->pb_delwrite_lock);
+	}
+
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+	run_task_queue(&tq_disk);
+
+	if (pinptr)
+    2a5c:	83 7c 24 2c 00       	cmpl   $0x0,0x2c(%esp,1)
+    2a61:	74 06                	je     2a69 <pagebuf_delwri_flush+0x1ed>
+		*pinptr = pincount;
+    2a63:	8b 54 24 2c          	mov    0x2c(%esp,1),%edx
+    2a67:	89 2a                	mov    %ebp,(%edx)
+
+	if ((flags & PBDF_WAIT) == 0)
+    2a69:	83 7c 24 10 00       	cmpl   $0x0,0x10(%esp,1)
+    2a6e:	74 53                	je     2ac3 <pagebuf_delwri_flush+0x247>
+		return;
+
+	while (!list_empty(&tmp)) {
+    2a70:	8d 4c 24 18          	lea    0x18(%esp,1),%ecx
+    2a74:	39 4c 24 18          	cmp    %ecx,0x18(%esp,1)
+    2a78:	74 49                	je     2ac3 <pagebuf_delwri_flush+0x247>
+    2a7a:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
+		pb = list_entry(tmp.next, page_buf_t, pb_list);
+    2a80:	8b 5c 24 18          	mov    0x18(%esp,1),%ebx
+ * the prev/next entries already!
+ */
+static __inline__ void __list_del(struct list_head * prev,
+				  struct list_head * next)
+{
+    2a84:	8b 53 04             	mov    0x4(%ebx),%edx
+    2a87:	8b 03                	mov    (%ebx),%eax
+	next->prev = prev;
+    2a89:	89 50 04             	mov    %edx,0x4(%eax)
+	prev->next = next;
+    2a8c:	89 02                	mov    %eax,(%edx)
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this; the entry is in an undefined state.
+ */
+static __inline__ void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static __inline__ void list_del_init(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	INIT_LIST_HEAD(entry); 
+    2a8e:	89 1b                	mov    %ebx,(%ebx)
+    2a90:	89 5b 04             	mov    %ebx,0x4(%ebx)
+
+		list_del_init(&pb->pb_list);
+		pagebuf_iowait(pb);
+    2a93:	53                   	push   %ebx
+    2a94:	e8 fc ff ff ff       	call   2a95 <pagebuf_delwri_flush+0x219>
+		if (!pb->pb_relse)
+    2a99:	83 c4 04             	add    $0x4,%esp
+    2a9c:	83 7b 50 00          	cmpl   $0x0,0x50(%ebx)
+    2aa0:	75 0e                	jne    2ab0 <pagebuf_delwri_flush+0x234>
+			pagebuf_unlock(pb);
+    2aa2:	53                   	push   %ebx
+    2aa3:	e8 fc ff ff ff       	call   2aa4 <pagebuf_delwri_flush+0x228>
+    2aa8:	83 c4 04             	add    $0x4,%esp
+    2aab:	90                   	nop    
+    2aac:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+		pagebuf_rele(pb);
+    2ab0:	53                   	push   %ebx
+    2ab1:	e8 fc ff ff ff       	call   2ab2 <pagebuf_delwri_flush+0x236>
+	}
+    2ab6:	83 c4 04             	add    $0x4,%esp
+    2ab9:	8d 44 24 18          	lea    0x18(%esp,1),%eax
+    2abd:	39 44 24 18          	cmp    %eax,0x18(%esp,1)
+    2ac1:	75 bd                	jne    2a80 <pagebuf_delwri_flush+0x204>
+}
+    2ac3:	5b                   	pop    %ebx
+    2ac4:	5e                   	pop    %esi
+    2ac5:	5f                   	pop    %edi
+    2ac6:	5d                   	pop    %ebp
+    2ac7:	83 c4 10             	add    $0x10,%esp
+    2aca:	c3                   	ret    
+    2acb:	90                   	nop    
+
+0000000000002acc <pagebuf_daemon_start>:
+
+STATIC int
+pagebuf_daemon_start(void)
+{
+	if (!pb_daemon) {
+    2acc:	55                   	push   %ebp
+    2acd:	57                   	push   %edi
+    2ace:	56                   	push   %esi
+    2acf:	53                   	push   %ebx
+    2ad0:	83 3d 28 00 00 00 00 	cmpl   $0x0,0x28
+    2ad7:	0f 85 dd 00 00 00    	jne    2bba <pagebuf_daemon_start+0xee>
+		int		cpu;
+
+		pb_daemon = (pagebuf_daemon_t *)
+    2add:	68 f0 01 00 00       	push   $0x1f0
+    2ae2:	6a 1c                	push   $0x1c
+    2ae4:	e8 fc ff ff ff       	call   2ae5 <pagebuf_daemon_start+0x19>
+    2ae9:	89 c1                	mov    %eax,%ecx
+    2aeb:	89 0d 28 00 00 00    	mov    %ecx,0x28
+				kmalloc(sizeof(pagebuf_daemon_t), GFP_KERNEL);
+		if (!pb_daemon) {
+    2af1:	83 c4 08             	add    $0x8,%esp
+    2af4:	85 c9                	test   %ecx,%ecx
+    2af6:	75 0a                	jne    2b02 <pagebuf_daemon_start+0x36>
+			return -1; /* error */
+    2af8:	b8 ff ff ff ff       	mov    $0xffffffff,%eax
+    2afd:	e9 ba 00 00 00       	jmp    2bbc <pagebuf_daemon_start+0xf0>
+		}
+
+		pb_daemon->active = 1;
+    2b02:	c7 01 01 00 00 00    	movl   $0x1,(%ecx)
+		pb_daemon->io_active = 1;
+    2b08:	c7 41 04 01 00 00 00 	movl   $0x1,0x4(%ecx)
+		pb_daemon->pb_delwri_cnt = 0;
+    2b0f:	c7 41 18 00 00 00 00 	movl   $0x0,0x18(%ecx)
+		pb_daemon->pb_delwrite_lock = SPIN_LOCK_UNLOCKED;
+    2b16:	b8 01 00 00 00       	mov    $0x1,%eax
+    2b1b:	ba ad 4e ad de       	mov    $0xdead4ead,%edx
+    2b20:	89 41 08             	mov    %eax,0x8(%ecx)
+    2b23:	89 51 0c             	mov    %edx,0xc(%ecx)
+
+		INIT_LIST_HEAD(&pb_daemon->pb_delwrite_l);
+    2b26:	8d 41 10             	lea    0x10(%ecx),%eax
+    2b29:	89 41 10             	mov    %eax,0x10(%ecx)
+    2b2c:	89 41 14             	mov    %eax,0x14(%ecx)
+
+		kernel_thread(pagebuf_daemon, (void *)pb_daemon,
+    2b2f:	68 00 07 00 00       	push   $0x700
+    2b34:	51                   	push   %ecx
+    2b35:	68 b8 25 00 00       	push   $0x25b8
+    2b3a:	e8 fc ff ff ff       	call   2b3b <pagebuf_daemon_start+0x6f>
+				CLONE_FS|CLONE_FILES|CLONE_VM);
+		for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+    2b3f:	31 db                	xor    %ebx,%ebx
+    2b41:	83 c4 0c             	add    $0xc,%esp
+    2b44:	3b 1d 00 00 00 00    	cmp    0x0,%ebx
+    2b4a:	7d 6e                	jge    2bba <pagebuf_daemon_start+0xee>
+    2b4c:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+			if (kernel_thread(pagebuf_iodone_daemon,
+    2b50:	68 00 07 00 00       	push   $0x700
+    2b55:	53                   	push   %ebx
+    2b56:	68 68 23 00 00       	push   $0x2368
+    2b5b:	e8 fc ff ff ff       	call   2b5c <pagebuf_daemon_start+0x90>
+    2b60:	83 c4 0c             	add    $0xc,%esp
+    2b63:	85 c0                	test   %eax,%eax
+    2b65:	7d 19                	jge    2b80 <pagebuf_daemon_start+0xb4>
+					(void *)(long) cpu,
+					CLONE_FS|CLONE_FILES|CLONE_VM) < 0) {
+				printk("pagebuf_daemon_start failed\n");
+    2b67:	68 68 00 00 00       	push   $0x68
+    2b6c:	e8 fc ff ff ff       	call   2b6d <pagebuf_daemon_start+0xa1>
+			} else {
+    2b71:	83 c4 04             	add    $0x4,%esp
+    2b74:	8d 7b 01             	lea    0x1(%ebx),%edi
+    2b77:	eb 37                	jmp    2bb0 <pagebuf_daemon_start+0xe4>
+    2b79:	8d b4 26 00 00 00 00 	lea    0x0(%esi,1),%esi
+				while (!pb_daemons[cpu_logical_map(cpu)]) {
+    2b80:	8d 04 9d 00 00 00 00 	lea    0x0(,%ebx,4),%eax
+    2b87:	89 c6                	mov    %eax,%esi
+    2b89:	bd 00 06 00 00       	mov    $0x600,%ebp
+    2b8e:	8d 7b 01             	lea    0x1(%ebx),%edi
+    2b91:	83 be 00 06 00 00 00 	cmpl   $0x0,0x600(%esi)
+    2b98:	75 16                	jne    2bb0 <pagebuf_daemon_start+0xe4>
+    2b9a:	bb 00 e0 ff ff       	mov    $0xffffe000,%ebx
+    2b9f:	21 e3                	and    %esp,%ebx
+					current->policy |= SCHED_YIELD;
+    2ba1:	80 4b 28 10          	orb    $0x10,0x28(%ebx)
+					schedule();
+    2ba5:	e8 fc ff ff ff       	call   2ba6 <pagebuf_daemon_start+0xda>
+    2baa:	83 3c 2e 00          	cmpl   $0x0,(%esi,%ebp,1)
+    2bae:	74 f1                	je     2ba1 <pagebuf_daemon_start+0xd5>
+    2bb0:	89 fb                	mov    %edi,%ebx
+    2bb2:	3b 1d 00 00 00 00    	cmp    0x0,%ebx
+    2bb8:	7c 96                	jl     2b50 <pagebuf_daemon_start+0x84>
+				}
+			}
+		}
+	}
+	return 0;
+    2bba:	31 c0                	xor    %eax,%eax
+    2bbc:	5b                   	pop    %ebx
+    2bbd:	5e                   	pop    %esi
+    2bbe:	5f                   	pop    %edi
+    2bbf:	5d                   	pop    %ebp
+    2bc0:	c3                   	ret    
+}
+    2bc1:	8d 76 00             	lea    0x0(%esi),%esi
+
+0000000000002bc4 <pagebuf_daemon_stop>:
+
+/*
+ * pagebuf_daemon_stop
+ * 
+ * Note: do not mark as __exit, it is called from pagebuf_terminate.
+ */
+STATIC void
+pagebuf_daemon_stop(void)
+{
+	if (pb_daemon) {
+    2bc4:	a1 28 00 00 00       	mov    0x28,%eax
+    2bc9:	55                   	push   %ebp
+    2bca:	57                   	push   %edi
+    2bcb:	56                   	push   %esi
+    2bcc:	53                   	push   %ebx
+    2bcd:	85 c0                	test   %eax,%eax
+    2bcf:	0f 84 ac 00 00 00    	je     2c81 <pagebuf_daemon_stop+0xbd>
+		int		cpu;
+
+		pb_daemon->active = 0;
+    2bd5:	c7 00 00 00 00 00    	movl   $0x0,(%eax)
+		pb_daemon->io_active = 0;
+    2bdb:	c7 40 04 00 00 00 00 	movl   $0x0,0x4(%eax)
+
+		wake_up_interruptible(&pbd_waitq);
+    2be2:	b9 01 00 00 00       	mov    $0x1,%ecx
+    2be7:	ba 01 00 00 00       	mov    $0x1,%edx
+    2bec:	b8 00 00 00 00       	mov    $0x0,%eax
+    2bf1:	e8 fc ff ff ff       	call   2bf2 <pagebuf_daemon_stop+0x2e>
+		while (pb_daemon->active == 0) {
+    2bf6:	eb 0a                	jmp    2c02 <pagebuf_daemon_stop+0x3e>
+			interruptible_sleep_on(&pbd_waitq);
+    2bf8:	b8 00 00 00 00       	mov    $0x0,%eax
+    2bfd:	e8 fc ff ff ff       	call   2bfe <pagebuf_daemon_stop+0x3a>
+		}
+    2c02:	a1 28 00 00 00       	mov    0x28,%eax
+    2c07:	83 38 00             	cmpl   $0x0,(%eax)
+    2c0a:	74 ec                	je     2bf8 <pagebuf_daemon_stop+0x34>
+		for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+    2c0c:	31 ff                	xor    %edi,%edi
+    2c0e:	3b 3d 00 00 00 00    	cmp    0x0,%edi
+    2c14:	7d 53                	jge    2c69 <pagebuf_daemon_stop+0xa5>
+    2c16:	bd 00 06 00 00       	mov    $0x600,%ebp
+    2c1b:	90                   	nop    
+    2c1c:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+			pb_daemons[cpu_logical_map(cpu)] = 0;
+    2c20:	8d 34 bd 00 00 00 00 	lea    0x0(,%edi,4),%esi
+    2c27:	c7 04 2e 00 00 00 00 	movl   $0x0,(%esi,%ebp,1)
+			wake_up(&pagebuf_iodone_wait[cpu_logical_map(cpu)]);
+    2c2e:	89 fb                	mov    %edi,%ebx
+    2c30:	c1 e3 04             	shl    $0x4,%ebx
+    2c33:	8d 83 40 01 00 00    	lea    0x140(%ebx),%eax
+    2c39:	b9 01 00 00 00       	mov    $0x1,%ecx
+    2c3e:	ba 03 00 00 00       	mov    $0x3,%edx
+    2c43:	e8 fc ff ff ff       	call   2c44 <pagebuf_daemon_stop+0x80>
+			while (pb_daemons[cpu_logical_map(cpu)] != -1) {
+    2c48:	47                   	inc    %edi
+    2c49:	83 3c 2e ff          	cmpl   $0xffffffff,(%esi,%ebp,1)
+    2c4d:	74 12                	je     2c61 <pagebuf_daemon_stop+0x9d>
+    2c4f:	90                   	nop    
+				interruptible_sleep_on(
+				    &pagebuf_iodone_wait[cpu_logical_map(cpu)]);
+    2c50:	8d 83 40 01 00 00    	lea    0x140(%ebx),%eax
+    2c56:	e8 fc ff ff ff       	call   2c57 <pagebuf_daemon_stop+0x93>
+    2c5b:	83 3c 2e ff          	cmpl   $0xffffffff,(%esi,%ebp,1)
+    2c5f:	75 ef                	jne    2c50 <pagebuf_daemon_stop+0x8c>
+    2c61:	3b 3d 00 00 00 00    	cmp    0x0,%edi
+    2c67:	7c b7                	jl     2c20 <pagebuf_daemon_stop+0x5c>
+			}
+		}
+
+		kfree(pb_daemon);
+    2c69:	a1 28 00 00 00       	mov    0x28,%eax
+    2c6e:	50                   	push   %eax
+    2c6f:	e8 fc ff ff ff       	call   2c70 <pagebuf_daemon_stop+0xac>
+		pb_daemon = NULL;
+    2c74:	c7 05 28 00 00 00 00 	movl   $0x0,0x28
+    2c7b:	00 00 00 
+	}
+    2c7e:	83 c4 04             	add    $0x4,%esp
+    2c81:	5b                   	pop    %ebx
+    2c82:	5e                   	pop    %esi
+    2c83:	5f                   	pop    %edi
+    2c84:	5d                   	pop    %ebp
+    2c85:	c3                   	ret    
+}
+    2c86:	89 f6                	mov    %esi,%esi
+
+0000000000002c88 <pb_stats_clear_handler>:
+
+
+/*
+ * Pagebuf sysctl interface
+ */
+
+STATIC int
+pb_stats_clear_handler(
+	ctl_table		*ctl,
+	int			write,
+	struct file		*filp,
+	void			*buffer,
+	size_t			*lenp)
+{
+    2c88:	57                   	push   %edi
+    2c89:	56                   	push   %esi
+    2c8a:	53                   	push   %ebx
+    2c8b:	8b 5c 24 10          	mov    0x10(%esp,1),%ebx
+    2c8f:	8b 74 24 14          	mov    0x14(%esp,1),%esi
+    2c93:	8b 4c 24 18          	mov    0x18(%esp,1),%ecx
+    2c97:	8b 54 24 1c          	mov    0x1c(%esp,1),%edx
+    2c9b:	8b 44 24 20          	mov    0x20(%esp,1),%eax
+	int			ret;
+	int			*valp = ctl->data;
+    2c9f:	8b 7b 08             	mov    0x8(%ebx),%edi
+
+	ret = proc_doulongvec_minmax(ctl, write, filp, buffer, lenp);
+    2ca2:	50                   	push   %eax
+    2ca3:	52                   	push   %edx
+    2ca4:	51                   	push   %ecx
+    2ca5:	56                   	push   %esi
+    2ca6:	53                   	push   %ebx
+    2ca7:	e8 fc ff ff ff       	call   2ca8 <pb_stats_clear_handler+0x20>
+    2cac:	89 c3                	mov    %eax,%ebx
+
+	if (!ret && write && *valp) {
+    2cae:	83 c4 14             	add    $0x14,%esp
+    2cb1:	85 db                	test   %ebx,%ebx
+    2cb3:	75 2e                	jne    2ce3 <pb_stats_clear_handler+0x5b>
+    2cb5:	85 f6                	test   %esi,%esi
+    2cb7:	74 2a                	je     2ce3 <pb_stats_clear_handler+0x5b>
+    2cb9:	83 3f 00             	cmpl   $0x0,(%edi)
+    2cbc:	74 25                	je     2ce3 <pb_stats_clear_handler+0x5b>
+		printk("XFS Clearing pbstats\n");
+    2cbe:	68 85 00 00 00       	push   $0x85
+    2cc3:	e8 fc ff ff ff       	call   2cc4 <pb_stats_clear_handler+0x3c>
+		memset(&pbstats, 0, sizeof(pbstats));
+    2cc8:	83 c4 04             	add    $0x4,%esp
+ * This looks horribly ugly, but the compiler can optimize it totally,
+ * as we by now know that both pattern and count are constant.
+ */
+static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
+{
+    2ccb:	bf 00 00 00 00       	mov    $0x0,%edi
+	switch (count) {
+		case 0:
+			return s;
+		case 1:
+			*(unsigned char *)s = pattern;
+			return s;
+		case 2:
+			*(unsigned short *)s = pattern;
+			return s;
+		case 3:
+			*(unsigned short *)s = pattern;
+			*(2+(unsigned char *)s) = pattern;
+			return s;
+		case 4:
+			*(unsigned long *)s = pattern;
+			return s;
+	}
+#define COMMON(x) \
+__asm__  __volatile__( \
+	"rep ; stosl" \
+	x \
+	: "=&c" (d0), "=&D" (d1) \
+	: "a" (pattern),"0" (count/4),"1" ((long) s) \
+	: "memory")
+{
+	int d0, d1;
+	switch (count % 4) {
+		case 0: COMMON(""); return s;
+    2cd0:	b9 09 00 00 00       	mov    $0x9,%ecx
+    2cd5:	89 d8                	mov    %ebx,%eax
+    2cd7:	f3 ab                	repz stos %eax,%es:(%edi)
+		pb_params.p_un.stats_clear = 0;
+    2cd9:	c7 05 0c 00 00 00 00 	movl   $0x0,0xc
+    2ce0:	00 00 00 
+	}
+
+	return ret;
+    2ce3:	89 d8                	mov    %ebx,%eax
+    2ce5:	5b                   	pop    %ebx
+    2ce6:	5e                   	pop    %esi
+    2ce7:	5f                   	pop    %edi
+    2ce8:	c3                   	ret    
+}
+    2ce9:	8d 76 00             	lea    0x0(%esi),%esi
+
+0000000000002cec <pagebuf_readstats>:
+
+STATIC struct ctl_table_header *pagebuf_table_header;
+
+STATIC ctl_table pagebuf_table[] = {
+	{PB_FLUSH_INT, "flush_int", &pb_params.data[0],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[0], &pagebuf_max[0]},
+
+	{PB_FLUSH_AGE, "flush_age", &pb_params.data[1],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[1], &pagebuf_max[1]},
+
+	{PB_STATS_CLEAR, "stats_clear", &pb_params.data[3],
+	sizeof(ulong), 0644, NULL, &pb_stats_clear_handler,
+	&sysctl_intvec, NULL, &pagebuf_min[3], &pagebuf_max[3]},
+
+#ifdef PAGEBUF_TRACE
+	{PB_DEBUG, "debug", &pb_params.data[4],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[4], &pagebuf_max[4]},
+#endif
+	{0}
+};
+
+STATIC ctl_table pagebuf_dir_table[] = {
+	{VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table},
+	{0}
+};
+
+STATIC ctl_table pagebuf_root_table[] = {
+	{CTL_VM, "vm",	NULL, 0, 0555, pagebuf_dir_table},
+	{0}
+};
+
+#ifdef CONFIG_PROC_FS
+STATIC int
+pagebuf_readstats(
+	char			*buffer,
+	char			**start,
+	off_t			offset,
+	int			count,
+	int			*eof,
+	void			*data)
+{
+    2cec:	55                   	push   %ebp
+    2ced:	57                   	push   %edi
+    2cee:	56                   	push   %esi
+    2cef:	53                   	push   %ebx
+    2cf0:	8b 7c 24 14          	mov    0x14(%esp,1),%edi
+    2cf4:	8b 6c 24 1c          	mov    0x1c(%esp,1),%ebp
+	int			i, len;
+
+	len = 0;
+	len += sprintf(buffer + len, "pagebuf");
+    2cf8:	68 bb 00 00 00       	push   $0xbb
+    2cfd:	57                   	push   %edi
+    2cfe:	e8 fc ff ff ff       	call   2cff <pagebuf_readstats+0x13>
+    2d03:	89 c3                	mov    %eax,%ebx
+	for (i = 0; i < sizeof(pbstats) / sizeof(u_int32_t); i++) {
+    2d05:	31 f6                	xor    %esi,%esi
+    2d07:	83 c4 08             	add    $0x8,%esp
+    2d0a:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
+		len += sprintf(buffer + len, " %u",
+    2d10:	8b 04 b5 00 00 00 00 	mov    0x0(,%esi,4),%eax
+    2d17:	50                   	push   %eax
+    2d18:	68 c6 00 00 00       	push   $0xc6
+    2d1d:	8d 04 3b             	lea    (%ebx,%edi,1),%eax
+    2d20:	50                   	push   %eax
+    2d21:	e8 fc ff ff ff       	call   2d22 <pagebuf_readstats+0x36>
+    2d26:	01 c3                	add    %eax,%ebx
+    2d28:	83 c4 0c             	add    $0xc,%esp
+    2d2b:	46                   	inc    %esi
+    2d2c:	83 fe 08             	cmp    $0x8,%esi
+    2d2f:	76 df                	jbe    2d10 <pagebuf_readstats+0x24>
+			*(((u_int32_t*)&pbstats) + i));
+	}
+	buffer[len++] = '\n';
+    2d31:	c6 04 3b 0a          	movb   $0xa,(%ebx,%edi,1)
+    2d35:	43                   	inc    %ebx
+
+	if (offset >= len) {
+    2d36:	39 dd                	cmp    %ebx,%ebp
+    2d38:	7c 16                	jl     2d50 <pagebuf_readstats+0x64>
+		*start = buffer;
+    2d3a:	8b 44 24 18          	mov    0x18(%esp,1),%eax
+    2d3e:	89 38                	mov    %edi,(%eax)
+		*eof = 1;
+    2d40:	8b 44 24 24          	mov    0x24(%esp,1),%eax
+    2d44:	c7 00 01 00 00 00    	movl   $0x1,(%eax)
+		return 0;
+    2d4a:	31 c0                	xor    %eax,%eax
+    2d4c:	eb 26                	jmp    2d74 <pagebuf_readstats+0x88>
+    2d4e:	89 f6                	mov    %esi,%esi
+	}
+	*start = buffer + offset;
+    2d50:	8b 44 24 18          	mov    0x18(%esp,1),%eax
+    2d54:	01 ef                	add    %ebp,%edi
+    2d56:	89 38                	mov    %edi,(%eax)
+	if ((len -= offset) > count)
+    2d58:	29 eb                	sub    %ebp,%ebx
+    2d5a:	3b 5c 24 20          	cmp    0x20(%esp,1),%ebx
+    2d5e:	7f 10                	jg     2d70 <pagebuf_readstats+0x84>
+		return count;
+	*eof = 1;
+    2d60:	8b 44 24 24          	mov    0x24(%esp,1),%eax
+    2d64:	c7 00 01 00 00 00    	movl   $0x1,(%eax)
+
+	return len;
+    2d6a:	89 d8                	mov    %ebx,%eax
+    2d6c:	eb 06                	jmp    2d74 <pagebuf_readstats+0x88>
+    2d6e:	89 f6                	mov    %esi,%esi
+    2d70:	8b 44 24 20          	mov    0x20(%esp,1),%eax
+    2d74:	5b                   	pop    %ebx
+    2d75:	5e                   	pop    %esi
+    2d76:	5f                   	pop    %edi
+    2d77:	5d                   	pop    %ebp
+    2d78:	c3                   	ret    
+}
+    2d79:	8d 76 00             	lea    0x0(%esi),%esi
+
+0000000000002d7c <pagebuf_shaker>:
+#endif	/* CONFIG_PROC_FS */
+
+STATIC void
+pagebuf_shaker(void)
+{
+	pagebuf_daemon_wakeup(1);
+    2d7c:	6a 01                	push   $0x1
+    2d7e:	e8 09 f8 ff ff       	call   258c <pagebuf_daemon_wakeup>
+}
+    2d83:	83 c4 04             	add    $0x4,%esp
+    2d86:	c3                   	ret    
+    2d87:	90                   	nop    
+
+0000000000002d88 <pagebuf_terminate>:
+
+
+/*
+ *	Initialization and Termination
+ */
+
+int __init
+pagebuf_init(void)
+{
+	int			i;
+
+	pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1);
+
+#ifdef CONFIG_PROC_FS
+	if (proc_mkdir("fs/pagebuf", 0))
+		create_proc_read_entry(
+			"fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL);
+#endif
+
+	pagebuf_cache = kmem_cache_create("page_buf_t",
+			sizeof(page_buf_private_t), 0,
+			SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (pagebuf_cache == NULL) {
+		printk("pagebuf: couldn't init pagebuf cache\n");
+		pagebuf_terminate();
+		return -ENOMEM;
+	}
+
+	if (_pagebuf_prealloc_bh(NR_RESERVED_BH) < NR_RESERVED_BH) {
+		printk("pagebuf: couldn't pre-allocate %d buffer heads\n",
+			NR_RESERVED_BH);
+		pagebuf_terminate();
+		return -ENOMEM;
+	}
+
+	init_waitqueue_head(&pb_resv_bh_wait);
+
+	for (i = 0; i < NHASH; i++) {
+		spin_lock_init(&pbhash[i].pb_hash_lock);
+		INIT_LIST_HEAD(&pbhash[i].pb_hash);
+	}
+
+#ifdef PAGEBUF_TRACE
+# if 1
+	pb_trace.buf = (pagebuf_trace_t *)kmalloc(
+			PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t), GFP_KERNEL);
+# else
+	/* Alternatively, for really really long trace bufs */
+	pb_trace.buf = (pagebuf_trace_t *)vmalloc(
+			PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t));
+# endif
+	memset(pb_trace.buf, 0, PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t));
+	pb_trace.start = 0;
+	pb_trace.end = PB_TRACE_BUFSIZE - 1;
+#endif
+
+	pagebuf_daemon_start();
+	kmem_shake_register(pagebuf_shaker);
+	return 0;
+}
+
+/*
+ *	pagebuf_terminate. 
+ * 
+ *	Note: do not mark as __exit, this is also called from the __init code.
+ */
+void
+pagebuf_terminate(void)
+{
+	pagebuf_daemon_stop();
+    2d88:	e8 37 fe ff ff       	call   2bc4 <pagebuf_daemon_stop>
+
+	kmem_cache_destroy(pagebuf_cache);
+    2d8d:	a1 24 00 00 00       	mov    0x24,%eax
+    2d92:	50                   	push   %eax
+    2d93:	e8 fc ff ff ff       	call   2d94 <pagebuf_terminate+0xc>
+	kmem_shake_deregister(pagebuf_shaker);
+    2d98:	68 7c 2d 00 00       	push   $0x2d7c
+    2d9d:	e8 fc ff ff ff       	call   2d9e <pagebuf_terminate+0x16>
+
+	unregister_sysctl_table(pagebuf_table_header);
+    2da2:	a1 84 06 00 00       	mov    0x684,%eax
+    2da7:	50                   	push   %eax
+    2da8:	e8 fc ff ff ff       	call   2da9 <pagebuf_terminate+0x21>
+#ifdef	CONFIG_PROC_FS
+	remove_proc_entry("fs/pagebuf/stat", NULL);
+    2dad:	6a 00                	push   $0x0
+    2daf:	68 d5 00 00 00       	push   $0xd5
+    2db4:	e8 fc ff ff ff       	call   2db5 <pagebuf_terminate+0x2d>
+	remove_proc_entry("fs/pagebuf", NULL);
+    2db9:	6a 00                	push   $0x0
+    2dbb:	68 ca 00 00 00       	push   $0xca
+    2dc0:	e8 fc ff ff ff       	call   2dc1 <pagebuf_terminate+0x39>
+#endif
+}
+    2dc5:	83 c4 1c             	add    $0x1c,%esp
+    2dc8:	c3                   	ret    
+
+0000000000002dc9 <Letext>:
+    2dc9:	80 3d 40 00 00 00 00 	cmpb   $0x0,0x40
+    2dd0:	f3 90                	repz nop 
+    2dd2:	7e f5                	jle    2dc9 <Letext>
+    2dd4:	e9 95 d2 ff ff       	jmp    6e <free_address+0x26>
+    2dd9:	80 3d 40 00 00 00 00 	cmpb   $0x0,0x40
+    2de0:	f3 90                	repz nop 
+    2de2:	7e f5                	jle    2dd9 <Letext+0x10>
+    2de4:	e9 20 d3 ff ff       	jmp    109 <purge_addresses+0x35>
+    2de9:	80 3b 00             	cmpb   $0x0,(%ebx)
+    2dec:	f3 90                	repz nop 
+    2dee:	7e f9                	jle    2de9 <Letext+0x20>
+    2df0:	e9 25 da ff ff       	jmp    81a <_pagebuf_get_prealloc_bh+0x5a>
+    2df5:	80 3b 00             	cmpb   $0x0,(%ebx)
+    2df8:	f3 90                	repz nop 
+    2dfa:	7e f9                	jle    2df5 <Letext+0x2c>
+    2dfc:	e9 c8 da ff ff       	jmp    8c9 <_pagebuf_get_prealloc_bh+0x109>
+    2e01:	80 3b 00             	cmpb   $0x0,(%ebx)
+    2e04:	f3 90                	repz nop 
+    2e06:	7e f9                	jle    2e01 <Letext+0x38>
+    2e08:	e9 c2 db ff ff       	jmp    9cf <_pagebuf_find+0x6b>
+    2e0d:	e8 fc ff ff ff       	call   2e0e <Letext+0x45>
+    2e12:	e9 eb dc ff ff       	jmp    b02 <_pagebuf_find+0x19e>
+    2e17:	e8 fc ff ff ff       	call   2e18 <Letext+0x4f>
+    2e1c:	e9 96 e1 ff ff       	jmp    fb7 <pagebuf_get_no_daddr+0xc3>
+    2e21:	80 3b 00             	cmpb   $0x0,(%ebx)
+    2e24:	f3 90                	repz nop 
+    2e26:	7e f9                	jle    2e21 <Letext+0x58>
+    2e28:	e9 eb e1 ff ff       	jmp    1018 <pagebuf_free+0x44>
+    2e2d:	80 3f 00             	cmpb   $0x0,(%edi)
+    2e30:	f3 90                	repz nop 
+    2e32:	7e f9                	jle    2e2d <Letext+0x64>
+    2e34:	e9 3f e2 ff ff       	jmp    1078 <pagebuf_rele+0x44>
+    2e39:	80 3e 00             	cmpb   $0x0,(%esi)
+    2e3c:	f3 90                	repz nop 
+    2e3e:	7e f9                	jle    2e39 <Letext+0x70>
+    2e40:	e9 e1 e3 ff ff       	jmp    1226 <pagebuf_queue_task+0x56>
+    2e45:	80 3f 00             	cmpb   $0x0,(%edi)
+    2e48:	f3 90                	repz nop 
+    2e4a:	7e f9                	jle    2e45 <Letext+0x7c>
+    2e4c:	e9 19 e5 ff ff       	jmp    136a <pagebuf_iodone+0x9a>
+    2e51:	e8 fc ff ff ff       	call   2e52 <Letext+0x89>
+    2e56:	e9 86 e5 ff ff       	jmp    13e1 <pagebuf_iodone+0x111>
+    2e5b:	80 3e 00             	cmpb   $0x0,(%esi)
+    2e5e:	f3 90                	repz nop 
+    2e60:	7e f9                	jle    2e5b <Letext+0x92>
+    2e62:	e9 b7 e6 ff ff       	jmp    151e <_end_pagebuf_page_io+0x9e>
+    2e67:	80 3e 00             	cmpb   $0x0,(%esi)
+    2e6a:	f3 90                	repz nop 
+    2e6c:	7e f9                	jle    2e67 <Letext+0x9e>
+    2e6e:	e9 4b e8 ff ff       	jmp    16be <_end_pagebuf_page_io_multi+0xba>
+    2e73:	e8 fc ff ff ff       	call   2e74 <Letext+0xab>
+    2e78:	e9 70 f0 ff ff       	jmp    1eed <pagebuf_iowait+0x2d>
+    2e7d:	80 3e 00             	cmpb   $0x0,(%esi)
+    2e80:	f3 90                	repz nop 
+    2e82:	7e f9                	jle    2e7d <Letext+0xb4>
+    2e84:	e9 ca f3 ff ff       	jmp    2253 <pagebuf_delwri_queue+0x37>
+    2e89:	80 3e 00             	cmpb   $0x0,(%esi)
+    2e8c:	f3 90                	repz nop 
+    2e8e:	7e f9                	jle    2e89 <Letext+0xc0>
+    2e90:	e9 7e f4 ff ff       	jmp    2313 <pagebuf_delwri_dequeue+0x33>
+    2e95:	80 3b 00             	cmpb   $0x0,(%ebx)
+    2e98:	f3 90                	repz nop 
+    2e9a:	7e f9                	jle    2e95 <Letext+0xcc>
+    2e9c:	e9 48 f5 ff ff       	jmp    23e9 <pagebuf_iodone_daemon+0x81>
+    2ea1:	80 3b 00             	cmpb   $0x0,(%ebx)
+    2ea4:	f3 90                	repz nop 
+    2ea6:	7e f9                	jle    2ea1 <Letext+0xd8>
+    2ea8:	e9 5c f7 ff ff       	jmp    2609 <pagebuf_daemon+0x51>
+    2ead:	80 3b 00             	cmpb   $0x0,(%ebx)
+    2eb0:	f3 90                	repz nop 
+    2eb2:	7e f9                	jle    2ead <Letext+0xe4>
+    2eb4:	e9 48 f8 ff ff       	jmp    2701 <pagebuf_daemon+0x149>
+    2eb9:	80 3b 00             	cmpb   $0x0,(%ebx)
+    2ebc:	f3 90                	repz nop 
+    2ebe:	7e f9                	jle    2eb9 <Letext+0xf0>
+    2ec0:	e9 ec f9 ff ff       	jmp    28b1 <pagebuf_delwri_flush+0x35>
+    2ec5:	80 3b 00             	cmpb   $0x0,(%ebx)
+    2ec8:	f3 90                	repz nop 
+    2eca:	7e f9                	jle    2ec5 <Letext+0xfc>
+    2ecc:	e9 24 fb ff ff       	jmp    29f5 <pagebuf_delwri_flush+0x179>
+Disassembly of section .text.init:
+
+0000000000000000 <pagebuf_init>:
+   0:	55                   	push   %ebp
+   1:	57                   	push   %edi
+   2:	56                   	push   %esi
+   3:	53                   	push   %ebx
+   4:	6a 01                	push   $0x1
+   6:	68 60 01 00 00       	push   $0x160
+   b:	e8 fc ff ff ff       	call   c <pagebuf_init+0xc>
+  10:	a3 84 06 00 00       	mov    %eax,0x684
+  15:	6a 00                	push   $0x0
+  17:	68 ca 00 00 00       	push   $0xca
+  1c:	e8 fc ff ff ff       	call   1d <pagebuf_init+0x1d>
+  21:	83 c4 10             	add    $0x10,%esp
+  24:	85 c0                	test   %eax,%eax
+  26:	74 23                	je     4b <pagebuf_init+0x4b>
+  28:	6a 00                	push   $0x0
+  2a:	6a 00                	push   $0x0
+  2c:	68 d5 00 00 00       	push   $0xd5
+  31:	e8 fc ff ff ff       	call   32 <pagebuf_init+0x32>
+  36:	83 c4 0c             	add    $0xc,%esp
+  39:	85 c0                	test   %eax,%eax
+  3b:	74 0e                	je     4b <pagebuf_init+0x4b>
+  3d:	c7 40 38 ec 2c 00 00 	movl   $0x2cec,0x38(%eax)
+  44:	c7 40 34 00 00 00 00 	movl   $0x0,0x34(%eax)
+{
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+  4b:	6a 00                	push   $0x0
+  4d:	6a 00                	push   $0x0
+  4f:	68 00 20 00 00       	push   $0x2000
+printk("eip: %p\n", &&here);
+  54:	6a 00                	push   $0x0
+  56:	68 cc 00 00 00       	push   $0xcc
+  5b:	68 e5 00 00 00       	push   $0xe5
+  60:	e8 fc ff ff ff       	call   61 <pagebuf_init+0x61>
+		BUG();
+  65:	a3 24 00 00 00       	mov    %eax,0x24
+  6a:	83 c4 18             	add    $0x18,%esp
+	}
+  6d:	85 c0                	test   %eax,%eax
+#endif
+	__asm__ __volatile__(
+  6f:	75 1f                	jne    90 <pagebuf_init+0x90>
+  71:	68 00 01 00 00       	push   $0x100
+  76:	e8 fc ff ff ff       	call   77 <pagebuf_init+0x77>
+  7b:	e8 fc ff ff ff       	call   7c <pagebuf_init+0x7c>
+  80:	b8 f4 ff ff ff       	mov    $0xfffffff4,%eax
+  85:	e9 a9 00 00 00       	jmp    133 <pagebuf_init+0x133>
+  8a:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
+  90:	6a 40                	push   $0x40
+  92:	e8 6c 07 00 00       	call   803 <_pagebuf_get_prealloc_bh+0x43>
+  97:	83 c4 04             	add    $0x4,%esp
+  9a:	83 f8 3f             	cmp    $0x3f,%eax
+  9d:	7f 21                	jg     c0 <pagebuf_init+0xc0>
+  9f:	6a 40                	push   $0x40
+		:"=q" (oldval), "=m" (lock->lock) \
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+  a1:	68 40 01 00 00       	push   $0x140
+	char oldval = 1;
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+  a6:	e8 fc ff ff ff       	call   a7 <pagebuf_init+0xa7>
+  ab:	e8 fc ff ff ff       	call   ac <pagebuf_init+0xac>
+  b0:	b8 f4 ff ff ff       	mov    $0xfffffff4,%eax
+		BUG();
+  b5:	83 c4 08             	add    $0x8,%esp
+  b8:	eb 7c                	jmp    136 <pagebuf_init+0x136>
+	if (!spin_is_locked(lock))
+  ba:	8d b6 00 00 00 00    	lea    0x0(%esi),%esi
+  c0:	b8 01 00 00 00       	mov    $0x1,%eax
+		BUG();
+  c5:	ba ad 4e ad de       	mov    $0xdead4ead,%edx
+#endif
+	__asm__ __volatile__(
+  ca:	a3 40 03 00 00       	mov    %eax,0x340
+  cf:	89 15 44 03 00 00    	mov    %edx,0x344
+  d5:	c7 05 48 03 00 00 48 	movl   $0x348,0x348
+  dc:	03 00 00 
+  df:	c7 05 4c 03 00 00 48 	movl   $0x348,0x34c
+  e6:	03 00 00 
+{
+#if SPINLOCK_DEBUG
+	__label__ here;
+here:
+	if (lock->magic != SPINLOCK_MAGIC) {
+  e9:	bd 6c 03 00 00       	mov    $0x36c,%ebp
+  ee:	bb 60 03 00 00       	mov    $0x360,%ebx
+printk("eip: %p\n", &&here);
+  f3:	89 df                	mov    %ebx,%edi
+  f5:	31 c9                	xor    %ecx,%ecx
+  f7:	be 1f 00 00 00       	mov    $0x1f,%esi
+  fc:	8d 74 26 00          	lea    0x0(%esi,1),%esi
+		BUG();
+ 100:	b8 01 00 00 00       	mov    $0x1,%eax
+ 105:	ba ad 4e ad de       	mov    $0xdead4ead,%edx
+	}
+#endif
+	__asm__ __volatile__(
+ 10a:	89 44 0d 00          	mov    %eax,0x0(%ebp,%ecx,1)
+ 10e:	89 54 0d 04          	mov    %edx,0x4(%ebp,%ecx,1)
+ 112:	89 1c 0f             	mov    %ebx,(%edi,%ecx,1)
+ 115:	89 5c 0f 04          	mov    %ebx,0x4(%edi,%ecx,1)
+ 119:	83 c3 14             	add    $0x14,%ebx
+ 11c:	83 c1 14             	add    $0x14,%ecx
+ 11f:	4e                   	dec    %esi
+ 120:	79 de                	jns    100 <pagebuf_init+0x100>
+ 122:	e8 c8 2a 00 00       	call   2bef <pagebuf_daemon_stop+0x2b>
+ 127:	68 7c 2d 00 00       	push   $0x2d7c
+ 12c:	e8 fc ff ff ff       	call   12d <pagebuf_init+0x12d>
+		:"0" (oldval) : "memory"
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+	char oldval = 1;
+ 131:	31 c0                	xor    %eax,%eax
+#if SPINLOCK_DEBUG
+	if (lock->magic != SPINLOCK_MAGIC)
+ 133:	83 c4 04             	add    $0x4,%esp
+ 136:	5b                   	pop    %ebx
+ 137:	5e                   	pop    %esi
+ 138:	5f                   	pop    %edi
+ 139:	5d                   	pop    %ebp
+ 13a:	c3                   	ret    
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf_inline.h linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf_inline.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf_inline.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf_inline.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "page_buf.h"
+
+/*
+ *	pagebuf_cond_lock
+ *
+ *	pagebuf_cond_lock locks a buffer object, if it is not already locked.
+ *	Note that this in no way locks the underlying pages, so it is only
+ *	useful for synchronizing concurrent use of page buffer objects, not
+ *	for synchronizing independent access to the underlying pages.
+ */
+static __inline int
+pagebuf_cond_lock(
+	page_buf_t		*pb)
+{
+	int			locked;
+
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+
+	locked = down_trylock(&PBP(pb)->pb_sema) == 0;
+	if (locked) {
+		PB_SET_OWNER(pb);
+		PB_TRACE(pb, PB_TRACE_REC(condlck), locked);
+		return 0;
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(condlck), 0);
+	return -EBUSY;
+}
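+
+/*
+ * Usage sketch (hypothetical caller, not part of this patch): a scan
+ * that skips buffers it cannot lock immediately instead of blocking:
+ *
+ *	if (pagebuf_cond_lock(pb) == -EBUSY)
+ *		continue;	(held elsewhere, try the next buffer)
+ *	... work on the locked buffer ...
+ *	pagebuf_unlock(pb);
+ */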
+
+/*
+ *	pagebuf_is_locked
+ *
+ *	pagebuf_is_locked tests if the buffer is locked, returning 1 if
+ *	locked and 0 if not.  This routine is useful only for assertions that
+ *	the buffer is locked, since the state could change at any time
+ *	if the buffer is not locked.
+ */
+static __inline int
+pagebuf_is_locked(
+	page_buf_t		*pb)
+{
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+
+	return (atomic_read(&PBP(pb)->pb_sema.count) <= 0);
+}
+
+/*
+ *	pagebuf_lock_value
+ *
+ *	Return lock value for a pagebuf
+ */
+static __inline int
+pagebuf_lock_value(
+	page_buf_t		*pb)
+{
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+
+	return (atomic_read(&PBP(pb)->pb_sema.count));
+}
+
+/*
+ *	pagebuf_lock
+ *
+ *	pagebuf_lock locks a buffer object.  Note that this in no way
+ *	locks the underlying pages, so it is only useful for synchronizing
+ *	concurrent use of page buffer objects, not for synchronizing independent
+ *	access to the underlying pages.
+ */
+static __inline int
+pagebuf_lock(
+	page_buf_t		*pb)
+{
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+
+	PB_TRACE(pb, PB_TRACE_REC(lock), 0);
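+	/*
+	 * Unplug the disk queue before sleeping on the semaphore: any
+	 * I/O still outstanding on this buffer has to finish before the
+	 * current holder can unlock, so kick it along first.
+	 */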
+	if (atomic_read(&PBP(pb)->pb_io_remaining))
+		run_task_queue(&tq_disk);
+	down(&PBP(pb)->pb_sema);
+	PB_SET_OWNER(pb);
+	PB_TRACE(pb, PB_TRACE_REC(locked), 0);
+	return 0;
+}
+
+/*
+ *	pagebuf_unlock
+ *
+ *	pagebuf_unlock releases the lock on the buffer object created by
+ *	pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying
+ *	pages created by pagebuf_pin).
+ */
+static __inline void
+pagebuf_unlock(	
+	page_buf_t		*pb)
+{
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+
+	PB_CLEAR_OWNER(pb);
+	up(&PBP(pb)->pb_sema);
+	PB_TRACE(pb, PB_TRACE_REC(unlock), 0);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf_internal.h linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf_internal.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf_internal.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf_internal.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Written by Steve Lord at SGI
+ */
+
+#ifndef __PAGE_BUF_PRIVATE_H__
+#define __PAGE_BUF_PRIVATE_H__
+
+#include "page_buf.h"
+
+#define _PAGE_BUF_INTERNAL_
+#define PB_DEFINE_TRACES
+#include "page_buf_trace.h"
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,9)
+#define page_buffers(page)	((page)->buffers)
+#define page_has_buffers(page)	((page)->buffers)
+#endif
+
+typedef struct page_buf_private_s {
+	page_buf_t		pb_common;	/* public part of structure */
+	struct semaphore	pb_sema;	/* semaphore for lockables  */
+	unsigned long		pb_flushtime;	/* time to flush pagebuf    */
+	atomic_t		pb_io_remaining;/* #outstanding I/O requests */
+	atomic_t		pb_pin_count;	/* pin count		    */
+	wait_queue_head_t	pb_waiters;	/* unpin waiters	    */
+#ifdef PAGEBUF_LOCK_TRACKING
+	int			pb_last_holder;
+#endif
+} page_buf_private_t;
+
+#define PBC(pb) (&((pb)->pb_common))
+#define PBP(pb) ((page_buf_private_t *) (pb))
+
+#ifdef PAGEBUF_LOCK_TRACKING
+#define PB_SET_OWNER(pb)	(PBP(pb)->pb_last_holder = current->pid)
+#define PB_CLEAR_OWNER(pb)	(PBP(pb)->pb_last_holder = -1)
+#define PB_GET_OWNER(pb)	(PBP(pb)->pb_last_holder)
+#else
+#define PB_SET_OWNER(pb)
+#define PB_CLEAR_OWNER(pb)
+#define PB_GET_OWNER(pb)
+#endif /* PAGEBUF_LOCK_TRACKING */
+
+/* Tracing utilities for pagebuf */
+typedef struct {
+	int			event;
+	unsigned long		pb;
+	page_buf_flags_t	flags;
+	unsigned short		hold;
+	unsigned short		lock_value;
+	void			*task;
+	void			*misc;
+	void			*ra;
+	loff_t			offset;
+	size_t			size;
+} pagebuf_trace_t;
+
+struct pagebuf_trace_buf {
+	pagebuf_trace_t		*buf;
+	volatile int		start;
+	volatile int		end;
+};
+
+#define PB_TRACE_BUFSIZE	1024
+#define CIRC_INC(i)	(((i) + 1) & (PB_TRACE_BUFSIZE - 1))
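+
+/*
+ * CIRC_INC relies on PB_TRACE_BUFSIZE being a power of two: masking
+ * with (PB_TRACE_BUFSIZE - 1) wraps the index without a divide, e.g.
+ * (1023 + 1) & 1023 == 0.
+ */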
+
+typedef struct pagebuf_daemon {
+	int			active;
+	int			io_active;
+	spinlock_t		pb_delwrite_lock;
+	struct list_head	pb_delwrite_l;
+	int			pb_delwri_cnt;
+} pagebuf_daemon_t;
+
+/*
+ * Tunable pagebuf parameters
+ */
+
+#define P_PARAM 4
+
+typedef union pagebuf_param {
+	struct {
+		ulong	flush_interval; /* interval between runs of the
+					 * delwri flush daemon.	 */
+		ulong	age_buffer;	/* time for buffer to age before
+					 * we flush it.	 */
+		ulong	debug;		/* debug tracing on or off */
+		ulong	stats_clear;	/* clear the pagebuf stats */
+	} p_un;
+	ulong data[P_PARAM];
+} pagebuf_param_t;
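+
+/*
+ * The union lets the sysctl tables address the tunables generically as
+ * pb_params.data[i] while the rest of pagebuf reads the named fields;
+ * data[0] aliases p_un.flush_interval, data[1] p_un.age_buffer,
+ * data[2] p_un.debug and data[3] p_un.stats_clear.
+ */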
+
+enum {
+	PB_FLUSH_INT = 1,
+	PB_FLUSH_AGE = 2,
+	PB_STATS_CLEAR = 3,
+	PB_DEBUG = 4
+};
+
+extern pagebuf_param_t	pb_params;
+
+/*
+ * Pagebuf statistics
+ */
+
+struct pbstats {
+	u_int32_t	pb_get;
+	u_int32_t	pb_create;
+	u_int32_t	pb_get_locked;
+	u_int32_t	pb_get_locked_waited;
+	u_int32_t	pb_busy_locked;
+	u_int32_t	pb_miss_locked;
+	u_int32_t	pb_page_retries;
+	u_int32_t	pb_page_found;
+	u_int32_t	pb_get_read;
+};
+
+extern struct pbstats pbstats;
+
+#define PB_STATS_INC(count)	( count ++ )
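+/* A plain, non-atomic increment: on SMP the counters may drop the odd
+ * event, presumably an acceptable trade for keeping fast paths cheap. */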
+
+#undef assert
+#ifdef PAGEBUF_DEBUG
+# define assert(expr) \
+	if (!(expr)) {						\
+		printk("Assertion failed: %s\n%s::%s line %d\n",\
+		#expr,__FILE__,__FUNCTION__,__LINE__);		\
+		BUG();						\
+	}
+#else
+# define assert(x)	do { } while (0)
+#endif
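+
+/*
+ * Note that the DEBUG flavour expands to a bare if-statement rather
+ * than the do { } while (0) form of the stub, so it is only safe as a
+ * standalone statement, which is how it is used here.
+ */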
+
+#ifndef STATIC
+# define STATIC static
+#endif
+
+#endif /* __PAGE_BUF_PRIVATE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf_locking.c linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf_locking.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf_locking.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf_locking.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ * Portions Copyright (c) 2002 Christoph Hellwig.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ *	page_buf_locking.c
+ *
+ *	The page_buf module provides an abstract buffer cache model on top of
+ *	the Linux page cache.  Cached blocks for a file are hashed to the
+ *	inode for that file, and can be held dirty in delayed write mode in
+ *	the page cache.	 Cached metadata blocks for a file system are hashed
+ *	to the inode for the mounted device.  The page_buf module assembles
+ *	buffer (page_buf_t) objects on demand to aggregate such cached pages
+ *	for I/O.  The page_buf_locking module adds support for locking such
+ *	page buffers.
+ *
+ *	Written by Steve Lord at SGI
+ *
+ */
+
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/major.h>
+
+#include "page_buf_internal.h"
+
+#ifndef EVMS_MAJOR
+#define EVMS_MAJOR      117
+#endif
+
+/*
+ *	pagebuf_cond_lock
+ *
+ *	pagebuf_cond_lock locks a buffer object, if it is not already locked.
+ *	Note that this in no way locks the underlying pages, so it is only
+ *	useful for synchronizing concurrent use of page buffer objects, not
+ *	for synchronizing independent access to the underlying pages.
+ */
+int
+pagebuf_cond_lock(			/* lock buffer, if not locked	*/
+					/* returns -EBUSY if locked	*/
+	page_buf_t		*pb)
+{
+	int			locked;
+
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+
+	locked = down_trylock(&PBP(pb)->pb_sema) == 0;
+	if (locked) {
+		PB_SET_OWNER(pb);
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(condlck), locked);
+
+	return(locked ? 0 : -EBUSY);
+}
+
+/*
+ *	pagebuf_lock_value
+ *
+ *	Return lock value for a pagebuf
+ */
+int
+pagebuf_lock_value(
+	page_buf_t		*pb)
+{
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+	return(atomic_read(&PBP(pb)->pb_sema.count));
+}
+
+/*
+ *	pagebuf_lock
+ *
+ *	pagebuf_lock locks a buffer object.  Note that this in no way
+ *	locks the underlying pages, so it is only useful for synchronizing
+ *	concurrent use of page buffer objects, not for synchronizing independent
+ *	access to the underlying pages.
+ */
+int
+pagebuf_lock(
+	page_buf_t		*pb)
+{
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+
+	PB_TRACE(pb, PB_TRACE_REC(lock), 0);
+	if (atomic_read(&PBP(pb)->pb_io_remaining))
+		run_task_queue(&tq_disk);
+	down(&PBP(pb)->pb_sema);
+	PB_SET_OWNER(pb);
+	PB_TRACE(pb, PB_TRACE_REC(locked), 0);
+	return 0;
+}
+
+/*
+ *	pagebuf_lock_disable
+ *
+ *	pagebuf_lock_disable disables buffer object locking for an inode.
+ *	remove_super() does a blkdev_put for us on the data device, hence
+ * 	the do_blkdev_put argument.
+ */
+void
+pagebuf_lock_disable(
+	pb_target_t		*target,
+	int			do_blkdev_put)
+{
+	pagebuf_delwri_flush(target, PBDF_WAIT, NULL);
+	if (do_blkdev_put)
+		blkdev_put(target->pbr_bdev, BDEV_FS);
+	kfree(target);
+}
+
+/*
+ *	pagebuf_lock_enable
+ *
+ *	get_sb_bdev() does a blkdev_get for us on the data device, hence
+ *	the do_blkdev_get argument.
+ */
+pb_target_t *
+pagebuf_lock_enable(
+	dev_t			dev,
+	int			do_blkdev_get)
+{
+	struct block_device	*bdev;
+	pb_target_t		*target;
+	int			error = -ENOMEM;
+
+	target = kmalloc(sizeof(pb_target_t), GFP_KERNEL);
+	if (unlikely(!target))
+		return ERR_PTR(error);
+
+	bdev = bdget(dev);
+	if (unlikely(!bdev))
+		goto fail;
+
+	if (do_blkdev_get) {
+		error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS);
+		if (unlikely(error))
+			goto fail;
+	}
+
+	target->pbr_dev = dev;
+	target->pbr_kdev = to_kdev_t(dev);
+	target->pbr_bdev = bdev;
+	target->pbr_mapping = bdev->bd_inode->i_mapping;
+
+	pagebuf_target_blocksize(target, PAGE_CACHE_SIZE);
+	
+	if ((MAJOR(dev) == MD_MAJOR) || (MAJOR(dev) == EVMS_MAJOR))
+		target->pbr_flags = PBR_ALIGNED_ONLY;
+	else if (MAJOR(dev) == LVM_BLK_MAJOR)
+		target->pbr_flags = PBR_SECTOR_ONLY;
+	else
+		target->pbr_flags = 0;
+
+	return target;
+
+fail:
+	kfree(target);
+	return ERR_PTR(error);
+}
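+
+/*
+ * Callers follow the ERR_PTR convention; a sketch of a hypothetical
+ * mount path (names assumed, not part of this patch):
+ *
+ *	target = pagebuf_lock_enable(sb->s_dev, 0);
+ *	if (IS_ERR(target))
+ *		return PTR_ERR(target);
+ */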
+
+void
+pagebuf_target_blocksize(
+	pb_target_t		*target,
+	unsigned int		blocksize)
+{
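+	/* ffs() returns the 1-based index of the lowest set bit, so for a
+	 * power-of-two blocksize ffs(blocksize) - 1 is log2(blocksize),
+	 * e.g. ffs(4096) - 1 == 12. */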
+	target->pbr_blocksize = blocksize;
+	target->pbr_blocksize_bits = ffs(blocksize) - 1;
+}
+
+void
+pagebuf_target_clear(
+	pb_target_t		*target)
+{
+	destroy_buffers(target->pbr_kdev);
+	truncate_inode_pages(target->pbr_mapping, 0LL);
+}
+
+/*
+ *	pagebuf_unlock
+ *
+ *	pagebuf_unlock releases the lock on the buffer object created by
+ *	pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying
+ *	pages created by pagebuf_pin).
+ */
+void
+pagebuf_unlock(				/* unlock buffer		*/
+	page_buf_t		*pb)	/* buffer to unlock		*/
+{
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+	PB_CLEAR_OWNER(pb);
+	up(&PBP(pb)->pb_sema);
+	PB_TRACE(pb, PB_TRACE_REC(unlock), 0);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf_trace.h linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf_trace.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/pagebuf/page_buf_trace.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/pagebuf/page_buf_trace.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef __PAGEBUF_TRACE__
+#define __PAGEBUF_TRACE__
+
+#ifdef PB_DEFINE_TRACES
+#define PB_TRACE_START	typedef enum {
+#define PB_TRACE_REC(x) pb_trace_point_##x
+#define PB_TRACE_END	} pb_trace_var_t;
+#else
+#define PB_TRACE_START	static char	*event_names[] = {
+#define PB_TRACE_REC(x) #x
+#define PB_TRACE_END	};
+#endif
+
+PB_TRACE_START
+PB_TRACE_REC(get),
+PB_TRACE_REC(get_obj),
+PB_TRACE_REC(free_obj),
+PB_TRACE_REC(look_pg),
+PB_TRACE_REC(get_read),
+PB_TRACE_REC(no_daddr),
+PB_TRACE_REC(hold),
+PB_TRACE_REC(rele),
+PB_TRACE_REC(done),
+PB_TRACE_REC(ioerror),
+PB_TRACE_REC(iostart),
+PB_TRACE_REC(end_io),
+PB_TRACE_REC(do_io),
+PB_TRACE_REC(ioreq),
+PB_TRACE_REC(iowait),
+PB_TRACE_REC(iowaited),
+PB_TRACE_REC(free_lk),
+PB_TRACE_REC(freed_l),
+PB_TRACE_REC(cmp),
+PB_TRACE_REC(get_lk),
+PB_TRACE_REC(got_lk),
+PB_TRACE_REC(skip),
+PB_TRACE_REC(lock),
+PB_TRACE_REC(locked),
+PB_TRACE_REC(unlock),
+PB_TRACE_REC(avl_ret),
+PB_TRACE_REC(condlck),
+PB_TRACE_REC(avl_ins),
+PB_TRACE_REC(walkq1),
+PB_TRACE_REC(walkq2),
+PB_TRACE_REC(walkq3),
+PB_TRACE_REC(delwri_q),
+PB_TRACE_REC(delwri_uq),
+PB_TRACE_REC(pin),
+PB_TRACE_REC(unpin),
+PB_TRACE_REC(file_write),
+PB_TRACE_REC(external),
+PB_TRACE_END
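+
+/*
+ * The PB_TRACE_REC list above is an "X macro": with PB_DEFINE_TRACES
+ * set it expands into the pb_trace_var_t enum, and without it the same
+ * list expands into the event_names[] string table, so event numbers
+ * and their names cannot drift out of sync.
+ */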
+
+extern void pb_trace_func(page_buf_t *, int, void *, void *);
+#ifdef PAGEBUF_TRACE
+# define PB_TRACE(pb, event, misc)		\
+	pb_trace_func(pb, event, (void *) misc, \
+			(void *)__builtin_return_address(0))
+#else
+# define PB_TRACE(pb, event, misc)	do { } while (0)
+#endif
+
+#endif	/* __PAGEBUF_TRACE__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/Makefile linux-2.4.20-pre10-mjc2/fs/xfs/support/Makefile
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/Makefile	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/Makefile	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,54 @@
+#
+# Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+
+EXTRA_CFLAGS += -I..
+
+ifeq ($(CONFIG_XFS_DEBUG),y)
+	EXTRA_CFLAGS += -DDEBUG
+endif
+
+O_TARGET			:= support_xfs.o
+ifneq ($(MAKECMDGOALS),modules_install)
+  obj-m				:= $(O_TARGET)
+endif
+
+export-objs			:= ktrace.o
+
+obj-y				:= debug.o \
+				   kmem.o \
+				   ktrace.o \
+				   move.o \
+				   mrlock.o \
+				   qsort.o \
+				   uuid.o
+
+include $(TOPDIR)/Rules.make
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/Makefile.in linux-2.4.20-pre10-mjc2/fs/xfs/support/Makefile.in
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/Makefile.in	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/Makefile.in	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,39 @@
+#
+# Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+
+expsyms(ktrace.o)
+objlink(support_xfs.o debug.o kmem.o ktrace.o move.o mrlock.o qsort.o uuid.o)
+
+# No select() for support_xfs.o in this directory.  It is a sub-component
+# of XFS; see fs/xfs/Makefile.in for the objlink.
+
+extra_cflags_all($(XFS_EXTRA_CFLAGS))
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/atomic.h linux-2.4.20-pre10-mjc2/fs/xfs/support/atomic.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/atomic.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/atomic.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_ATOMIC_H__
+#define __XFS_SUPPORT_ATOMIC_H__
+
+#include <linux/version.h>
+#include <linux/time.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+/*
+ * This is used for two variables in XFS, one of which is a debug trace
+ * buffer index. They are not accessed via any other atomic operations,
+ * so this is safe. All other atomic increments and decrements in XFS
+ * now use the Linux built-in functions.
+ */
+
+extern spinlock_t Atomic_spin;
+
+static __inline__ int atomicIncWithWrap(int *ip, int val)
+{
+	unsigned long flags;
+	int ret;
+	spin_lock_irqsave(&Atomic_spin, flags);
+	ret = *ip;
+	(*ip)++;
+	if (*ip == val) *ip = 0;
+	spin_unlock_irqrestore(&Atomic_spin, flags);
+	return ret;
+}
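+
+/*
+ * Usage sketch (hypothetical caller): claim the next slot of a shared
+ * circular trace buffer, where the returned pre-increment value is the
+ * slot index:
+ *
+ *	slot = atomicIncWithWrap((int *)&pb_trace.start, PB_TRACE_BUFSIZE);
+ */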
+
+#endif /* __XFS_SUPPORT_ATOMIC_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/debug.c linux-2.4.20-pre10-mjc2/fs/xfs/support/debug.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/debug.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/debug.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "debug.h"
+
+#include <asm/page.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+
+int			doass = 1;
+static char		message[256];	/* keep it off the stack */
+static spinlock_t 	xfs_err_lock = SPIN_LOCK_UNLOCKED;
+
+void
+assfail(char *a, char *f, int l)
+{
+    printk("XFS assertion failed: %s, file: %s, line: %d\n", a, f, l);
+    BUG();
+}
+
+#ifdef DEBUG
+
+unsigned long
+random(void)
+{
+	static unsigned long	RandomValue = 1;
+	/* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */
+	register long	rv = RandomValue;
+	register long	lo;
+	register long	hi;
+
+	hi = rv / 127773;
+	lo = rv % 127773;
+	rv = 16807 * lo - 2836 * hi;
+	if( rv <= 0 ) rv += 2147483647;
+	return( RandomValue = rv );
+}
+
+int
+get_thread_id(void)
+{
+	return current->pid;
+}
+
+# define xdprintk(format...)	printk(format)
+#else
+# define xdprintk(format...)	do { } while (0)
+#endif
+
+void
+cmn_err(register int level, char *fmt, ...)
+{
+	char	*fp = fmt;
+	va_list ap;
+
+	spin_lock(&xfs_err_lock);
+	va_start(ap, fmt);
+	if (*fmt == '!') fp++;
+	vsnprintf(message, sizeof(message), fp, ap);
+	switch (level) {
+	case CE_CONT:
+		printk("%s", message);
+		break;
+	case CE_DEBUG:
+		xdprintk("%s", message);
+		break;
+	default:
+		printk("%s\n", message);
+		break;
+	}
+	va_end(ap);
+	spin_unlock(&xfs_err_lock);
+
+	if (level == CE_PANIC)
+		BUG();
+}
+
+
+void
+icmn_err(register int level, char *fmt, va_list ap)
+{
+	spin_lock(&xfs_err_lock);
+	vsnprintf(message, sizeof(message), fmt, ap);
+	switch (level) {
+	case CE_CONT:
+		printk("%s", message);
+		break;
+	case CE_DEBUG:
+		xdprintk("%s", message);
+		break;
+	default:
+		printk("cmn_err level %d ", level);
+		printk("%s\n", message);
+		break;
+	}
+	spin_unlock(&xfs_err_lock);
+}
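
For reference, typical call sites look like this (illustrative sketch, not
from the patch): CE_CONT prints the message verbatim, the other levels append
a newline, a leading '!' in the format is stripped, and CE_PANIC follows the
message with BUG().

static void example_messages(void)	/* hypothetical function */
{
	cmn_err(CE_NOTE, "XFS: example notice");	/* newline appended */
	cmn_err(CE_CONT, " same line continues\n");	/* printed verbatim */
	cmn_err(CE_WARN, "!XFS: leading '!' stripped");
	/* cmn_err(CE_PANIC, "fatal"); would printk, then BUG() */
}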
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/debug.h linux-2.4.20-pre10-mjc2/fs/xfs/support/debug.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/debug.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/debug.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_DEBUG_H__
+#define __XFS_SUPPORT_DEBUG_H__
+
+#include <stdarg.h>
+
+#define CE_DEBUG	7		/* debug	*/
+#define CE_CONT		6		/* continuation */
+#define CE_NOTE		5		/* notice	*/
+#define CE_WARN		4		/* warning	*/
+#define CE_ALERT	1		/* alert	*/
+#define CE_PANIC	0		/* panic	*/
+
+extern void icmn_err(int, char *, va_list);
+extern void cmn_err(int, char *, ...);
+
+#ifdef DEBUG
+# ifdef lint
+#  define ASSERT(EX)	((void)0) /* avoid "constant in conditional" babble */
+# else
+#  define ASSERT(EX) ((!doass||(EX))?((void)0):assfail(#EX, __FILE__, __LINE__))
+# endif /* lint */
+#else
+# define ASSERT(x)	((void)0)
+#endif
+
+extern int doass;		/* dynamically turn off asserts */
+extern void assfail(char *, char *, int);
+#ifdef DEBUG
+extern unsigned long random(void);
+extern int get_thread_id(void);
+#endif
+
+#define ASSERT_ALWAYS(EX)  ((EX)?((void)0):assfail(#EX, __FILE__, __LINE__))
+#define debug_stop_all_cpus(param)	/* param is "cpumask_t *" */
+
+#endif	/* __XFS_SUPPORT_DEBUG_H__ */
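
Usage in brief (editor's sketch, names invented): ASSERT() compiles away
outside DEBUG builds and can be disabled at runtime by clearing doass, while
ASSERT_ALWAYS() is checked in every build.

static void example_checks(int nentries)	/* hypothetical */
{
	ASSERT(nentries >= 0);			/* DEBUG builds only */
	ASSERT_ALWAYS(nentries != 0);		/* checked in all builds */
}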
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/kmem.c linux-2.4.20-pre10-mjc2/fs/xfs/support/kmem.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/kmem.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/kmem.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include "time.h"
+#include "kmem.h"
+
+#define DEF_PRIORITY	(6)
+#define MAX_SLAB_SIZE	0x10000
+
+static __inline unsigned int flag_convert(int flags)
+{
+#ifdef DEBUG
+	if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS))) {
+		printk(KERN_WARNING
+		    "XFS: memory allocation with wrong flags (%x)\n", flags);
+		BUG();
+	}
+#endif
+
+	if (flags & KM_NOSLEEP)
+		return GFP_ATOMIC;
+	/* If we're in a transaction, FS activity is not ok */
+	else if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		return GFP_NOFS;
+	else
+		return GFP_KERNEL;
+}
+
+#define MAX_SHAKE 8
+
+static kmem_shake_func_t	shake_list[MAX_SHAKE];
+static DECLARE_MUTEX(shake_sem);
+
+void	kmem_shake_register(kmem_shake_func_t sfunc)
+{
+	int	i;
+
+	down(&shake_sem);
+	for (i = 0; i < MAX_SHAKE; i++) {
+		if (shake_list[i] == NULL) {
+			shake_list[i] = sfunc;
+			break;
+		}
+	}
+	if (i == MAX_SHAKE)
+		BUG();
+	up(&shake_sem);
+}
+
+void	kmem_shake_deregister(kmem_shake_func_t sfunc)
+{
+	int	i;
+
+	down(&shake_sem);
+	for (i = 0; i < MAX_SHAKE; i++) {
+		if (shake_list[i] == sfunc)
+			break;
+	}
+	if (i == MAX_SHAKE)
+		BUG();
+	for (; i < MAX_SHAKE - 1; i++) {
+		shake_list[i] = shake_list[i+1];
+	}
+	shake_list[i] = NULL;
+	up(&shake_sem);
+}
+
+static __inline__ void kmem_shake(void)
+{
+	int	i;
+
+	down(&shake_sem);
+	for (i = 0; i < MAX_SHAKE && shake_list[i]; i++)
+		(*shake_list[i])();
+	up(&shake_sem);
+	delay(10);
+}
+
+void *
+kmem_alloc(size_t size, int flags)
+{
+	int	shrink = DEF_PRIORITY;	/* # times to try to shrink cache */
+	void	*rval;
+
+repeat:
+	if (MAX_SLAB_SIZE < size) {
+		/* Avoid doing filesystem sensitive stuff to get this */
+		rval = __vmalloc(size, flag_convert(flags), PAGE_KERNEL);
+	} else {
+		rval = kmalloc(size, flag_convert(flags));
+	}
+
+	if (rval || (flags & KM_NOSLEEP))
+		return rval;
+
+	/*
+	 * KM_SLEEP callers don't expect a failure
+	 */
+	if (shrink) {
+		kmem_shake();
+
+		shrink--;
+		goto repeat;
+	}
+
+	rval = __vmalloc(size, flag_convert(flags), PAGE_KERNEL);
+	if (!rval && !(flags & KM_SLEEP))
+		panic("kmem_alloc: NULL memory on KM_SLEEP request!");
+
+	return rval;
+}
+
+void *
+kmem_zalloc(size_t size, int flags)
+{
+	void	*ptr;
+
+	ptr = kmem_alloc(size, flags);
+
+	if (ptr)
+		memset((char *)ptr, 0, (int)size);
+
+	return (ptr);
+}
+
+void
+kmem_free(void *ptr, size_t size)
+{
+	if (((unsigned long)ptr < VMALLOC_START) ||
+	    ((unsigned long)ptr >= VMALLOC_END)) {
+		kfree(ptr);
+	} else {
+		vfree(ptr);
+	}
+}
+
+void *
+kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
+{
+	void *new;
+
+	new = kmem_alloc(newsize, flags);
+	if (ptr) {
+		memcpy(new, ptr, ((oldsize < newsize) ? oldsize : newsize));
+		kmem_free(ptr, oldsize);
+	}
+
+	return new;
+}
+
+kmem_zone_t *
+kmem_zone_init(int size, char *zone_name)
+{
+	return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
+}
+
+void *
+kmem_zone_alloc(kmem_zone_t *zone, int flags)
+{
+	int	shrink = DEF_PRIORITY;	/* # times to try to shrink cache */
+	void	*ptr = NULL;
+
+repeat:
+	ptr = kmem_cache_alloc(zone, flag_convert(flags));
+
+	if (ptr || (flags & KM_NOSLEEP))
+		return ptr;
+
+	/*
+	 * KM_SLEEP callers don't expect a failure
+	 */
+	if (shrink) {
+		kmem_shake();
+
+		shrink--;
+		goto repeat;
+	}
+
+	if (flags & KM_SLEEP)
+		panic("kmem_zone_alloc: NULL memory on KM_SLEEP request!");
+
+	return NULL;
+}
+
+void *
+kmem_zone_zalloc(kmem_zone_t *zone, int flags)
+{
+	int	shrink = DEF_PRIORITY;	/* # times to try to shrink cache */
+	void	*ptr = NULL;
+
+repeat:
+	ptr = kmem_cache_zalloc(zone, flag_convert(flags));
+
+	if (ptr || (flags & KM_NOSLEEP))
+		return ptr;
+
+	/*
+	 * KM_SLEEP callers don't expect a failure
+	 */
+	if (shrink) {
+		kmem_shake();
+
+		shrink--;
+		goto repeat;
+	}
+
+	if (flags & KM_SLEEP)
+		panic("kmem_zone_zalloc: NULL memory on KM_SLEEP request!");
+
+	return NULL;
+}
+
+void
+kmem_zone_free(kmem_zone_t *zone, void *ptr)
+{
+	kmem_cache_free(zone, ptr);
+}
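
A sketch of the shaker contract (hypothetical example, not part of the
patch): a subsystem registers a callback that releases whatever cached memory
it can spare; kmem_alloc() and the zone allocators run all registered shakers
up to DEF_PRIORITY times before letting a KM_SLEEP allocation fail.

static void example_cache_shake(void)
{
	/* release reclaimable objects cached by this subsystem */
}

static void example_init(void)
{
	kmem_shake_register(example_cache_shake);
}

static void example_exit(void)
{
	kmem_shake_deregister(example_cache_shake);
}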
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/kmem.h linux-2.4.20-pre10-mjc2/fs/xfs/support/kmem.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/kmem.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/kmem.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_KMEM_H__
+#define __XFS_SUPPORT_KMEM_H__
+
+#include <linux/slab.h>
+
+/*
+ * memory management routines
+ */
+#define KM_SLEEP	0x0001
+#define KM_NOSLEEP	0x0002
+#define KM_NOFS		0x0004
+
+#define	kmem_zone	kmem_cache_s
+#define kmem_zone_t	kmem_cache_t
+
+extern kmem_zone_t  *kmem_zone_init(int, char *);
+extern void	    *kmem_zone_zalloc(kmem_zone_t *, int);
+extern void	    *kmem_zone_alloc(kmem_zone_t *, int);
+extern void	    kmem_zone_free(kmem_zone_t *, void *);
+
+extern void	    *kmem_alloc(size_t, int);
+extern void	    *kmem_realloc(void *, size_t, size_t, int);
+extern void	    *kmem_zalloc(size_t, int);
+extern void	    kmem_free(void *, size_t);
+
+typedef void	    (*kmem_shake_func_t)(void);
+
+extern void	    kmem_shake_register(kmem_shake_func_t);
+extern void	    kmem_shake_deregister(kmem_shake_func_t);
+
+#endif /* __XFS_SUPPORT_KMEM_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/ktrace.c linux-2.4.20-pre10-mjc2/fs/xfs/support/ktrace.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/ktrace.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/ktrace.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <linux/module.h>
+
+#include <xfs_types.h>
+#include "kmem.h"
+#include "spin.h"
+#include "debug.h"
+#include "atomic.h"
+#include "ktrace.h"
+
+#if	(defined(DEBUG) || defined(CONFIG_XFS_VNODE_TRACING))
+
+static kmem_zone_t *ktrace_hdr_zone;
+static kmem_zone_t *ktrace_ent_zone;
+static int	    ktrace_zentries;
+
+void
+ktrace_init(int zentries)
+{
+	ktrace_zentries = zentries;
+
+	ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
+					"ktrace_hdr");
+	ASSERT(ktrace_hdr_zone);
+
+	ktrace_ent_zone = kmem_zone_init(ktrace_zentries
+					* sizeof(ktrace_entry_t),
+					"ktrace_ent");
+	ASSERT(ktrace_ent_zone);
+}
+
+void
+ktrace_uninit(void)
+{
+	kmem_cache_destroy(ktrace_hdr_zone);
+	kmem_cache_destroy(ktrace_ent_zone);
+}
+
+/*
+ * ktrace_alloc()
+ *
+ * Allocate a ktrace header and enough buffering for the given
+ * number of entries.
+ */
+ktrace_t *
+ktrace_alloc(int nentries, int sleep)
+{
+	ktrace_t	*ktp;
+	ktrace_entry_t	*ktep;
+
+	ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
+
+	if (ktp == (ktrace_t*)NULL) {
+		/*
+		 * KM_SLEEP callers don't expect failure.
+		 */
+		if (sleep & KM_SLEEP)
+			panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
+
+		return NULL;
+	}
+
+	/*
+	 * Special treatment for buffers with exactly ktrace_zentries entries
+	 */
+	if (nentries == ktrace_zentries) {
+		ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
+							    sleep);
+	} else {
+		ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
+							    sleep);
+	}
+
+	if (ktep == NULL) {
+		/*
+		 * KM_SLEEP callers don't expect failure.
+		 */
+		if (sleep & KM_SLEEP)
+			panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
+
+		kmem_free(ktp, sizeof(*ktp));
+
+		return NULL;
+	}
+
+	spinlock_init(&(ktp->kt_lock), "kt_lock");
+
+	ktp->kt_entries	 = ktep;
+	ktp->kt_nentries = nentries;
+	ktp->kt_index	 = 0;
+	ktp->kt_rollover = 0;
+
+	return ktp;
+}
+
+
+/*
+ * ktrace_free()
+ *
+ * Free up the ktrace header and buffer.  It is up to the caller
+ * to ensure that no-one is referencing it.
+ */
+void
+ktrace_free(ktrace_t *ktp)
+{
+	int	entries_size;
+
+	if (ktp == (ktrace_t *)NULL)
+		return;
+
+	spinlock_destroy(&ktp->kt_lock);
+
+	/*
+	 * Special treatment for the Vnode trace buffer.
+	 */
+	if (ktp->kt_nentries == ktrace_zentries) {
+		kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
+	} else {
+		entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
+
+		kmem_free(ktp->kt_entries, entries_size);
+	}
+
+	kmem_zone_free(ktrace_hdr_zone, ktp);
+}
+
+
+/*
+ * Enter the given values into the "next" entry in the trace buffer.
+ * kt_index is always the index of the next entry to be filled.
+ */
+void
+ktrace_enter(
+	ktrace_t	*ktp,
+	void		*val0,
+	void		*val1,
+	void		*val2,
+	void		*val3,
+	void		*val4,
+	void		*val5,
+	void		*val6,
+	void		*val7,
+	void		*val8,
+	void		*val9,
+	void		*val10,
+	void		*val11,
+	void		*val12,
+	void		*val13,
+	void		*val14,
+	void		*val15)
+{
+	int		index;
+	ktrace_entry_t	*ktep;
+
+	ASSERT(ktp != NULL);
+
+	/*
+	 * Grab an entry by pushing the index up to the next one.
+	 */
+	index = atomicIncWithWrap(&ktp->kt_index, ktp->kt_nentries);
+
+	if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
+		ktp->kt_rollover = 1;
+
+	ASSERT((index >= 0) && (index < ktp->kt_nentries));
+
+	ktep = &(ktp->kt_entries[index]);
+
+	ktep->val[0]  = val0;
+	ktep->val[1]  = val1;
+	ktep->val[2]  = val2;
+	ktep->val[3]  = val3;
+	ktep->val[4]  = val4;
+	ktep->val[5]  = val5;
+	ktep->val[6]  = val6;
+	ktep->val[7]  = val7;
+	ktep->val[8]  = val8;
+	ktep->val[9]  = val9;
+	ktep->val[10] = val10;
+	ktep->val[11] = val11;
+	ktep->val[12] = val12;
+	ktep->val[13] = val13;
+	ktep->val[14] = val14;
+	ktep->val[15] = val15;
+}
+
+/*
+ * Return the number of entries in the trace buffer.
+ */
+int
+ktrace_nentries(
+	ktrace_t	*ktp)
+{
+	if (ktp == NULL) {
+		return 0;
+	}
+
+	return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index);
+}
+
+
+/*
+ * ktrace_first()
+ *
+ * This is used to find the start of the trace buffer.
+ * In conjunction with ktrace_next() it can be used to
+ * iterate through the entire trace buffer.  This code does
+ * not do any locking because it is assumed that it is called
+ * from the debugger.
+ *
+ * The caller must pass in a pointer to a ktrace_snap
+ * structure in which we will keep some state used to
+ * iterate through the buffer.  This state must not be touched
+ * by any code outside of this module.
+ */
+ktrace_entry_t *
+ktrace_first(ktrace_t	*ktp, ktrace_snap_t	*ktsp)
+{
+	ktrace_entry_t	*ktep;
+	int		index;
+	int		nentries;
+
+	if (ktp->kt_rollover)
+		index = ktp->kt_index;
+	else
+		index = 0;
+
+	ktsp->ks_start = index;
+	ktep = &(ktp->kt_entries[index]);
+
+	nentries = ktrace_nentries(ktp);
+	index++;
+	if (index < nentries) {
+		ktsp->ks_index = index;
+	} else {
+		ktsp->ks_index = 0;
+		if (index > nentries)
+			ktep = NULL;
+	}
+	return ktep;
+}
+
+
+/*
+ * ktrace_next()
+ *
+ * This is used to iterate through the entries of the given
+ * trace buffer.  The caller must pass in the ktrace_snap_t
+ * structure initialized by ktrace_first().  The return value
+ * will be either a pointer to the next ktrace_entry or NULL
+ * if all of the entries have been traversed.
+ */
+ktrace_entry_t *
+ktrace_next(
+	ktrace_t	*ktp,
+	ktrace_snap_t	*ktsp)
+{
+	int		index;
+	ktrace_entry_t	*ktep;
+
+	index = ktsp->ks_index;
+	if (index == ktsp->ks_start) {
+		ktep = NULL;
+	} else {
+		ktep = &ktp->kt_entries[index];
+	}
+
+	index++;
+	if (index == ktrace_nentries(ktp)) {
+		ktsp->ks_index = 0;
+	} else {
+		ktsp->ks_index = index;
+	}
+
+	return ktep;
+}
+
+#if	(defined(DEBUG) || defined(CONFIG_XFS_VNODE_TRACING))
+EXPORT_SYMBOL(ktrace_first);
+EXPORT_SYMBOL(ktrace_next);
+#endif
+
+/*
+ * ktrace_skip()
+ *
+ * Skip the next "count" entries and return the entry after that.
+ * Return NULL if this causes us to iterate past the beginning again.
+ */
+
+ktrace_entry_t *
+ktrace_skip(
+	ktrace_t	*ktp,
+	int		count,
+	ktrace_snap_t	*ktsp)
+{
+	int		index;
+	int		new_index;
+	ktrace_entry_t	*ktep;
+	int		nentries = ktrace_nentries(ktp);
+
+	index = ktsp->ks_index;
+	new_index = index + count;
+	while (new_index >= nentries) {
+		new_index -= nentries;
+	}
+	if (index == ktsp->ks_start) {
+		/*
+		 * We've iterated around to the start, so we're done.
+		 */
+		ktep = NULL;
+	} else if ((new_index < index) && (index < ktsp->ks_index)) {
+		/*
+		 * We've skipped past the start again, so we're done.
+		 */
+		ktep = NULL;
+		ktsp->ks_index = ktsp->ks_start;
+	} else {
+		ktep = &(ktp->kt_entries[new_index]);
+		new_index++;
+		if (new_index == nentries) {
+			ktsp->ks_index = 0;
+		} else {
+			ktsp->ks_index = new_index;
+		}
+	}
+	return ktep;
+}
+
+#else
+
+ktrace_t *
+ktrace_alloc(int nentries, int sleep)
+{
+	/*
+	 * KM_SLEEP callers don't expect failure.
+	 */
+	if (sleep & KM_SLEEP)
+		panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
+
+	return NULL;
+}
+#endif
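
The iteration protocol, sketched (hypothetical helper, not part of the
patch): ktrace_first() records the starting position in the snapshot, and
ktrace_next() returns NULL once the walk comes back around to it.

static void example_dump(ktrace_t *ktp)
{
	ktrace_snap_t	snap;
	ktrace_entry_t	*ktep;

	for (ktep = ktrace_first(ktp, &snap); ktep != NULL;
	     ktep = ktrace_next(ktp, &snap))
		printk("ktrace: %p %p\n", ktep->val[0], ktep->val[1]);
}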
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/ktrace.h linux-2.4.20-pre10-mjc2/fs/xfs/support/ktrace.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/ktrace.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/ktrace.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_KTRACE_H__
+#define __XFS_SUPPORT_KTRACE_H__
+
+
+/*
+ * Trace buffer entry structure.
+ */
+typedef struct ktrace_entry {
+	void	*val[16];
+} ktrace_entry_t;
+
+/*
+ * Trace buffer header structure.
+ */
+typedef struct ktrace {
+	lock_t		kt_lock;	/* mutex to guard counters */
+	int		kt_nentries;	/* number of entries in trace buf */
+	int		kt_index;	/* current index in entries */
+	int		kt_rollover;
+	ktrace_entry_t	*kt_entries;	/* buffer of entries */
+} ktrace_t;
+
+/*
+ * Trace buffer snapshot structure.
+ */
+typedef struct ktrace_snap {
+	int		ks_start;	/* kt_index at time of snap */
+	int		ks_index;	/* current index */
+} ktrace_snap_t;
+
+/*
+ * Exported interfaces.
+ */
+extern ktrace_t *ktrace_alloc(int, int);
+
+#if	(defined(DEBUG) || defined(CONFIG_XFS_VNODE_TRACING))
+
+extern void ktrace_init(int zentries);
+extern void ktrace_uninit(void);
+
+extern void ktrace_free(ktrace_t *);
+
+extern void ktrace_enter(
+	ktrace_t	*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*);
+
+extern ktrace_entry_t	*ktrace_first(ktrace_t *, ktrace_snap_t *);
+extern int		ktrace_nentries(ktrace_t *);
+extern ktrace_entry_t	*ktrace_next(ktrace_t *, ktrace_snap_t *);
+extern ktrace_entry_t	*ktrace_skip(ktrace_t *, int, ktrace_snap_t *);
+
+#else
+
+#define ktrace_free(ktp)
+#define ktrace_enter(ktp,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15)
+
+#endif
+
+#endif /* __XFS_SUPPORT_KTRACE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/move.c linux-2.4.20-pre10-mjc2/fs/xfs/support/move.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/move.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/move.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <asm/uaccess.h>
+
+#include <xfs_types.h>
+#include "debug.h"
+#include "move.h"
+
+/*
+ * Move "n" bytes at byte address "cp"; "rw" indicates the direction
+ * of the move, and the I/O parameters are provided in "uio", which is
+ * updated to reflect the data moved.  Returns 0 on success or
+ * a non-zero errno on failure.
+ */
+int
+uiomove(void *cp, size_t n, enum uio_rw rw, struct uio *uio)
+{
+	register struct iovec *iov;
+	u_int cnt;
+	int error;
+
+	while (n > 0 && uio->uio_resid) {
+		iov = uio->uio_iov;
+		cnt = (u_int)iov->iov_len;
+		if (cnt == 0) {
+			uio->uio_iov++;
+			uio->uio_iovcnt--;
+			continue;
+		}
+		if (cnt > n)
+			cnt = (u_int)n;
+		switch (uio->uio_segflg) {
+		case UIO_USERSPACE:
+			if (rw == UIO_READ)
+				error = copy_to_user(iov->iov_base, cp, cnt);
+			else
+				error = copy_from_user(cp, iov->iov_base, cnt);
+			if (error)
+				return EFAULT;
+			break;
+
+
+		case UIO_SYSSPACE:
+			if (rw == UIO_READ)
+				bcopy(cp, iov->iov_base, cnt);
+			else
+				bcopy(iov->iov_base, cp, cnt);
+			break;
+
+		default:
+			ASSERT(0);
+			break;
+		}
+		iov->iov_base = (void *)((char *)iov->iov_base + cnt);
+		iov->iov_len -= cnt;
+		uio->uio_resid -= cnt;
+		uio->uio_offset += cnt;
+		cp = (void *)((char *)cp + cnt);
+		n -= cnt;
+	}
+	return 0;
+}
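
A hypothetical caller, for illustration: set up a uio_t over a user iovec
array and move kernel data out with UIO_READ; uiomove() advances the iovec
pointers, offset, and residual count as it copies.

static int example_copy_out(void *kbuf, size_t len, struct iovec *iov,
			    int nvec, xfs_off_t offset)
{
	uio_t	uio;

	uio.uio_iov    = iov;
	uio.uio_iovcnt = nvec;
	uio.uio_offset = offset;
	uio.uio_segflg = UIO_USERSPACE;
	uio.uio_resid  = len;		/* bytes remaining to transfer */

	return uiomove(kbuf, len, UIO_READ, &uio);	/* 0, or EFAULT */
}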
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/move.h linux-2.4.20-pre10-mjc2/fs/xfs/support/move.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/move.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/move.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef __XFS_SUPPORT_MOVE_H__
+#define __XFS_SUPPORT_MOVE_H__
+
+#include <linux/uio.h>
+#include <asm/uaccess.h>
+
+#define bzero(p,s)	memset((p), 0, (s))
+#define bcopy(s,d,n)	memcpy((d),(s),(n))
+#define bcmp(s1,s2,l)	memcmp(s1,s2,l)
+#define ovbcopy(from,to,count)	memmove(to,from,count)
+
+typedef struct iovec iovec_t;
+
+typedef struct uio {
+	iovec_t		*uio_iov;	/* pointer to array of iovecs */
+	int		uio_iovcnt;	/* number of iovecs */
+	int		uio_fmode;	/* file mode flags */
+	xfs_off_t	uio_offset;	/* file offset */
+	short		uio_segflg;	/* address space (kernel or user) */
+	ssize_t		uio_resid;	/* residual count */
+} uio_t;
+
+/*
+ * I/O direction.
+ */
+typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t;
+
+/*
+ * Segment flag values.
+ */
+typedef enum uio_seg {
+	UIO_USERSPACE,		/* uio_iov describes user space */
+	UIO_SYSSPACE,		/* uio_iov describes system space */
+} uio_seg_t;
+
+
+extern int	uiomove (void *, size_t, uio_rw_t, uio_t *);
+
+#endif	/* __XFS_SUPPORT_MOVE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/mrlock.c linux-2.4.20-pre10-mjc2/fs/xfs/support/mrlock.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/mrlock.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/mrlock.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <asm/system.h>
+#include <linux/interrupt.h>
+#include <asm/current.h>
+
+#include "mrlock.h"
+
+
+#if USE_RW_WAIT_QUEUE_SPINLOCK
+# define wq_write_lock	write_lock
+#else
+# define wq_write_lock	spin_lock
+#endif
+
+/*
+ * We don't seem to need lock_type (only one supported), name, or
+ * sequence.  But XFS will pass them, so let's leave them here for now.
+ */
+/* ARGSUSED */
+void
+mrlock_init(mrlock_t *mrp, int lock_type, char *name, long sequence)
+{
+	mrp->mr_count = 0;
+	mrp->mr_reads_waiting = 0;
+	mrp->mr_writes_waiting = 0;
+	init_waitqueue_head(&mrp->mr_readerq);
+	init_waitqueue_head(&mrp->mr_writerq);
+	mrp->mr_lock = SPIN_LOCK_UNLOCKED;
+}
+
+/*
+ * Macros to lock/unlock the mrlock_t.
+ */
+
+#define MRLOCK(m)		spin_lock(&(m)->mr_lock)
+#define MRUNLOCK(m)		spin_unlock(&(m)->mr_lock)
+
+
+/*
+ * lock_wait should never be called in an interrupt thread.
+ *
+ * mrlocks can sleep (i.e. call schedule) and so they can't ever
+ * be called from an interrupt thread.
+ *
+ * The wake-up paths should likewise never be invoked from interrupt threads.
+ *
+ * But, waitqueue_lock is locked from interrupt threads - and we are
+ * called with interrupts disabled, so it is all OK.
+ */
+
+/* ARGSUSED */
+void
+lock_wait(wait_queue_head_t *q, spinlock_t *lock, int rw)
+{
+	DECLARE_WAITQUEUE( wait, current );
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+
+	wq_write_lock(&q->lock);
+	if (rw) {
+		__add_wait_queue_tail(q, &wait);
+	} else {
+		__add_wait_queue(q, &wait);
+	}
+
+	wq_write_unlock(&q->lock);
+	spin_unlock(lock);
+
+	schedule();
+
+	set_current_state(TASK_RUNNING);
+
+	wq_write_lock(&q->lock);
+	__remove_wait_queue(q, &wait);
+	wq_write_unlock(&q->lock);
+
+	spin_lock(lock);
+
+	/* return with lock held */
+}
+
+/* ARGSUSED */
+void
+mrfree(mrlock_t *mrp)
+{
+}
+
+/* ARGSUSED */
+void
+mrlock(mrlock_t *mrp, int type, int flags)
+{
+	if (type == MR_ACCESS)
+		mraccess(mrp);
+	else
+		mrupdate(mrp);
+}
+
+/* ARGSUSED */
+void
+mraccessf(mrlock_t *mrp, int flags)
+{
+	MRLOCK(mrp);
+	if(mrp->mr_writes_waiting > 0) {
+		mrp->mr_reads_waiting++;
+		lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
+		mrp->mr_reads_waiting--;
+	}
+	while (mrp->mr_count < 0) {
+		mrp->mr_reads_waiting++;
+		lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
+		mrp->mr_reads_waiting--;
+	}
+	mrp->mr_count++;
+	MRUNLOCK(mrp);
+}
+
+/* ARGSUSED */
+void
+mrupdatef(mrlock_t *mrp, int flags)
+{
+	MRLOCK(mrp);
+	while(mrp->mr_count) {
+		mrp->mr_writes_waiting++;
+		lock_wait(&mrp->mr_writerq, &mrp->mr_lock, 1);
+		mrp->mr_writes_waiting--;
+	}
+
+	mrp->mr_count = -1; /* writer on it */
+	MRUNLOCK(mrp);
+}
+
+int
+mrtryaccess(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+	/*
+	 * If anyone is waiting for update access, or the lock is held for
+	 * update, fail the request.
+	 */
+	if(mrp->mr_writes_waiting > 0 || mrp->mr_count < 0) {
+		MRUNLOCK(mrp);
+		return 0;
+	}
+	mrp->mr_count++;
+	MRUNLOCK(mrp);
+	return 1;
+}
+
+int
+mrtrypromote(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+
+	if(mrp->mr_count == 1) { /* We are the only thread with the lock */
+		mrp->mr_count = -1; /* writer on it */
+		MRUNLOCK(mrp);
+		return 1;
+	}
+
+	MRUNLOCK(mrp);
+	return 0;
+}
+
+int
+mrtryupdate(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+
+	if(mrp->mr_count) {
+		MRUNLOCK(mrp);
+		return 0;
+	}
+
+	mrp->mr_count = -1; /* writer on it */
+	MRUNLOCK(mrp);
+	return 1;
+}
+
+static __inline__ void mrwake(mrlock_t *mrp)
+{
+	/*
+	 * First, if the count is now 0, we need to wake up anyone waiting.
+	 */
+	if (!mrp->mr_count) {
+		if (mrp->mr_writes_waiting) {	/* Wake-up first writer waiting */
+			wake_up(&mrp->mr_writerq);
+		} else if (mrp->mr_reads_waiting) {	/* Wakeup any readers waiting */
+			wake_up(&mrp->mr_readerq);
+		}
+	}
+}
+
+void
+mraccunlock(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+	mrp->mr_count--;
+	mrwake(mrp);
+	MRUNLOCK(mrp);
+}
+
+void
+mrunlock(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+	if (mrp->mr_count < 0) {
+		mrp->mr_count = 0;
+	} else {
+		mrp->mr_count--;
+	}
+	mrwake(mrp);
+	MRUNLOCK(mrp);
+}
+
+int
+ismrlocked(mrlock_t *mrp, int type)	/* No need to lock since info can change */
+{
+	if (type == MR_ACCESS)
+		return (mrp->mr_count > 0); /* Read lock */
+	else if (type == MR_UPDATE)
+		return (mrp->mr_count < 0); /* Write lock */
+	else if (type == (MR_UPDATE | MR_ACCESS))
+		return (mrp->mr_count); /* Any type of lock held */
+	else /* Any waiters */
+		return (mrp->mr_reads_waiting | mrp->mr_writes_waiting);
+}
+
+/*
+ * Demote from update to access.  We had better be the only thread with the
+ * lock in update mode so it should be easy to set to 1.
+ * Wake-up any readers waiting.
+ */
+
+void
+mrdemote(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+	mrp->mr_count = 1;
+	if (mrp->mr_reads_waiting) {	/* Wakeup all readers waiting */
+		wake_up(&mrp->mr_readerq);
+	}
+	MRUNLOCK(mrp);
+}
+
+int
+mrislocked_access(mrlock_t *mrp)
+{
+	return(mrp->mr_count > 0);
+}
+
+int
+mrislocked_update(mrlock_t *mrp)
+{
+	return(mrp->mr_count < 0);
+}
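
Sketch of typical use (names invented; mrinit/mraccess/mrupdate are the
wrappers from mrlock.h below): readers take the lock shared and may run
concurrently, a writer takes it exclusive.

static mrlock_t example_lock;	/* mrinit(&example_lock, "example") at setup */

static void example_reader(void)
{
	mraccess(&example_lock);	/* shared; sleeps while a writer holds it */
	/* ... read-side work ... */
	mraccunlock(&example_lock);
}

static void example_writer(void)
{
	mrupdate(&example_lock);	/* exclusive; waits for readers to drain */
	/* ... write-side work ... */
	mrunlock(&example_lock);
}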
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/mrlock.h linux-2.4.20-pre10-mjc2/fs/xfs/support/mrlock.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/mrlock.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/mrlock.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_MRLOCK_H__
+#define __XFS_SUPPORT_MRLOCK_H__
+
+#include <linux/version.h>
+#include <linux/time.h>
+#include <linux/wait.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+/*
+ * Implement mrlocks on Linux that work for XFS.
+ *
+ * These are sleep locks and not spinlocks. If one wants read/write spinlocks,
+ * use read_lock, write_lock, ... see spinlock.h.
+ */
+
+typedef struct mrlock_s {
+	int			mr_count;
+	unsigned short		mr_reads_waiting;
+	unsigned short		mr_writes_waiting;
+	wait_queue_head_t	mr_readerq;
+	wait_queue_head_t	mr_writerq;
+	spinlock_t		mr_lock;
+} mrlock_t;
+
+#define MR_ACCESS	1
+#define MR_UPDATE	2
+
+#define MRLOCK_BARRIER		0x1
+#define MRLOCK_ALLOW_EQUAL_PRI	0x8
+
+/*
+ * mraccessf/mrupdatef take flags to be passed in while sleeping;
+ * only PLTWAIT is currently supported.
+ */
+
+extern void	mraccessf(mrlock_t *, int);
+extern void	mrupdatef(mrlock_t *, int);
+extern void	mrlock(mrlock_t *, int, int);
+extern void	mrunlock(mrlock_t *);
+extern void	mraccunlock(mrlock_t *);
+extern int	mrtryupdate(mrlock_t *);
+extern int	mrtryaccess(mrlock_t *);
+extern int	mrtrypromote(mrlock_t *);
+extern void	mrdemote(mrlock_t *);
+
+extern int	ismrlocked(mrlock_t *, int);
+extern void	mrlock_init(mrlock_t *, int type, char *name, long sequence);
+extern void	mrfree(mrlock_t *);
+
+#define mrinit(mrp, name)	mrlock_init(mrp, MRLOCK_BARRIER, name, -1)
+#define mraccess(mrp)	mraccessf(mrp, 0)	/* grab for READ/ACCESS */
+#define mrupdate(mrp)	mrupdatef(mrp, 0)	/* grab for WRITE/UPDATE */
+
+#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/mutex.h linux-2.4.20-pre10-mjc2/fs/xfs/support/mutex.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/mutex.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/mutex.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ * Portions Copyright (c) 2002 Christoph Hellwig.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_MUTEX_H__
+#define __XFS_SUPPORT_MUTEX_H__
+
+#include <linux/spinlock.h>
+#include <asm/semaphore.h>
+
+/*
+ * Map the mutexes from IRIX to Linux semaphores.
+ *
+ * mutex_destroy() simply reinitializes the semaphore to -99, which should
+ * block all other callers.
+ */
+#define MUTEX_DEFAULT		0x0
+typedef struct semaphore	mutex_t;
+
+#define mutex_init(lock, type, name)		sema_init(lock, 1)
+#define init_mutex(ptr, type, name, sequence)	sema_init(ptr, 1)
+#define mutex_destroy(lock)			sema_init(lock, -99)
+#define mutex_lock(lock, num)			down(lock)
+#define mutex_trylock(lock)			(down_trylock(lock) ? 0 : 1)
+#define mutex_unlock(lock)			up(lock)
+
+#endif /* __XFS_SUPPORT_MUTEX_H__ */
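
In use this looks like plain semaphore code (sketch, names invented); the
extra IRIX arguments are accepted and ignored.

static mutex_t example_mutex;	/* mutex_init(&example_mutex, MUTEX_DEFAULT, "x") */

static void example_critical(void)
{
	mutex_lock(&example_mutex, 0);		/* second argument is unused */
	/* ... critical section ... */
	mutex_unlock(&example_mutex);
}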
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/qsort.c linux-2.4.20-pre10-mjc2/fs/xfs/support/qsort.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/qsort.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/qsort.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,243 @@
+/* Copyright (C) 1991, 1992, 1996, 1997, 1999 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Written by Douglas C. Schmidt (schmidt@ics.uci.edu).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+/* If you consider tuning this algorithm, you should consult first:
+   Engineering a sort function; Jon Bentley and M. Douglas McIlroy;
+   Software - Practice and Experience; Vol. 23 (11), 1249-1265, 1993.  */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+/* Byte-wise swap two items of size SIZE. */
+#define SWAP(a, b, size)						      \
+  do									      \
+    {									      \
+      register size_t __size = (size);					      \
+      register char *__a = (a), *__b = (b);				      \
+      do								      \
+	{								      \
+	  char __tmp = *__a;						      \
+	  *__a++ = *__b;						      \
+	  *__b++ = __tmp;						      \
+	} while (--__size > 0);						      \
+    } while (0)
+
+/* Discontinue quicksort algorithm when partition gets below this size.
+   This particular magic number was chosen to work best on a Sun 4/260. */
+#define MAX_THRESH 4
+
+/* Stack node declarations used to store unfulfilled partition obligations. */
+typedef struct
+  {
+    char *lo;
+    char *hi;
+  } stack_node;
+
+/* The next 4 #defines implement a very fast in-line stack abstraction. */
+/* The stack needs log (total_elements) entries (we could even subtract
+   log(MAX_THRESH)).  Since total_elements has type size_t, we get as
+   upper bound for log (total_elements):
+   bits per byte (CHAR_BIT) * sizeof(size_t).  */
+#define STACK_SIZE	(8 * sizeof(unsigned long int))
+#define PUSH(low, high) ((void) ((top->lo = (low)), (top->hi = (high)), ++top))
+#define POP(low, high)	((void) (--top, (low = top->lo), (high = top->hi)))
+#define STACK_NOT_EMPTY (stack < top)
+
+
+/* Order size using quicksort.	This implementation incorporates
+   four optimizations discussed in Sedgewick:
+
+   1. Non-recursive, using an explicit stack of pointers that store the
+      next array partition to sort.  To save time, the maximum amount
+      of space required to store an array of SIZE_MAX is allocated on the
+      stack.  Assuming a 32-bit (64 bit) integer for size_t, this needs
+      only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
+      Pretty cheap, actually.
+
+   2. Chose the pivot element using a median-of-three decision tree.
+      This reduces the probability of selecting a bad pivot value and
+      eliminates certain extraneous comparisons.
+
+   3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
+      insertion sort to order the MAX_THRESH items within each partition.
+      This is a big win, since insertion sort is faster for small, mostly
+      sorted array segments.
+
+   4. The larger of the two sub-partitions is always pushed onto the
+      stack first, with the algorithm then concentrating on the
+      smaller partition.  This *guarantees* no more than log (total_elems)
+      stack size is needed (actually O(1) in this case)!  */
+
+void
+qsort (void *const pbase, size_t total_elems, size_t size,
+       int (*cmp)(const void *, const void *))
+{
+  register char *base_ptr = (char *) pbase;
+
+  const size_t max_thresh = MAX_THRESH * size;
+
+  if (total_elems == 0)
+    /* Avoid lossage with unsigned arithmetic below.  */
+    return;
+
+  if (total_elems > MAX_THRESH)
+    {
+      char *lo = base_ptr;
+      char *hi = &lo[size * (total_elems - 1)];
+      stack_node stack[STACK_SIZE];
+      stack_node *top = stack + 1;
+
+      while (STACK_NOT_EMPTY)
+	{
+	  char *left_ptr;
+	  char *right_ptr;
+
+	  /* Select median value from among LO, MID, and HI. Rearrange
+	     LO and HI so the three values are sorted. This lowers the
+	     probability of picking a pathological pivot value and
+	     skips a comparison for both the LEFT_PTR and RIGHT_PTR in
+	     the while loops. */
+
+	  char *mid = lo + size * ((hi - lo) / size >> 1);
+
+	  if ((*cmp) ((void *) mid, (void *) lo) < 0)
+	    SWAP (mid, lo, size);
+	  if ((*cmp) ((void *) hi, (void *) mid) < 0)
+	    SWAP (mid, hi, size);
+	  else
+	    goto jump_over;
+	  if ((*cmp) ((void *) mid, (void *) lo) < 0)
+	    SWAP (mid, lo, size);
+	jump_over:;
+
+	  left_ptr  = lo + size;
+	  right_ptr = hi - size;
+
+	  /* Here's the famous ``collapse the walls'' section of quicksort.
+	     Gotta like those tight inner loops!  They are the main reason
+	     that this algorithm runs much faster than others. */
+	  do
+	    {
+	      while ((*cmp) ((void *) left_ptr, (void *) mid) < 0)
+		left_ptr += size;
+
+	      while ((*cmp) ((void *) mid, (void *) right_ptr) < 0)
+		right_ptr -= size;
+
+	      if (left_ptr < right_ptr)
+		{
+		  SWAP (left_ptr, right_ptr, size);
+		  if (mid == left_ptr)
+		    mid = right_ptr;
+		  else if (mid == right_ptr)
+		    mid = left_ptr;
+		  left_ptr += size;
+		  right_ptr -= size;
+		}
+	      else if (left_ptr == right_ptr)
+		{
+		  left_ptr += size;
+		  right_ptr -= size;
+		  break;
+		}
+	    }
+	  while (left_ptr <= right_ptr);
+
+	  /* Set up pointers for next iteration.  First determine whether
+	     left and right partitions are below the threshold size.  If so,
+	     ignore one or both.  Otherwise, push the larger partition's
+	     bounds on the stack and continue sorting the smaller one. */
+
+	  if ((size_t) (right_ptr - lo) <= max_thresh)
+	    {
+	      if ((size_t) (hi - left_ptr) <= max_thresh)
+		/* Ignore both small partitions. */
+		POP (lo, hi);
+	      else
+		/* Ignore small left partition. */
+		lo = left_ptr;
+	    }
+	  else if ((size_t) (hi - left_ptr) <= max_thresh)
+	    /* Ignore small right partition. */
+	    hi = right_ptr;
+	  else if ((right_ptr - lo) > (hi - left_ptr))
+	    {
+	      /* Push larger left partition indices. */
+	      PUSH (lo, right_ptr);
+	      lo = left_ptr;
+	    }
+	  else
+	    {
+	      /* Push larger right partition indices. */
+	      PUSH (left_ptr, hi);
+	      hi = right_ptr;
+	    }
+	}
+    }
+
+  /* Once the BASE_PTR array is partially sorted by quicksort the rest
+     is completely sorted using insertion sort, since this is efficient
+     for partitions below MAX_THRESH size. BASE_PTR points to the beginning
+     of the array to sort, and END_PTR points at the very last element in
+     the array (*not* one beyond it!). */
+  {
+    char *const end_ptr = &base_ptr[size * (total_elems - 1)];
+    char *tmp_ptr = base_ptr;
+    char *thresh = min(end_ptr, base_ptr + max_thresh);
+    register char *run_ptr;
+
+    /* Find smallest element in first threshold and place it at the
+       array's beginning.  This is the smallest array element,
+       and the operation speeds up insertion sort's inner loop. */
+
+    for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
+      if ((*cmp) ((void *) run_ptr, (void *) tmp_ptr) < 0)
+	tmp_ptr = run_ptr;
+
+    if (tmp_ptr != base_ptr)
+      SWAP (tmp_ptr, base_ptr, size);
+
+    /* Insertion sort, running from left-hand-side up to right-hand-side.  */
+
+    run_ptr = base_ptr + size;
+    while ((run_ptr += size) <= end_ptr)
+      {
+	tmp_ptr = run_ptr - size;
+	while ((*cmp) ((void *) run_ptr, (void *) tmp_ptr) < 0)
+	  tmp_ptr -= size;
+
+	tmp_ptr += size;
+	if (tmp_ptr != run_ptr)
+	  {
+	    char *trav;
+
+	    trav = run_ptr + size;
+	    while (--trav >= run_ptr)
+	      {
+		char c = *trav;
+		char *hi, *lo;
+
+		for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
+		  *hi = *lo;
+		*hi = c;
+	      }
+	  }
+      }
+  }
+}
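
The signature matches the C library qsort(), so callers look like this
(self-contained sketch, names invented):

static int example_cmp(const void *a, const void *b)
{
	const int *x = a, *y = b;
	return (*x > *y) - (*x < *y);	/* overflow-safe three-way compare */
}

static void example_sort(void)
{
	int v[5] = { 4, 1, 3, 5, 2 };
	qsort(v, 5, sizeof(v[0]), example_cmp);	/* v becomes 1 2 3 4 5 */
}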
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/qsort.h linux-2.4.20-pre10-mjc2/fs/xfs/support/qsort.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/qsort.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/qsort.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef QSORT_H
+#define QSORT_H
+
+extern void qsort (void *const pbase,
+		    size_t total_elems,
+		    size_t size,
+		    int (*cmp)(const void *, const void *));
+
+#endif
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/rwsem.h linux-2.4.20-pre10-mjc2/fs/xfs/support/rwsem.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/rwsem.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/rwsem.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_RWSEM_H__
+#define __XFS_SUPPORT_RWSEM_H__
+
+#include <linux/config.h>
+#include <linux/rwsem.h>
+
+#define MR_ACCESS	1
+#define MR_UPDATE	2
+
+/*
+ * The IRIX mrlocks have been totally replaced by rw_semaphores on Linux.
+ *
+ * But the Linux rw_semaphores don't provide any asserts on the lock
+ * holders.  XFS, on the other hand, makes extensive use of them in debug builds.
+ *
+ * To keep this infrastructure on Linux we implement open-coded versions
+ * of the lock asserts for the two most common rw_semaphore implementations.
+ *
+ * There is NO support for debug builds on architectures using the other
+ * implementations.
+ */
+
+#ifdef DEBUG
+#if defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
+static inline int rwsem_held(struct rw_semaphore *sem, int type)
+{
+	switch (type) {
+	case MR_ACCESS:			/* Read lock */
+		return (sem->activity > 0);
+	case MR_UPDATE:			/* Write lock */
+		return (sem->activity < 0);
+	case (MR_UPDATE|MR_ACCESS):	/* Any type of lock held */
+		return (sem->activity != 0);
+	default:			/* Any waiters */
+		return (!list_empty(&sem->wait_list));
+	}
+}
+#elif defined(__i386__)
+static inline int rwsem_held(struct rw_semaphore *sem, int type)
+{
+	switch (type) {
+	case MR_ACCESS:			/* Read lock */
+		return (sem->count > 0);
+	case MR_UPDATE:			/* Write lock */
+		return (sem->count < 0);
+	case (MR_UPDATE|MR_ACCESS):	/* Any type of lock held */
+		return (sem->count != 0);
+	default:			/* Any waiters */
+		return (!list_empty(&sem->wait_list));
+	}
+}
+#else
+#error "Sorry, XFS debug builds are not supported in this configuration"
+#endif
+#endif /* DEBUG */
+
+#endif /* __XFS_SUPPORT_RWSEM_H__ */
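
A minimal sketch of how these helpers back lock assertions in debug builds;
ASSERT stands for the XFS debug assertion macro (from support/debug.h), and
the caller shown is hypothetical:

	#ifdef DEBUG
	static void modify_protected_state(struct rw_semaphore *sem)
	{
		/* must hold the semaphore for update before writing */
		ASSERT(rwsem_held(sem, MR_UPDATE));
		/* ... modify state protected by sem ... */
	}
	#endif
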
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/sema.h linux-2.4.20-pre10-mjc2/fs/xfs/support/sema.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/sema.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/sema.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SEMA_H__
+#define __XFS_SUPPORT_SEMA_H__
+
+#include <linux/version.h>
+#include <linux/time.h>
+#include <linux/wait.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+/*
+ * The sema_t type simply maps to struct semaphore in the Linux kernel.
+ */
+
+typedef struct semaphore sema_t;
+
+#define init_sema(sp, val, c, d)	sema_init(sp, val)
+#define initsema(sp, val)		sema_init(sp, val)
+#define initnsema(sp, val, name)	sema_init(sp, val)
+#define psema(sp, b)			down(sp)
+#define vsema(sp)			up(sp)
+#define valusema(sp)			(atomic_read(&(sp)->count))
+#define freesema(sema)
+
+/*
+ * Map cpsema (try to get the sema) to down_trylock.  We need to invert
+ * the return values, since cpsema returns 1 (acquired), 0 (failed) while
+ * down_trylock returns the reverse: 0 (acquired), 1 (failed).
+ */
+
+#define cpsema(sp)			(down_trylock(sp) ? 0 : 1)
+
+/*
+ * cvsema(sp) is not implemented; it is not clear how to map it to up/down/...
+ * On IRIX it does a vsema if the value is < 0, otherwise nothing.
+ */
+
+#endif /* __XFS_SUPPORT_SEMA_H__ */
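
A short usage sketch of the inverted cpsema() convention (hypothetical
caller, not taken from the patch):

	static void try_critical_section(sema_t *sp)
	{
		if (cpsema(sp)) {	/* 1 == acquired, unlike down_trylock() */
			/* ... critical section ... */
			vsema(sp);
		}
	}
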
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/spin.h linux-2.4.20-pre10-mjc2/fs/xfs/support/spin.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/spin.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/spin.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ * Portions Copyright (c) 2002 Christoph Hellwig.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SPIN_H__
+#define __XFS_SUPPORT_SPIN_H__
+
+#include <linux/sched.h>	/* preempt needs this */
+#include <linux/spinlock.h>
+
+/*
+ * Map lock_t from IRIX to Linux spinlocks.
+ *
+ * Note that Linux compiles spinlocks in or out depending on CONFIG_SMP,
+ * so we do not need to worry about SMP here.
+ */
+
+typedef spinlock_t lock_t;
+
+#define spinlock_init(lock, name)	spin_lock_init(lock)
+#define init_spinlock(lock, name, ll)	spin_lock_init(lock)
+#define spinlock_destroy(lock)
+
+static inline unsigned long mutex_spinlock(lock_t *lock)
+{
+	spin_lock(lock);
+	return 0;
+}
+
+/*ARGSUSED*/
+static inline void mutex_spinunlock(lock_t *lock, unsigned long s)
+{
+	spin_unlock(lock);
+}
+
+static inline void nested_spinlock(lock_t *lock)
+{
+	spin_lock(lock);
+}
+
+static inline void nested_spinunlock(lock_t *lock)
+{
+	spin_unlock(lock);
+}
+
+#endif /* __XFS_SUPPORT_SPIN_H__ */
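
The mutex_spinlock()/mutex_spinunlock() pair keeps the IRIX calling
convention, where the lock routine returned an interrupt cookie to hand back
to the unlock routine; on Linux the cookie is always 0 and is ignored.  A
hypothetical caller:

	static void update_counter(lock_t *lp, int *counter)
	{
		unsigned long s;

		s = mutex_spinlock(lp);		/* s is always 0 on Linux */
		(*counter)++;			/* protected section */
		mutex_spinunlock(lp, s);	/* cookie is ignored */
	}
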
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/sv.h linux-2.4.20-pre10-mjc2/fs/xfs/support/sv.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/sv.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/sv.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ * Portions Copyright (c) 2002 Christoph Hellwig.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SV_H__
+#define __XFS_SUPPORT_SV_H__
+
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+/*
+ * Synchronisation variables.
+ *
+ * (Parameters "pri", "svf" and "rts" are not implemented)
+ */
+
+typedef struct sv_s {
+	wait_queue_head_t waiters;
+} sv_t;
+
+#define SV_FIFO		0x0		/* sv_t is FIFO type */
+#define SV_LIFO		0x2		/* sv_t is LIFO type */
+#define SV_PRIO		0x4		/* sv_t is PRIO type */
+#define SV_KEYED	0x6		/* sv_t is KEYED type */
+#define SV_DEFAULT	SV_FIFO
+
+
+static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
+			     unsigned long timeout)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue_exclusive(&sv->waiters, &wait);
+	set_current_state(state);
+	spin_unlock(lock);
+
+	schedule_timeout(timeout);
+
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&sv->waiters, &wait);
+}
+
+#define init_sv(sv,type,name,flag) \
+	init_waitqueue_head(&(sv)->waiters)
+#define sv_init(sv,flag,name) \
+	init_waitqueue_head(&(sv)->waiters)
+#define sv_destroy(sv) \
+	/*NOTHING*/
+#define sv_wait(sv, pri, lock, s) \
+	_sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
+#define sv_wait_sig(sv, pri, lock, s)	\
+	_sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
+#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
+	_sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
+#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
+	_sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
+#define sv_signal(sv) \
+	wake_up(&(sv)->waiters)
+#define sv_broadcast(sv) \
+	wake_up_all(&(sv)->waiters)
+
+#endif /* __XFS_SUPPORT_SV_H__ */
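
Note that _sv_wait() drops the caller's spinlock before sleeping and does
not reacquire it on wakeup, so callers must retake the lock and recheck
their predicate.  A minimal sketch of the intended pattern (hypothetical
code, not from the patch):

	static void wait_for_flag(sv_t *sv, spinlock_t *lp, int *flagp)
	{
		spin_lock(lp);
		while (!*flagp) {
			sv_wait(sv, 0, lp, 0);	/* releases *lp */
			spin_lock(lp);		/* reacquire and recheck */
		}
		spin_unlock(lp);
	}
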
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/time.h linux-2.4.20-pre10-mjc2/fs/xfs/support/time.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/time.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/time.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_TIME_H__
+#define __XFS_SUPPORT_TIME_H__
+
+#include <linux/sched.h>
+
+static inline void delay(long ticks)
+{
+	current->state = TASK_UNINTERRUPTIBLE;
+	schedule_timeout(ticks);
+}
+
+static inline void nanotime(struct timespec *tvp)
+{
+	tvp->tv_sec = xtime.tv_sec;
+	tvp->tv_nsec = xtime.tv_usec * 1000;
+}
+
+#endif /* __XFS_SUPPORT_TIME_H__ */
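
A one-line usage sketch: delay() takes clock ticks, so an uninterruptible
pause of one second is

	delay(HZ);	/* HZ ticks == one second */
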
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/uuid.c linux-2.4.20-pre10-mjc2/fs/xfs/support/uuid.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/uuid.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/uuid.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/time.h>
+
+#ifdef __sparc__
+#include <asm/idprom.h>
+#else
+#include <linux/netdevice.h>
+#endif
+
+#include <xfs_types.h>
+#include <xfs_arch.h>
+#include "time.h"
+#include "move.h"
+#include "uuid.h"
+
+#ifndef CONFIG_NET
+#define dev_get_by_name(x)	(NULL)
+#define dev_put(x)		do { } while (0)
+#endif
+
+/* NODE_SIZE is the number of bytes used for the node identifier portion. */
+#define NODE_SIZE	6
+
+/*
+ * Total size must be 128 bits.	 N.B. definition of uuid_t in uuid.h!
+ */
+typedef struct {
+	u_int32_t	uu_timelow;	/* time "low" */
+	u_int16_t	uu_timemid;	/* time "mid" */
+	u_int16_t	uu_timehi;	/* time "hi" and version */
+	u_int16_t	uu_clockseq;	/* "reserved" and clock sequence */
+	u_int16_t	uu_node[NODE_SIZE / 2]; /* ethernet hardware address */
+} uu_t;
+
+/*
+ * The Time Base Correction is the amount, in 100-nanosecond units, that
+ * is added to a UNIX-based time value (i.e. seconds since 1 Jan. 1970,
+ * converted to the same units) to shift it to the time base for UUIDs
+ * (15 Oct. 1582).
+ */
+#define UUID_TBC	0x01B21DD2138140LL
+
+static	short		uuid_eaddr[NODE_SIZE / 2];	/* ethernet address */
+static	__int64_t	uuid_time;	/* last time basis used */
+static	u_int16_t	uuid_clockseq;	/* boot-time randomizer */
+DECLARE_MUTEX(uuid_lock);
+
+/*
+ * uuid_init - called from out of init_tbl[]; a no-op on Linux
+ */
+void
+uuid_init(void)
+{
+}
+
+/*
+ * uuid_getnodeuniq - obtain the node unique fields of a UUID.
+ *
+ * This is not in any way a standard or condoned UUID function;
+ * it is just something that's needed for user-level file handles.
+ */
+void
+uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
+{
+	char	*uu = (char *)uuid;
+
+	/* on IRIX, this function assumes big-endian fields within
+	 * the uuid, so we use INT_GET to get the same result on
+	 * little-endian systems
+	 */
+
+	fsid[0] = (INT_GET(*(u_int16_t*)(uu+8), ARCH_CONVERT) << 16) +
+		   INT_GET(*(u_int16_t*)(uu+4), ARCH_CONVERT);
+	fsid[1] =  INT_GET(*(u_int32_t*)(uu  ), ARCH_CONVERT);
+}
+
+void
+uuid_create_nil(uuid_t *uuid)
+{
+	bzero(uuid, sizeof *uuid);
+}
+
+int
+uuid_is_nil(uuid_t *uuid)
+{
+	int	i;
+	char	*cp = (char *)uuid;
+
+	if (uuid == NULL)
+		return B_TRUE;
+	/* implied check of version number here... */
+	for (i = 0; i < sizeof *uuid; i++)
+		if (*cp++)
+			return B_FALSE;		/* not nil */
+	return B_TRUE;	/* is nil */
+}
+
+int
+uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
+{
+	return bcmp(uuid1, uuid2, sizeof(uuid_t)) ? B_FALSE : B_TRUE;
+}
+
+/*
+ * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
+ * 64-bit words.  NOTE: This function cannot be changed, EVER.  Although
+ * brain-dead, some applications depend on this 64-bit value remaining
+ * persistent.	Specifically, DMI vendors store the value as a persistent
+ * filehandle.
+ */
+__uint64_t
+uuid_hash64(uuid_t *uuid)
+{
+	__uint64_t	*sp = (__uint64_t *)uuid;
+
+	return sp[0] + sp[1];
+}	/* uuid_hash64 */
+
+static void
+get_eaddr(char *junk)
+{
+#ifdef __sparc__
+	memcpy(uuid_eaddr, idprom->id_ethaddr, 6);
+#else
+	struct net_device *dev;
+
+	dev = dev_get_by_name("eth0");
+	if (!dev || !dev->addr_len) {
+		get_random_bytes(uuid_eaddr, sizeof(uuid_eaddr));
+	} else {
+		memcpy(uuid_eaddr, dev->dev_addr,
+			dev->addr_len < sizeof(uuid_eaddr) ?
+			dev->addr_len : sizeof(uuid_eaddr));
+		dev_put(dev);
+	}
+#endif
+}
+
+/*
+ * uuid_create - kernel version, does the actual work
+ */
+void
+uuid_create(uuid_t *uuid)
+{
+	int		i;
+	uu_t		*uu = (uu_t *)uuid;
+	static int	uuid_have_eaddr = 0;	/* ethernet addr inited? */
+	static int	uuid_is_init = 0;	/* time/clockseq inited? */
+
+	down(&uuid_lock);
+	if (!uuid_is_init) {
+		timespec_t	ts;
+
+		nanotime(&ts);
+		/*
+		 * The clock sequence must be initialized randomly.
+		 */
+		uuid_clockseq = ((unsigned long)jiffies & 0xfff) | 0x8000;
+		/*
+		 * Initialize the uuid time, it's in 100 nanosecond
+		 * units since a time base in 1582.
+		 */
+		uuid_time = ts.tv_sec * 10000000LL +
+			    ts.tv_nsec / 100LL +
+			    UUID_TBC;
+		uuid_is_init = 1;
+	}
+	if (!uuid_have_eaddr) {
+		uuid_have_eaddr = 1;
+		get_eaddr((char *)uuid_eaddr);
+	}
+	uuid_time++;
+	uu->uu_timelow = (u_int32_t)(uuid_time & 0x00000000ffffffffLL);
+	uu->uu_timemid = (u_int16_t)((uuid_time >> 32) & 0x0000ffff);
+	uu->uu_timehi = (u_int16_t)((uuid_time >> 48) & 0x00000fff) | 0x1000;
+	up(&uuid_lock);
+	uu->uu_clockseq = uuid_clockseq;
+	for (i = 0; i < (NODE_SIZE / 2); i++)
+		uu->uu_node [i] = uuid_eaddr [i];
+}
+
+int
+uuid_compare(uuid_t *uuid1, uuid_t *uuid2)
+{
+	int	i;
+	char	*cp1 = (char *) uuid1;
+	char	*cp2 = (char *) uuid2;
+
+	if (uuid1 == NULL) {
+		if (uuid2 == NULL) {
+			return 0;	/* equal because both are nil */
+		} else	{
+			return -1;	/* uuid1 nil, so precedes uuid2 */
+		}
+	} else if (uuid2 == NULL) {
+		return 1;
+	}
+
+	/* implied check of version number here... */
+	for (i = 0; i < sizeof(uuid_t); i++) {
+		if (*cp1 < *cp2)
+			return -1;
+		if (*cp1++ > *cp2++)
+			return 1;
+	}
+	return 0;	/* they're equal */
+}
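
A hedged usage sketch of the interfaces above (uuids_match is a
hypothetical caller):

	static int uuids_match(void)
	{
		uuid_t a, b;

		uuid_create(&a);
		uuid_create(&b);	/* differs from a in the time fields */
		/* uuid_compare() orders byte-wise, with NULL treated as nil
		 * and sorting first; uuid_equal() is a straight bcmp(). */
		return uuid_equal(&a, &b);	/* B_FALSE here */
	}
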
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/support/uuid.h linux-2.4.20-pre10-mjc2/fs/xfs/support/uuid.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/support/uuid.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/support/uuid.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_UUID_H__
+#define __XFS_SUPPORT_UUID_H__
+
+void uuid_create_nil(uuid_t *uuid);
+int uuid_is_nil(uuid_t *uuid);
+int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
+void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
+__uint64_t uuid_hash64(uuid_t *uuid);
+
+#endif	/* __XFS_SUPPORT_UUID_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_H__
+#define __XFS_H__
+
+#include <linux/config.h>
+
+#include <xfs_types.h>
+#include <xfs_arch.h>
+
+#include <support/kmem.h>
+#include <support/mrlock.h>
+#include <support/qsort.h>
+#include <support/spin.h>
+#include <support/sv.h>
+#include <support/ktrace.h>
+#include <support/mutex.h>
+#include <support/sema.h>
+#include <support/atomic.h>
+#include <support/debug.h>
+#include <support/move.h>
+#include <support/uuid.h>
+#include <support/time.h>
+
+#include <linux/xfs_linux.h>
+
+#include <xfs_fs.h>
+#include <xfs_buf.h>
+#include <xfs_macros.h>
+#include <xfs_inum.h>
+#include <xfs_log.h>
+#include <xfs_clnt.h>
+#include <xfs_trans.h>
+#include <xfs_sb.h>
+#include <xfs_ag.h>
+#include <xfs_dir.h>
+#include <xfs_dir2.h>
+#include <xfs_imap.h>
+#include <xfs_mount.h>
+#include <xfs_alloc_btree.h>
+#include <xfs_bmap_btree.h>
+#include <xfs_ialloc_btree.h>
+#include <xfs_btree.h>
+#include <xfs_ialloc.h>
+#include <xfs_attr_sf.h>
+#include <xfs_dir_sf.h>
+#include <xfs_dir2_sf.h>
+#include <xfs_dinode.h>
+#include <xfs_inode.h>
+#include <xfs_alloc.h>
+#include <xfs_bmap.h>
+#include <xfs_bit.h>
+#include <xfs_rtalloc.h>
+#include <xfs_error.h>
+#include <xfs_quota.h>
+#include <xfs_itable.h>
+#include <xfs_dqblk.h>
+#include <xfs_dquot_item.h>
+#include <xfs_dquot.h>
+#include <xfs_qm.h>
+#include <xfs_rw.h>
+#include <xfs_da_btree.h>
+#include <xfs_dir_leaf.h>
+#include <xfs_dir2_data.h>
+#include <xfs_dir2_leaf.h>
+#include <xfs_dir2_block.h>
+#include <xfs_dir2_node.h>
+#include <xfs_dir2_trace.h>
+#include <xfs_acl.h>
+#include <xfs_cap.h>
+#include <xfs_mac.h>
+#include <xfs_attr.h>
+#include <xfs_attr_leaf.h>
+#include <xfs_inode_item.h>
+#include <xfs_buf_item.h>
+#include <xfs_extfree_item.h>
+#include <xfs_log_priv.h>
+#include <xfs_trans_priv.h>
+#include <xfs_trans_space.h>
+#include <xfs_utils.h>
+#include <xfs_dmapi.h>
+
+#include <linux/xfs_lrw.h>
+
+#endif	/* __XFS_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_acl.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_acl.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_acl.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_acl.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,893 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/posix_acl_xattr.h>
+
+STATIC int	xfs_acl_setmode(vnode_t *, xfs_acl_t *);
+STATIC void	xfs_acl_filter_mode(mode_t, xfs_acl_t *);
+STATIC void	xfs_acl_get_endian(xfs_acl_t *);
+STATIC int	xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
+STATIC int	xfs_acl_invalid(xfs_acl_t *);
+STATIC void	xfs_acl_sync_mode(mode_t, xfs_acl_t *);
+STATIC void	xfs_acl_get_attr(vnode_t *, xfs_acl_t *, int, int, int *);
+STATIC void	xfs_acl_set_attr(vnode_t *, xfs_acl_t *, int, int *);
+STATIC int	xfs_acl_allow_set(vnode_t *, int);
+
+kmem_zone_t *xfs_acl_zone;
+
+
+/*
+ * Test for existence of access ACL attribute as efficiently as possible.
+ */
+int
+xfs_acl_vhasacl_access(
+	vnode_t		*vp)
+{
+	int		error;
+
+	xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error);
+	return (error == 0);
+}
+
+/*
+ * Test for existence of default ACL attribute as efficiently as possible.
+ */
+int
+xfs_acl_vhasacl_default(
+	vnode_t		*vp)
+{
+	int		error;
+
+	if (vp->v_type != VDIR)
+		return 0;
+	xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
+	return (error == 0);
+}
+
+/*
+ * Convert from extended attribute representation to in-memory for XFS.
+ */
+STATIC int
+posix_acl_xattr_to_xfs(
+	posix_acl_xattr_header	*src,
+	size_t			size,
+	xfs_acl_t		*dest)
+{
+	posix_acl_xattr_entry	*src_entry;
+	xfs_acl_entry_t		*dest_entry;
+	int			n;
+
+	if (!src || !dest)
+		return EINVAL;
+
+	if (size < sizeof(posix_acl_xattr_header))
+		return EINVAL;
+
+	if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+		return EINVAL;
+
+	memset(dest, 0, sizeof(xfs_acl_t));
+	dest->acl_cnt = posix_acl_xattr_count(size);
+	if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES)
+		return EINVAL;
+
+	/*
+	 * acl_set_file(3) may request that we set default ACLs with
+	 * zero length -- defend (gracefully) against that here.
+	 */
+	if (!dest->acl_cnt)
+		return 0;
+
+	src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src));
+	dest_entry = &dest->acl_entry[0];
+
+	for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) {
+		dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm);
+		if (_ACL_PERM_INVALID(dest_entry->ae_perm))
+			return EINVAL;
+		dest_entry->ae_tag  = le16_to_cpu(src_entry->e_tag);
+		switch(dest_entry->ae_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			dest_entry->ae_id = le32_to_cpu(src_entry->e_id);
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			dest_entry->ae_id = ACL_UNDEFINED_ID;
+			break;
+		default:
+			return EINVAL;
+		}
+	}
+	if (xfs_acl_invalid(dest))
+		return EINVAL;
+
+	return 0;
+}
+
+/*
+ * Comparison function called from qsort().
+ * Primary key is ae_tag, secondary key is ae_id.
+ */
+STATIC int
+xfs_acl_entry_compare(
+	const void	*va,
+	const void	*vb)
+{
+	xfs_acl_entry_t *a = (xfs_acl_entry_t *)va,
+			*b = (xfs_acl_entry_t *)vb;
+
+	if (a->ae_tag == b->ae_tag)
+		return (a->ae_id - b->ae_id);
+	return (a->ae_tag - b->ae_tag);
+}
+
+/*
+ * Convert from in-memory XFS to extended attribute representation.
+ */
+STATIC int
+posix_acl_xfs_to_xattr(
+	xfs_acl_t		*src,
+	posix_acl_xattr_header	*dest,
+	size_t			size)
+{
+	int			n;
+	size_t			new_size = posix_acl_xattr_size(src->acl_cnt);
+	posix_acl_xattr_entry	*dest_entry;
+	xfs_acl_entry_t		*src_entry;
+
+	if (size < new_size)
+		return -ERANGE;
+
+	/* Need to sort src XFS ACL by <ae_tag,ae_id> */
+	qsort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]),
+		xfs_acl_entry_compare);
+
+	dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+	dest_entry = &dest->a_entries[0];
+	src_entry = &src->acl_entry[0];
+	for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) {
+		dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm);
+		if (_ACL_PERM_INVALID(src_entry->ae_perm))
+			return -EINVAL;
+		dest_entry->e_tag  = cpu_to_le16(src_entry->ae_tag);
+		switch (src_entry->ae_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			dest_entry->e_id = cpu_to_le32(src_entry->ae_id);
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	return new_size;
+}
+
+int
+xfs_acl_vget(
+	vnode_t		*vp,
+	void		*acl,
+	size_t		size,
+	int		kind)
+{
+	int			error;
+	xfs_acl_t		*xfs_acl;
+	posix_acl_xattr_header	*ext_acl = acl;
+
+	VN_HOLD(vp);
+	if ((error = _MAC_VACCESS(vp, NULL, VREAD)))
+		goto out;
+	if (!(_ACL_ALLOC(xfs_acl))) {
+		error = ENOMEM;
+		goto out;
+	}
+
+	memset(xfs_acl, 0, sizeof(xfs_acl_t));
+	xfs_acl_get_attr(vp, xfs_acl, kind, size? 0 : ATTR_KERNOVAL, &error);
+	if (error)
+		goto out;
+
+	if (!size) {
+		error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES);
+	} else {
+		if (xfs_acl_invalid(xfs_acl)) {
+			error = EINVAL;
+			goto out;
+		}
+		if (kind == _ACL_TYPE_ACCESS) {
+			vattr_t va;
+
+			va.va_mask = AT_MODE;
+			VOP_GETATTR(vp, &va, 0, sys_cred, error);
+			if (error)
+				goto out;
+			xfs_acl_sync_mode(va.va_mode, xfs_acl);
+		}
+		error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
+	}
+out:
+	VN_RELE(vp);
+	_ACL_FREE(xfs_acl);
+	return -error;
+}
+
+int
+xfs_acl_vremove(
+	vnode_t		*vp,
+	int		kind)
+{
+	int		error;
+
+	VN_HOLD(vp);
+	error = xfs_acl_allow_set(vp, kind);
+	if (!error) {
+		VOP_ATTR_REMOVE(vp, kind == _ACL_TYPE_DEFAULT?
+				SGI_ACL_DEFAULT: SGI_ACL_FILE,
+				ATTR_ROOT, sys_cred, error);
+		if (error == ENOATTR)
+			error = 0;	/* no ACL to remove is fine */
+	}
+	VN_RELE(vp);
+	return -error;
+}
+
+int
+xfs_acl_vset(
+	vnode_t			*vp,
+	void			*acl,
+	size_t			size,
+	int			kind)
+{
+	posix_acl_xattr_header	*ext_acl = acl;
+	xfs_acl_t		*xfs_acl;
+	int			error;
+
+	if (!acl)
+		return -EINVAL;
+
+	if (!(_ACL_ALLOC(xfs_acl)))
+		return -ENOMEM;
+
+	error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl);
+	if (error) {
+		_ACL_FREE(xfs_acl);
+		return -error;
+	}
+	if (!xfs_acl->acl_cnt) {
+		_ACL_FREE(xfs_acl);
+		return 0;
+	}
+
+	VN_HOLD(vp);
+	error = xfs_acl_allow_set(vp, kind);
+	if (error)
+		goto out;
+
+	/* Incoming ACL exists, set file mode based on its value */
+	if (kind == _ACL_TYPE_ACCESS)
+		xfs_acl_setmode(vp, xfs_acl);
+	xfs_acl_set_attr(vp, xfs_acl, kind, &error);
+out:
+	VN_RELE(vp);
+	_ACL_FREE(xfs_acl);
+	return -error;
+}
+
+int
+xfs_acl_iaccess(
+	xfs_inode_t	*ip,
+	mode_t		mode,
+	cred_t		*cr)
+{
+	xfs_acl_t	*acl;
+	int		error;
+
+	if (!(_ACL_ALLOC(acl)))
+		return -1;
+
+	/* If the file has no ACL return -1. */
+	if (xfs_attr_fetch(ip, SGI_ACL_FILE, (char *)acl, sizeof(xfs_acl_t))) {
+		_ACL_FREE(acl);
+		return -1;
+	}
+	xfs_acl_get_endian(acl);
+
+	/* If the file has an empty ACL return -1. */
+	if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) {
+		_ACL_FREE(acl);
+		return -1;
+	}
+
+	/* Synchronize ACL with mode bits */
+	xfs_acl_sync_mode(ip->i_d.di_mode, acl);
+
+	error = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr);
+	_ACL_FREE(acl);
+	return error;
+}
+
+STATIC int
+xfs_acl_allow_set(
+	vnode_t		*vp,
+	int		kind)
+{
+	vattr_t		va;
+	int		error;
+
+	if (kind == _ACL_TYPE_DEFAULT && vp->v_type != VDIR)
+		return ENOTDIR;
+	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
+		return EROFS;
+	if ((error = _MAC_VACCESS(vp, NULL, VWRITE)))
+		return error;
+	va.va_mask = AT_UID;
+	VOP_GETATTR(vp, &va, 0, NULL, error);
+	if (error)
+		return error;
+	if (va.va_uid != current->fsuid && !capable(CAP_FOWNER))
+		return EPERM;
+	return error;
+}
+
+/*
+ * The access control process used to determine access permission:
+ *	if uid == file owner id, use the file owner bits.
+ *	if gid == file owner group id, use the file group bits.
+ *	scan the ACL for a matching user or group entry and use that
+ *	entry's permission.  Accumulate the permissions of all matching
+ *	group entries until the acl entries are exhausted.  The final
+ *	permission produced by a matching acl entry or entries must be
+ *	ANDed with the mask permission.
+ *	if not the owner, the owning group, or matched by an ACL entry,
+ *	use the file other bits.
+ */
+STATIC int
+xfs_acl_capability_check(
+	mode_t		mode,
+	cred_t		*cr)
+{
+	if ((mode & ACL_READ) && !capable_cred(cr, CAP_DAC_READ_SEARCH))
+		return EACCES;
+	if ((mode & ACL_WRITE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
+		return EACCES;
+	if ((mode & ACL_EXECUTE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
+		return EACCES;
+	return 0;
+}
+
+/*
+ * Note: cr is only used here for the capability check if the ACL test fails.
+ *	 It is not used to find out the credentials uid or groups etc, as was
+ *	 done in IRIX. It is assumed that the uid and groups for the current
+ *	 thread are taken from "current" instead of the cr parameter.
+ */
+STATIC int
+xfs_acl_access(
+	uid_t		fuid,
+	gid_t		fgid,
+	xfs_acl_t	*fap,
+	mode_t		md,
+	cred_t		*cr)
+{
+	xfs_acl_entry_t matched;
+	int		i, allows;
+	int		maskallows = -1;	/* true, but not 1, either */
+	int		seen_userobj = 0;
+
+	matched.ae_tag = 0;	/* Invalid type */
+	md >>= 6;	/* Normalize the bits for comparison */
+
+	for (i = 0; i < fap->acl_cnt; i++) {
+		/*
+		 * Break out if we've got a user_obj entry or
+		 * a user entry and the mask (and have processed USER_OBJ)
+		 */
+		if (matched.ae_tag == ACL_USER_OBJ)
+			break;
+		if (matched.ae_tag == ACL_USER) {
+			if (maskallows != -1 && seen_userobj)
+				break;
+			if (fap->acl_entry[i].ae_tag != ACL_MASK &&
+			    fap->acl_entry[i].ae_tag != ACL_USER_OBJ)
+				continue;
+		}
+		/* True if this entry allows the requested access */
+		allows = ((fap->acl_entry[i].ae_perm & md) == md);
+
+		switch (fap->acl_entry[i].ae_tag) {
+		case ACL_USER_OBJ:
+			seen_userobj = 1;
+			if (fuid != current->fsuid)
+				continue;
+			matched.ae_tag = ACL_USER_OBJ;
+			matched.ae_perm = allows;
+			break;
+		case ACL_USER:
+			if (fap->acl_entry[i].ae_id != current->fsuid)
+				continue;
+			matched.ae_tag = ACL_USER;
+			matched.ae_perm = allows;
+			break;
+		case ACL_GROUP_OBJ:
+			if ((matched.ae_tag == ACL_GROUP_OBJ ||
+			    matched.ae_tag == ACL_GROUP) && !allows)
+				continue;
+			if (!in_group_p(fgid))
+				continue;
+			matched.ae_tag = ACL_GROUP_OBJ;
+			matched.ae_perm = allows;
+			break;
+		case ACL_GROUP:
+			if ((matched.ae_tag == ACL_GROUP_OBJ ||
+			    matched.ae_tag == ACL_GROUP) && !allows)
+				continue;
+			if (!in_group_p(fap->acl_entry[i].ae_id))
+				continue;
+			matched.ae_tag = ACL_GROUP;
+			matched.ae_perm = allows;
+			break;
+		case ACL_MASK:
+			maskallows = allows;
+			break;
+		case ACL_OTHER:
+			if (matched.ae_tag != 0)
+				continue;
+			matched.ae_tag = ACL_OTHER;
+			matched.ae_perm = allows;
+			break;
+		}
+	}
+	/*
+	 * First possibility is that no matched entry allows access.
+	 * The capability to override DAC may exist, so check for it.
+	 */
+	switch (matched.ae_tag) {
+	case ACL_OTHER:
+	case ACL_USER_OBJ:
+		if (matched.ae_perm)
+			return 0;
+		break;
+	case ACL_USER:
+	case ACL_GROUP_OBJ:
+	case ACL_GROUP:
+		if (maskallows && matched.ae_perm)
+			return 0;
+		break;
+	case 0:
+		break;
+	}
+	return xfs_acl_capability_check(md, cr);
+}
+
+/*
+ * ACL validity checker.
+ *   This acl validation routine checks that each ACL entry read in makes sense.
+ */
+STATIC int
+xfs_acl_invalid(
+	xfs_acl_t	*aclp)
+{
+	xfs_acl_entry_t *entry, *e;
+	int		user = 0, group = 0, other = 0, mask = 0;
+	int		mask_required = 0;
+	int		i, j;
+
+	if (!aclp)
+		goto acl_invalid;
+
+	if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES)
+		goto acl_invalid;
+
+	for (i = 0; i < aclp->acl_cnt; i++) {
+		entry = &aclp->acl_entry[i];
+		switch (entry->ae_tag) {
+		case ACL_USER_OBJ:
+			if (user++)
+				goto acl_invalid;
+			break;
+		case ACL_GROUP_OBJ:
+			if (group++)
+				goto acl_invalid;
+			break;
+		case ACL_OTHER:
+			if (other++)
+				goto acl_invalid;
+			break;
+		case ACL_USER:
+		case ACL_GROUP:
+			for (j = i + 1; j < aclp->acl_cnt; j++) {
+				e = &aclp->acl_entry[j];
+				if (e->ae_id == entry->ae_id &&
+				    e->ae_tag == entry->ae_tag)
+					goto acl_invalid;
+			}
+			mask_required++;
+			break;
+		case ACL_MASK:
+			if (mask++)
+				goto acl_invalid;
+			break;
+		default:
+			goto acl_invalid;
+		}
+	}
+	if (!user || !group || !other || (mask_required && !mask))
+		goto acl_invalid;
+	else
+		return 0;
+acl_invalid:
+	return EINVAL;
+}
+
+/*
+ * Do ACL endian conversion.
+ */
+STATIC void
+xfs_acl_get_endian(
+	xfs_acl_t	*aclp)
+{
+	xfs_acl_entry_t *ace, *end;
+
+	INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
+	end = &aclp->acl_entry[0]+aclp->acl_cnt;
+	for (ace = &aclp->acl_entry[0]; ace < end; ace++) {
+		INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag);
+		INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id);
+		INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm);
+	}
+}
+
+/*
+ * Get the ACL from the EA and do endian conversion.
+ */
+STATIC void
+xfs_acl_get_attr(
+	vnode_t		*vp,
+	xfs_acl_t	*aclp,
+	int		kind,
+	int		flags,
+	int		*error)
+{
+	int		len = sizeof(xfs_acl_t);
+
+	flags |= ATTR_ROOT;
+	VOP_ATTR_GET(vp,
+		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT,
+		(char *)aclp, &len, flags, sys_cred, *error);
+	if (*error || (flags & ATTR_KERNOVAL))
+		return;
+	xfs_acl_get_endian(aclp);
+}
+
+/*
+ * Set the EA with the ACL and do endian conversion.
+ */
+STATIC void
+xfs_acl_set_attr(
+	vnode_t		*vp,
+	xfs_acl_t	*aclp,
+	int		kind,
+	int		*error)
+{
+	xfs_acl_entry_t *ace, *newace, *end;
+	xfs_acl_t	*newacl;
+	int		len;
+
+	if (!(_ACL_ALLOC(newacl))) {
+		*error = ENOMEM;
+		return;
+	}
+
+	len = sizeof(xfs_acl_t) -
+	      (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt));
+	end = &aclp->acl_entry[0]+aclp->acl_cnt;
+	for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0];
+	     ace < end;
+	     ace++, newace++) {
+		INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag);
+		INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id);
+		INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
+	}
+	INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
+	VOP_ATTR_SET(vp,
+		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT,
+		(char *)newacl, len, ATTR_ROOT, sys_cred, *error);
+	_ACL_FREE(newacl);
+}
+
+int
+xfs_acl_vtoacl(
+	vnode_t		*vp,
+	xfs_acl_t	*access_acl,
+	xfs_acl_t	*default_acl)
+{
+	vattr_t		va;
+	int		error = 0;
+
+	if (access_acl) {
+		/*
+		 * Get the Access ACL and the mode.  If either cannot
+		 * be obtained for some reason, invalidate the access ACL.
+		 */
+		xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error);
+		if (!error) {
+			/* Got the ACL, need the mode... */
+			va.va_mask = AT_MODE;
+			VOP_GETATTR(vp, &va, 0, sys_cred, error);
+		}
+
+		if (error)
+			access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
+		else /* We have a good ACL and the file mode, synchronize. */
+			xfs_acl_sync_mode(va.va_mode, access_acl);
+	}
+
+	if (default_acl) {
+		xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error);
+		if (error)
+			default_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
+	}
+	return error;
+}
+
+/*
+ * This function retrieves the parent directory's acl, processes it
+ * and lets the child inherit the acl(s) that it should.
+ */
+int
+xfs_acl_inherit(
+	vnode_t		*vp,
+	vattr_t		*vap,
+	xfs_acl_t	*pdaclp)
+{
+	xfs_acl_t	*cacl;
+	int		error = 0;
+
+	/*
+	 * If the parent does not have a default ACL, or it's an
+	 * invalid ACL, we're done.
+	 */
+	if (!vp)
+		return 0;
+	if (!pdaclp || xfs_acl_invalid(pdaclp))
+		return 0;
+
+	/*
+	 * Copy the default ACL of the containing directory to
+	 * the access ACL of the new file and use the mode that
+	 * was passed in to set up the correct initial values for
+	 * the u::,g::[m::], and o:: entries.  This is what makes
+	 * umask() "work" with ACL's.
+	 */
+
+	if (!(_ACL_ALLOC(cacl)))
+		return ENOMEM;
+
+	memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
+	xfs_acl_filter_mode(vap->va_mode, cacl);
+	xfs_acl_setmode(vp, cacl);
+
+	/*
+	 * Set the Default and Access ACL on the file.	The mode is already
+	 * set on the file, so we don't need to worry about that.
+	 *
+	 * If the new file is a directory, its default ACL is a copy of
+	 * the containing directory's default ACL.
+	 */
+	if (vp->v_type == VDIR)
+		xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
+	if (!error)
+		xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
+	_ACL_FREE(cacl);
+	return error;
+}
+
+/*
+ * Set up the correct mode on the file based on the supplied ACL.  This
+ * makes sure that the mode on the file reflects the state of the
+ * u::,g::[m::], and o:: entries in the ACL.  Since the mode is where
+ * the ACL is going to get the permissions for these entries, we must
+ * synchronize the mode whenever we set the ACL on a file.
+ */
+STATIC int
+xfs_acl_setmode(
+	vnode_t		*vp,
+	xfs_acl_t	*acl)
+{
+	vattr_t		va;
+	xfs_acl_entry_t *ap;
+	xfs_acl_entry_t *gap = NULL;
+	int		i, error, nomask = 1;
+
+	if (acl->acl_cnt == XFS_ACL_NOT_PRESENT)
+		return 0;
+
+	/*
+	 * Copy the u::, g::, o::, and m:: bits from the ACL into the
+	 * mode.  The m:: bits take precedence over the g:: bits.
+	 */
+	va.va_mask = AT_MODE;
+	VOP_GETATTR(vp, &va, 0, sys_cred, error);
+	if (error)
+		return error;
+
+	va.va_mask = AT_MODE;
+	va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
+	ap = acl->acl_entry;
+	for (i = 0; i < acl->acl_cnt; ++i) {
+		switch (ap->ae_tag) {
+		case ACL_USER_OBJ:
+			va.va_mode |= ap->ae_perm << 6;
+			break;
+		case ACL_GROUP_OBJ:
+			gap = ap;
+			break;
+		case ACL_MASK:
+			nomask = 0;
+			va.va_mode |= ap->ae_perm << 3;
+			break;
+		case ACL_OTHER:
+			va.va_mode |= ap->ae_perm;
+			break;
+		default:
+			break;
+		}
+		ap++;
+	}
+
+	/* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
+	if (gap && nomask)
+		va.va_mode |= gap->ae_perm << 3;
+
+	VOP_SETATTR(vp, &va, 0, sys_cred, error);
+	return error;
+}
+
+/*
+ * The permissions for the special ACL entries (u::, g::[m::], o::) are
+ * actually stored in the file mode (if there is both a group and a mask,
+ * the group is stored in the ACL entry and the mask is stored on the file).
+ * This allows the mode to remain automatically in sync with the ACL without
+ * the need for a call-back to the ACL system at every point where the mode
+ * could change.  This function takes the permissions from the specified mode
+ * and places them in the supplied ACL.
+ *
+ * This implementation draws its validity from the fact that, when the ACL
+ * was assigned, the mode was copied from the ACL.
+ * If the mode did not change, therefore, the mode remains exactly what was
+ * taken from the special ACL entries at assignment.
+ * If a subsequent chmod() was done, the POSIX spec says that the change in
+ * mode must cause an update to the ACL seen at user level and used for
+ * access checks.  Before and after a mode change, therefore, the file mode
+ * most accurately reflects what the special ACL entries should permit/deny.
+ *
+ * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly,
+ *	   the existing mode bits will override whatever is in the
+ *	   ACL. Similarly, if there is a pre-existing ACL that was
+ *	   never in sync with its mode (owing to a bug in 6.5 and
+ *	   before), it will now magically (or mystically) be
+ *	   synchronized.  This could cause slight astonishment, but
+ *	   it is better than inconsistent permissions.
+ *
+ * The supplied ACL is a template that may contain any combination
+ * of special entries.  These are treated as placeholders when we fill
+ * out the ACL.  This routine does not add or remove special entries; it
+ * simply unites each special entry with its associated set of permissions.
+ */
+STATIC void
+xfs_acl_sync_mode(
+	mode_t		mode,
+	xfs_acl_t	*acl)
+{
+	int		i, nomask = 1;
+	xfs_acl_entry_t *ap;
+	xfs_acl_entry_t *gap = NULL;
+
+	/*
+	 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
+	 * be set instead of the GROUP entry, if there is a MASK.
+	 */
+	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
+		switch (ap->ae_tag) {
+		case ACL_USER_OBJ:
+			ap->ae_perm = (mode >> 6) & 0x7;
+			break;
+		case ACL_GROUP_OBJ:
+			gap = ap;
+			break;
+		case ACL_MASK:
+			nomask = 0;
+			ap->ae_perm = (mode >> 3) & 0x7;
+			break;
+		case ACL_OTHER:
+			ap->ae_perm = mode & 0x7;
+			break;
+		default:
+			break;
+		}
+	}
+	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
+	if (gap && nomask)
+		gap->ae_perm = (mode >> 3) & 0x7;
+}
+
+/*
+ * When inheriting an Access ACL from a directory Default ACL,
+ * the ACL bits are set to the intersection of the ACL default
+ * permission bits and the file permission bits in mode. If there
+ * are no permission bits on the file then we must not give them
+ * the ACL.  This is what makes umask() work with ACLs.
+ */
+STATIC void
+xfs_acl_filter_mode(
+	mode_t		mode,
+	xfs_acl_t	*acl)
+{
+	int		i, nomask = 1;
+	xfs_acl_entry_t *ap;
+	xfs_acl_entry_t *gap = NULL;
+
+	/*
+	 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
+	 * be merged with GROUP entry, if there is a MASK.
+	 */
+	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
+		switch (ap->ae_tag) {
+		case ACL_USER_OBJ:
+			ap->ae_perm &= (mode >> 6) & 0x7;
+			break;
+		case ACL_GROUP_OBJ:
+			gap = ap;
+			break;
+		case ACL_MASK:
+			nomask = 0;
+			ap->ae_perm &= (mode >> 3) & 0x7;
+			break;
+		case ACL_OTHER:
+			ap->ae_perm &= mode & 0x7;
+			break;
+		default:
+			break;
+		}
+	}
+	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
+	if (gap && nomask)
+		gap->ae_perm &= (mode >> 3) & 0x7;
+}
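
As a worked example of the mode synchronization above: for mode 0640,
xfs_acl_sync_mode() sets

	ACL_USER_OBJ  perm = (0640 >> 6) & 7 = 6	(rw-)
	ACL_MASK      perm = (0640 >> 3) & 7 = 4	(r--)
	ACL_OTHER     perm =  0640       & 7 = 0	(---)

and leaves ACL_GROUP_OBJ untouched because a MASK entry is present; with no
MASK entry, ACL_GROUP_OBJ would receive the 4 instead.
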
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_acl.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_acl.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_acl.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_acl.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ACL_H__
+#define __XFS_ACL_H__
+
+/*
+ * Access Control Lists
+ */
+typedef __uint16_t	xfs_acl_perm_t;
+typedef __int32_t	xfs_acl_type_t;
+typedef __int32_t	xfs_acl_tag_t;
+typedef __int32_t	xfs_acl_id_t;
+
+#define XFS_ACL_MAX_ENTRIES 25
+#define XFS_ACL_NOT_PRESENT (-1)
+
+typedef struct xfs_acl_entry {
+	xfs_acl_tag_t	ae_tag;
+	xfs_acl_id_t	ae_id;
+	xfs_acl_perm_t	ae_perm;
+} xfs_acl_entry_t;
+
+typedef struct xfs_acl {
+	__int32_t	acl_cnt;
+	xfs_acl_entry_t acl_entry[XFS_ACL_MAX_ENTRIES];
+} xfs_acl_t;
+
+/* On-disk XFS extended attribute names */
+#define SGI_ACL_FILE	"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
+#define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
+
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+struct vattr;
+struct vnode;
+struct xfs_inode;
+
+extern int xfs_acl_inherit(struct vnode *, struct vattr *, xfs_acl_t *);
+extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
+extern int xfs_acl_get(struct vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_set(struct vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vtoacl(struct vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vhasacl_access(struct vnode *);
+extern int xfs_acl_vhasacl_default(struct vnode *);
+extern int xfs_acl_vset(struct vnode *, void *, size_t, int);
+extern int xfs_acl_vget(struct vnode *, void *, size_t, int);
+extern int xfs_acl_vremove(struct vnode *vp, int);
+
+extern struct kmem_zone *xfs_acl_zone;
+
+#define _ACL_TYPE_ACCESS	1
+#define _ACL_TYPE_DEFAULT	2
+#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
+
+#define _ACL_DECL(a)		xfs_acl_t *(a) = NULL
+#define _ACL_ALLOC(a)		((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP))
+#define _ACL_FREE(a)		((a)? kmem_zone_free(xfs_acl_zone, (a)) : 0)
+#define _ACL_ZONE_INIT(z,name)	((z) = kmem_zone_init(sizeof(xfs_acl_t), name))
+#define _ACL_ZONE_DESTROY(z)	(kmem_cache_destroy(z))
+#define _ACL_INHERIT(c,v,d)	(xfs_acl_inherit(c,v,d))
+#define _ACL_GET_ACCESS(pv,pa)	(xfs_acl_vtoacl(pv,pa,NULL) == 0)
+#define _ACL_GET_DEFAULT(pv,pd) (xfs_acl_vtoacl(pv,NULL,pd) == 0)
+#define _ACL_ACCESS_EXISTS	xfs_acl_vhasacl_access
+#define _ACL_DEFAULT_EXISTS	xfs_acl_vhasacl_default
+#define _ACL_XFS_IACCESS(i,m,c) (XFS_IFORK_Q(i) ? xfs_acl_iaccess(i,m,c) : -1)
+
+#else
+#define xfs_acl_vset(v,p,sz,t)	(-EOPNOTSUPP)
+#define xfs_acl_vget(v,p,sz,t)	(-EOPNOTSUPP)
+#define xfs_acl_vremove(v,t)	(-EOPNOTSUPP)
+#define _ACL_DECL(a)		((void)0)
+#define _ACL_ALLOC(a)		(1)	/* successfully allocate nothing */
+#define _ACL_FREE(a)		((void)0)
+#define _ACL_ZONE_INIT(z,name)	((void)0)
+#define _ACL_ZONE_DESTROY(z)	((void)0)
+#define _ACL_INHERIT(c,v,d)	(0)
+#define _ACL_GET_ACCESS(pv,pa)	(0)
+#define _ACL_GET_DEFAULT(pv,pd) (0)
+#define _ACL_ACCESS_EXISTS	(NULL)
+#define _ACL_DEFAULT_EXISTS	(NULL)
+#define _ACL_XFS_IACCESS(i,m,c) (-1)
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_ACL_H__ */
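
A sketch of the intended calling pattern for the _ACL_* macros
(example_acl_caller is hypothetical; with CONFIG_FS_POSIX_ACL disabled the
same code still compiles, since the stub _ACL_ALLOC() "succeeds" without
allocating):

	static int example_acl_caller(struct vnode *vp)
	{
		_ACL_DECL(acl);
		int error = 0;

		if (!_ACL_ALLOC(acl))
			return ENOMEM;
		if (!_ACL_GET_ACCESS(vp, acl))
			error = ENOATTR;	/* no access ACL attached */
		_ACL_FREE(acl);
		return error;
	}
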
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_ag.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_ag.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_ag.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_ag.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_AG_H__
+#define __XFS_AG_H__
+
+/*
+ * Allocation group header
+ * This is divided into three structures, placed in sequential 512-byte
+ * buffers after a copy of the superblock (also in a 512-byte buffer).
+ */
+
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_trans;
+
+#define XFS_AGF_MAGIC	0x58414746	/* 'XAGF' */
+#define XFS_AGI_MAGIC	0x58414749	/* 'XAGI' */
+#define XFS_AGF_VERSION 1
+#define XFS_AGI_VERSION 1
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_GOOD_VERSION)
+int xfs_agf_good_version(unsigned v);
+#define XFS_AGF_GOOD_VERSION(v) xfs_agf_good_version(v)
+#else
+#define XFS_AGF_GOOD_VERSION(v)		((v) == XFS_AGF_VERSION)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_GOOD_VERSION)
+int xfs_agi_good_version(unsigned v);
+#define XFS_AGI_GOOD_VERSION(v) xfs_agi_good_version(v)
+#else
+#define XFS_AGI_GOOD_VERSION(v)		((v) == XFS_AGI_VERSION)
+#endif
+
+/*
+ * Btree number 0 is bno, 1 is cnt.  This value gives the size of the
+ * arrays below.
+ */
+#define XFS_BTNUM_AGF	((int)XFS_BTNUM_CNTi + 1)
+
+/*
+ * The second word of agf_levels in the first a.g. overlaps the EFS
+ * superblock's magic number.  Since the magic numbers valid for EFS
+ * are > 64k, our value cannot be confused for an EFS superblock's.
+ */
+
+typedef struct xfs_agf
+{
+	/*
+	 * Common allocation group header information
+	 */
+	__uint32_t	agf_magicnum;	/* magic number == XFS_AGF_MAGIC */
+	__uint32_t	agf_versionnum; /* header version == XFS_AGF_VERSION */
+	xfs_agnumber_t	agf_seqno;	/* sequence # starting from 0 */
+	xfs_agblock_t	agf_length;	/* size in blocks of a.g. */
+	/*
+	 * Freespace information
+	 */
+	xfs_agblock_t	agf_roots[XFS_BTNUM_AGF];	/* root blocks */
+	__uint32_t	agf_spare0;	/* spare field */
+	__uint32_t	agf_levels[XFS_BTNUM_AGF];	/* btree levels */
+	__uint32_t	agf_spare1;	/* spare field */
+	__uint32_t	agf_flfirst;	/* first freelist block's index */
+	__uint32_t	agf_fllast;	/* last freelist block's index */
+	__uint32_t	agf_flcount;	/* count of blocks in freelist */
+	xfs_extlen_t	agf_freeblks;	/* total free blocks */
+	xfs_extlen_t	agf_longest;	/* longest free space */
+} xfs_agf_t;
+
+#define XFS_AGF_MAGICNUM	0x00000001
+#define XFS_AGF_VERSIONNUM	0x00000002
+#define XFS_AGF_SEQNO		0x00000004
+#define XFS_AGF_LENGTH		0x00000008
+#define XFS_AGF_ROOTS		0x00000010
+#define XFS_AGF_LEVELS		0x00000020
+#define XFS_AGF_FLFIRST		0x00000040
+#define XFS_AGF_FLLAST		0x00000080
+#define XFS_AGF_FLCOUNT		0x00000100
+#define XFS_AGF_FREEBLKS	0x00000200
+#define XFS_AGF_LONGEST		0x00000400
+#define XFS_AGF_NUM_BITS	11
+#define XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGF_DADDR		((xfs_daddr_t)1)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_BLOCK)
+xfs_agblock_t xfs_agf_block(struct xfs_mount *mp);
+#define XFS_AGF_BLOCK(mp)	xfs_agf_block(mp)
+#else
+#define XFS_AGF_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGF_DADDR)
+#endif
+
+/*
+ * Size of the unlinked inode hash table in the agi.
+ */
+#define XFS_AGI_UNLINKED_BUCKETS	64
+
+typedef struct xfs_agi
+{
+	/*
+	 * Common allocation group header information
+	 */
+	__uint32_t	agi_magicnum;	/* magic number == XFS_AGI_MAGIC */
+	__uint32_t	agi_versionnum; /* header version == XFS_AGI_VERSION */
+	xfs_agnumber_t	agi_seqno;	/* sequence # starting from 0 */
+	xfs_agblock_t	agi_length;	/* size in blocks of a.g. */
+	/*
+	 * Inode information
+	 * Inodes are mapped by interpreting the inode number, so no
+	 * mapping data is needed here.
+	 */
+	xfs_agino_t	agi_count;	/* count of allocated inodes */
+	xfs_agblock_t	agi_root;	/* root of inode btree */
+	__uint32_t	agi_level;	/* levels in inode btree */
+	xfs_agino_t	agi_freecount;	/* number of free inodes */
+	xfs_agino_t	agi_newino;	/* new inode just allocated */
+	xfs_agino_t	agi_dirino;	/* last directory inode chunk */
+	/*
+	 * Hash table of inodes which have been unlinked but are
+	 * still being referenced.
+	 */
+	xfs_agino_t	agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+} xfs_agi_t;
+
+#define XFS_AGI_MAGICNUM	0x00000001
+#define XFS_AGI_VERSIONNUM	0x00000002
+#define XFS_AGI_SEQNO		0x00000004
+#define XFS_AGI_LENGTH		0x00000008
+#define XFS_AGI_COUNT		0x00000010
+#define XFS_AGI_ROOT		0x00000020
+#define XFS_AGI_LEVEL		0x00000040
+#define XFS_AGI_FREECOUNT	0x00000080
+#define XFS_AGI_NEWINO		0x00000100
+#define XFS_AGI_DIRINO		0x00000200
+#define XFS_AGI_UNLINKED	0x00000400
+#define XFS_AGI_NUM_BITS	11
+#define XFS_AGI_ALL_BITS	((1 << XFS_AGI_NUM_BITS) - 1)
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGI_DADDR		((xfs_daddr_t)2)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_BLOCK)
+xfs_agblock_t xfs_agi_block(struct xfs_mount *mp);
+#define XFS_AGI_BLOCK(mp)	xfs_agi_block(mp)
+#else
+#define XFS_AGI_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGI_DADDR)
+#endif
+
+/*
+ * The third a.g. block contains the a.g. freelist, an array
+ * of block pointers to blocks owned by the allocation btree code.
+ */
+#define XFS_AGFL_DADDR		((xfs_daddr_t)3)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGFL_BLOCK)
+xfs_agblock_t xfs_agfl_block(struct xfs_mount *mp);
+#define XFS_AGFL_BLOCK(mp)	xfs_agfl_block(mp)
+#else
+#define XFS_AGFL_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR)
+#endif
+#define XFS_AGFL_SIZE		(BBSIZE / sizeof(xfs_agblock_t))
+typedef struct xfs_agfl
+{
+	xfs_agblock_t	agfl_bno[XFS_AGFL_SIZE];
+} xfs_agfl_t;
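+
+/*
+ * The freelist is in effect a circular buffer: agf_flfirst and
+ * agf_fllast in the AGF index into agfl_bno[], wrapping modulo
+ * XFS_AGFL_SIZE, with agf_flcount entries currently active.
+ */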
+
+/*
+ * Busy block/extent entry.  Used in perag to mark blocks that have been freed
+ * but whose transactions aren't committed to disk yet.
+ */
+typedef struct xfs_perag_busy {
+	xfs_agblock_t	busy_start;
+	xfs_extlen_t	busy_length;
+	struct xfs_trans *busy_tp;	/* transaction that did the free */
+} xfs_perag_busy_t;
+
+/*
+ * Per-ag incore structure, copies of information in agf and agi,
+ * to improve the performance of allocation group selection.
+ *
+ * Pick sizes which fit well in allocation buckets.
+ */
+#if (BITS_PER_LONG == 32)
+#define XFS_PAGB_NUM_SLOTS	84
+#elif (BITS_PER_LONG == 64)
+#define XFS_PAGB_NUM_SLOTS	128
+#endif
+
+typedef struct xfs_perag
+{
+	char		pagf_init;	/* this agf's entry is initialized */
+	char		pagi_init;	/* this agi's entry is initialized */
+	char		pagf_metadata;	/* the agf is preferred to be metadata */
+	char		pagi_inodeok;	/* the agi is ok for inodes */
+	__uint8_t	pagf_levels[XFS_BTNUM_AGF];
+					/* # of levels in bno & cnt btree */
+	__uint32_t	pagf_flcount;	/* count of blocks in freelist */
+	xfs_extlen_t	pagf_freeblks;	/* total free blocks */
+	xfs_extlen_t	pagf_longest;	/* longest free space */
+	xfs_agino_t	pagi_freecount; /* number of free inodes */
+#ifdef __KERNEL__
+	lock_t		pagb_lock;	/* lock for pagb_list */
+#endif
+	int		pagb_count;	/* pagb slots in use */
+	xfs_perag_busy_t *pagb_list;	/* unstable blocks */
+} xfs_perag_t;
+
+#define XFS_AG_MIN_BYTES	(1LL << 24)	/* 16 MB */
+#define XFS_AG_BEST_BYTES	(1LL << 30)	/*  1 GB */
+#define XFS_AG_MAX_BYTES	(1LL << 32)	/*  4 GB */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MIN_BLOCKS)
+xfs_extlen_t xfs_ag_min_blocks(int bl);
+#define XFS_AG_MIN_BLOCKS(bl)		xfs_ag_min_blocks(bl)
+#else
+#define XFS_AG_MIN_BLOCKS(bl)	((xfs_extlen_t)(XFS_AG_MIN_BYTES >> bl))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_BEST_BLOCKS)
+xfs_extlen_t xfs_ag_best_blocks(int bl, xfs_drfsbno_t blks);
+#define XFS_AG_BEST_BLOCKS(bl,blks)	xfs_ag_best_blocks(bl,blks)
+#else
+/*--#define	XFS_AG_BEST_BLOCKS(bl)	((xfs_extlen_t)(XFS_AG_BEST_BYTES >> bl))*/
+/*
+ * Best is XFS_AG_BEST_BLOCKS at and below 64 Gigabyte filesystems, and
+ * XFS_AG_MAX_BLOCKS above 64 Gigabytes.
+ */
+#define XFS_AG_BEST_BLOCKS(bl,blks)	((xfs_extlen_t)((1LL << (36 - bl)) >= \
+							blks) ? \
+							((xfs_extlen_t)(XFS_AG_BEST_BYTES >> bl)) : \
+							XFS_AG_MAX_BLOCKS(bl))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MAX_BLOCKS)
+xfs_extlen_t xfs_ag_max_blocks(int bl);
+#define XFS_AG_MAX_BLOCKS(bl)		xfs_ag_max_blocks(bl)
+#else
+#define XFS_AG_MAX_BLOCKS(bl)	((xfs_extlen_t)(XFS_AG_MAX_BYTES >> bl))
+#endif
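+
+/*
+ * Worked example of the sizing macros above for 4KB filesystem blocks
+ * (bl == 12):
+ *	XFS_AG_MIN_BLOCKS(12)  == 2^24 >> 12 == 4096 blocks	(16 MB)
+ *	XFS_AG_MAX_BLOCKS(12)  == 2^32 >> 12 == 1048576 blocks	(4 GB)
+ *	XFS_AG_BEST_BLOCKS(12,blks) yields 2^30 >> 12 == 262144 blocks
+ *	(1 GB) while blks <= 2^(36-12), i.e. up to a 64 GB filesystem,
+ *	and XFS_AG_MAX_BLOCKS(12) beyond that.
+ */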
+
+#define XFS_MAX_AGNUMBER	((xfs_agnumber_t)(NULLAGNUMBER - 1))
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MAXLEVELS)
+int xfs_ag_maxlevels(struct xfs_mount *mp);
+#define XFS_AG_MAXLEVELS(mp)		xfs_ag_maxlevels(mp)
+#else
+#define XFS_AG_MAXLEVELS(mp)	((mp)->m_ag_maxlevels)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST)
+int xfs_min_freelist(xfs_agf_t *a, struct xfs_mount *mp);
+#define XFS_MIN_FREELIST(a,mp)		xfs_min_freelist(a,mp)
+#else
+#define XFS_MIN_FREELIST(a,mp)	\
+	XFS_MIN_FREELIST_RAW(	\
+		INT_GET((a)->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT), \
+		INT_GET((a)->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT), mp)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_PAG)
+int xfs_min_freelist_pag(xfs_perag_t *pag, struct xfs_mount *mp);
+#define XFS_MIN_FREELIST_PAG(pag,mp)	xfs_min_freelist_pag(pag,mp)
+#else
+#define XFS_MIN_FREELIST_PAG(pag,mp)	\
+	XFS_MIN_FREELIST_RAW((uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+			     (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_RAW)
+int xfs_min_freelist_raw(int bl, int cl, struct xfs_mount *mp);
+#define XFS_MIN_FREELIST_RAW(bl,cl,mp)	xfs_min_freelist_raw(bl,cl,mp)
+#else
+#define XFS_MIN_FREELIST_RAW(bl,cl,mp)	\
+	(MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + \
+	 MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_FSB)
+xfs_fsblock_t xfs_agb_to_fsb(struct xfs_mount *mp, xfs_agnumber_t agno,
+			     xfs_agblock_t agbno);
+#define XFS_AGB_TO_FSB(mp,agno,agbno)	xfs_agb_to_fsb(mp,agno,agbno)
+#else
+#define XFS_AGB_TO_FSB(mp,agno,agbno) \
+	(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGNO)
+xfs_agnumber_t xfs_fsb_to_agno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
+#define XFS_FSB_TO_AGNO(mp,fsbno)	xfs_fsb_to_agno(mp,fsbno)
+#else
+#define XFS_FSB_TO_AGNO(mp,fsbno) \
+	((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGBNO)
+xfs_agblock_t xfs_fsb_to_agbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
+#define XFS_FSB_TO_AGBNO(mp,fsbno)	xfs_fsb_to_agbno(mp,fsbno)
+#else
+#define XFS_FSB_TO_AGBNO(mp,fsbno) \
+	((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog)))
+#endif
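+
+/*
+ * Worked example of the encoding above: with sb_agblklog == 16, block 5
+ * of AG 3 has fsbno == (3 << 16) | 5 == 0x30005; XFS_FSB_TO_AGNO
+ * recovers the 3 by shifting right, XFS_FSB_TO_AGBNO the 5 by masking
+ * the low 16 bits.
+ */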
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_DADDR)
+xfs_daddr_t xfs_agb_to_daddr(struct xfs_mount *mp, xfs_agnumber_t agno,
+			 xfs_agblock_t agbno);
+#define XFS_AGB_TO_DADDR(mp,agno,agbno) xfs_agb_to_daddr(mp,agno,agbno)
+#else
+#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
+	((xfs_daddr_t)(XFS_FSB_TO_BB(mp, \
+		(xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))))
+#endif
+/*
+ * XFS_DADDR_TO_AGNO and XFS_DADDR_TO_AGBNO moved to xfs_mount.h
+ * to avoid header file ordering change
+ */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_DADDR)
+xfs_daddr_t xfs_ag_daddr(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_daddr_t d);
+#define XFS_AG_DADDR(mp,agno,d)		xfs_ag_daddr(mp,agno,d)
+#else
+#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGF)
+xfs_agf_t *xfs_buf_to_agf(struct xfs_buf *bp);
+#define XFS_BUF_TO_AGF(bp)		xfs_buf_to_agf(bp)
+#else
+#define XFS_BUF_TO_AGF(bp)	((xfs_agf_t *)XFS_BUF_PTR(bp))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGI)
+xfs_agi_t *xfs_buf_to_agi(struct xfs_buf *bp);
+#define XFS_BUF_TO_AGI(bp)		xfs_buf_to_agi(bp)
+#else
+#define XFS_BUF_TO_AGI(bp)	((xfs_agi_t *)XFS_BUF_PTR(bp))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGFL)
+xfs_agfl_t *xfs_buf_to_agfl(struct xfs_buf *bp);
+#define XFS_BUF_TO_AGFL(bp)		xfs_buf_to_agfl(bp)
+#else
+#define XFS_BUF_TO_AGFL(bp)	((xfs_agfl_t *)XFS_BUF_PTR(bp))
+#endif
+
+/*
+ * For checking for bad ranges of xfs_daddr_t's, covering multiple
+ * allocation groups or a single xfs_daddr_t that's a superblock copy.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_CHECK_DADDR)
+void xfs_ag_check_daddr(struct xfs_mount *mp, xfs_daddr_t d, xfs_extlen_t len);
+#define XFS_AG_CHECK_DADDR(mp,d,len)	xfs_ag_check_daddr(mp,d,len)
+#else
+#define XFS_AG_CHECK_DADDR(mp,d,len)	\
+	((len) == 1 ? \
+	    ASSERT((d) == XFS_SB_DADDR || \
+		   XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \
+	    ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \
+		   XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1)))
+#endif
+
+#endif	/* __XFS_AG_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_alloc.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_alloc.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_alloc.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_alloc.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,2639 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Free space allocation for XFS.
+ */
+#include <xfs.h>
+
+#if defined(DEBUG)
+/*
+ * Allocation tracing.
+ */
+ktrace_t	*xfs_alloc_trace_buf;
+#endif
+
+#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
+
+#define XFSA_FIXUP_BNO_OK	1
+#define XFSA_FIXUP_CNT_OK	2
+
+int
+xfs_alloc_search_busy(xfs_trans_t *tp,
+		    xfs_agnumber_t agno,
+		    xfs_agblock_t bno,
+		    xfs_extlen_t len);
+
+#if defined(XFS_ALLOC_TRACE)
+#define TRACE_ALLOC(s,a)	\
+	xfs_alloc_trace_alloc(fname, s, a, __LINE__)
+#define TRACE_FREE(s,a,b,x,f)	\
+	xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__)
+#define TRACE_MODAGF(s,a,f)	\
+	xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__)
+#define TRACE_BUSY(fname,s,ag,agb,l,sl,tp)	\
+	xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
+#define TRACE_UNBUSY(fname,s,ag,sl,tp)	\
+	xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
+#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)	\
+	xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
+
+
+#else
+#define TRACE_ALLOC(s,a)
+#define TRACE_FREE(s,a,b,x,f)
+#define TRACE_MODAGF(s,a,f)
+#define TRACE_BUSY(s,a,ag,agb,l,sl,tp)
+#define TRACE_UNBUSY(fname,s,ag,sl,tp)
+#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)
+#endif	/* XFS_ALLOC_TRACE */
+
+/*
+ * Prototypes for per-ag allocation routines
+ */
+
+STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
+	xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+
+/*
+ * Internal functions.
+ */
+
+/*
+ * Compute aligned version of the found extent.
+ * Takes alignment and min length into account.
+ */
+STATIC int				/* success (>= minlen) */
+xfs_alloc_compute_aligned(
+	xfs_agblock_t	foundbno,	/* starting block in found extent */
+	xfs_extlen_t	foundlen,	/* length in found extent */
+	xfs_extlen_t	alignment,	/* alignment for allocation */
+	xfs_extlen_t	minlen,		/* minimum length for allocation */
+	xfs_agblock_t	*resbno,	/* result block number */
+	xfs_extlen_t	*reslen)	/* result length */
+{
+	xfs_agblock_t	bno;
+	xfs_extlen_t	diff;
+	xfs_extlen_t	len;
+
+	if (alignment > 1 && foundlen >= minlen) {
+		bno = roundup(foundbno, alignment);
+		diff = bno - foundbno;
+		len = diff >= foundlen ? 0 : foundlen - diff;
+	} else {
+		bno = foundbno;
+		len = foundlen;
+	}
+	*resbno = bno;
+	*reslen = len;
+	return len >= minlen;
+}
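+
+/*
+ * Worked example for the function above: foundbno == 7, foundlen == 10,
+ * alignment == 4, minlen == 5.  bno is rounded up to 8, diff == 1, so
+ * len == 9 and the function succeeds (9 >= 5); with minlen == 10 the
+ * same extent would fail, since alignment costs one block.
+ */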
+
+/*
+ * Compute best start block and diff for "near" allocations.
+ * freelen >= wantlen already checked by caller.
+ */
+STATIC xfs_extlen_t			/* difference value (absolute) */
+xfs_alloc_compute_diff(
+	xfs_agblock_t	wantbno,	/* target starting block */
+	xfs_extlen_t	wantlen,	/* target length */
+	xfs_extlen_t	alignment,	/* target alignment */
+	xfs_agblock_t	freebno,	/* freespace's starting block */
+	xfs_extlen_t	freelen,	/* freespace's length */
+	xfs_agblock_t	*newbnop)	/* result: best start block from free */
+{
+	xfs_agblock_t	freeend;	/* end of freespace extent */
+	xfs_agblock_t	newbno1;	/* return block number */
+	xfs_agblock_t	newbno2;	/* other new block number */
+	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
+	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
+	xfs_agblock_t	wantend;	/* end of target extent */
+
+	ASSERT(freelen >= wantlen);
+	freeend = freebno + freelen;
+	wantend = wantbno + wantlen;
+	if (freebno >= wantbno) {
+		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
+			newbno1 = NULLAGBLOCK;
+	} else if (freeend >= wantend && alignment > 1) {
+		newbno1 = roundup(wantbno, alignment);
+		newbno2 = newbno1 - alignment;
+		if (newbno1 >= freeend)
+			newbno1 = NULLAGBLOCK;
+		else
+			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
+		if (newbno2 < freebno)
+			newbno2 = NULLAGBLOCK;
+		else
+			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
+		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
+			if (newlen1 < newlen2 ||
+			    (newlen1 == newlen2 &&
+			     XFS_ABSDIFF(newbno1, wantbno) >
+			     XFS_ABSDIFF(newbno2, wantbno)))
+				newbno1 = newbno2;
+		} else if (newbno2 != NULLAGBLOCK)
+			newbno1 = newbno2;
+	} else if (freeend >= wantend) {
+		newbno1 = wantbno;
+	} else if (alignment > 1) {
+		newbno1 = roundup(freeend - wantlen, alignment);
+		if (newbno1 > freeend - wantlen &&
+		    newbno1 - alignment >= freebno)
+			newbno1 -= alignment;
+		else if (newbno1 >= freeend)
+			newbno1 = NULLAGBLOCK;
+	} else
+		newbno1 = freeend - wantlen;
+	*newbnop = newbno1;
+	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+}
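+
+/*
+ * Worked example for the function above: wantbno == 100, wantlen == 8,
+ * alignment == 1, freebno == 90, freelen == 40.  Then freeend (130) >=
+ * wantend (108) and there is no alignment constraint, so *newbnop ==
+ * wantbno == 100 and the returned difference is 0, a perfect fit.
+ */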
+
+/*
+ * Fix up the length, based on mod and prod.
+ * len should be k * prod + mod for some k.
+ * If len is too small it is returned unchanged.
+ * If len hits maxlen it is left alone.
+ */
+STATIC void
+xfs_alloc_fix_len(
+	xfs_alloc_arg_t *args)		/* allocation argument structure */
+{
+	xfs_extlen_t	k;
+	xfs_extlen_t	rlen;
+
+	ASSERT(args->mod < args->prod);
+	rlen = args->len;
+	ASSERT(rlen >= args->minlen);
+	ASSERT(rlen <= args->maxlen);
+	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
+	    (args->mod == 0 && rlen < args->prod))
+		return;
+	k = rlen % args->prod;
+	if (k == args->mod)
+		return;
+	if (k > args->mod) {
+		if ((int)(rlen = rlen - k + args->mod) < (int)args->minlen)
+			return;
+	} else {
+		if ((int)(rlen = rlen - args->prod + (args->mod - k)) <
+		    (int)args->minlen)
+			return;
+	}
+	ASSERT(rlen >= args->minlen);
+	ASSERT(rlen <= args->maxlen);
+	args->len = rlen;
+}
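+
+/*
+ * Worked example for the function above: prod == 4, mod == 1,
+ * minlen == 3, len == 11.  Then k == 11 % 4 == 3 > mod, so len is
+ * trimmed to 11 - 3 + 1 == 9, which satisfies 9 % 4 == mod and is
+ * still >= minlen.
+ */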
+
+/*
+ * Fix up length if there is too little space left in the a.g.
+ * Return 1 if ok, 0 if too little, should give up.
+ */
+STATIC int
+xfs_alloc_fix_minleft(
+	xfs_alloc_arg_t *args)		/* allocation argument structure */
+{
+	xfs_agf_t	*agf;		/* a.g. freelist header */
+	int		diff;		/* free space difference */
+
+	if (args->minleft == 0)
+		return 1;
+	agf = XFS_BUF_TO_AGF(args->agbp);
+	diff = INT_GET(agf->agf_freeblks, ARCH_CONVERT)
+		+ INT_GET(agf->agf_flcount, ARCH_CONVERT)
+		- args->len - args->minleft;
+	if (diff >= 0)
+		return 1;
+	args->len += diff;		/* shrink the allocated space */
+	if (args->len >= args->minlen)
+		return 1;
+	args->agbno = NULLAGBLOCK;
+	return 0;
+}
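+
+/*
+ * Worked example for the function above: agf_freeblks + agf_flcount ==
+ * 100, len == 90, minleft == 15.  diff == -5, so len shrinks to 85;
+ * that still succeeds if 85 >= minlen, otherwise agbno is set to
+ * NULLAGBLOCK and 0 is returned.
+ */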
+
+/*
+ * Update the two btrees, logically removing from freespace the extent
+ * starting at rbno, rlen blocks.  The extent is contained within the
+ * actual (current) free extent fbno for flen blocks.
+ * Flags are passed in indicating whether the cursors are set to the
+ * relevant records.
+ */
+STATIC int				/* error code */
+xfs_alloc_fixup_trees(
+	xfs_btree_cur_t *cnt_cur,	/* cursor for by-size btree */
+	xfs_btree_cur_t *bno_cur,	/* cursor for by-block btree */
+	xfs_agblock_t	fbno,		/* starting block of free extent */
+	xfs_extlen_t	flen,		/* length of free extent */
+	xfs_agblock_t	rbno,		/* starting block of returned extent */
+	xfs_extlen_t	rlen,		/* length of returned extent */
+	int		flags)		/* flags, XFSA_FIXUP_... */
+{
+	int		error;		/* error code */
+	int		i;		/* operation results */
+	xfs_agblock_t	nfbno1;		/* first new free startblock */
+	xfs_agblock_t	nfbno2;		/* second new free startblock */
+	xfs_extlen_t	nflen1=0;	/* first new free length */
+	xfs_extlen_t	nflen2=0;	/* second new free length */
+
+	/*
+	 * Look up the record in the by-size tree if necessary.
+	 */
+	if (flags & XFSA_FIXUP_CNT_OK) {
+#ifdef DEBUG
+		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(
+			i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+	} else {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	/*
+	 * Look up the record in the by-block tree if necessary.
+	 */
+	if (flags & XFSA_FIXUP_BNO_OK) {
+#ifdef DEBUG
+		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(
+			i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+	} else {
+		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+#ifdef DEBUG
+	{
+		xfs_alloc_block_t	*bnoblock;
+		xfs_alloc_block_t	*cntblock;
+
+		if (bno_cur->bc_nlevels == 1 &&
+		    cnt_cur->bc_nlevels == 1) {
+			bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]);
+			cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]);
+			XFS_WANT_CORRUPTED_RETURN(
+				INT_GET(bnoblock->bb_numrecs, ARCH_CONVERT) == INT_GET(cntblock->bb_numrecs, ARCH_CONVERT));
+		}
+	}
+#endif
+	/*
+	 * Deal with all four cases: the allocated record is contained
+	 * within the freespace record, so we can have new freespace
+	 * at either (or both) end, or no freespace remaining.
+	 */
+	if (rbno == fbno && rlen == flen)
+		nfbno1 = nfbno2 = NULLAGBLOCK;
+	else if (rbno == fbno) {
+		nfbno1 = rbno + rlen;
+		nflen1 = flen - rlen;
+		nfbno2 = NULLAGBLOCK;
+	} else if (rbno + rlen == fbno + flen) {
+		nfbno1 = fbno;
+		nflen1 = flen - rlen;
+		nfbno2 = NULLAGBLOCK;
+	} else {
+		nfbno1 = fbno;
+		nflen1 = rbno - fbno;
+		nfbno2 = rbno + rlen;
+		nflen2 = (fbno + flen) - nfbno2;
+	}
+	/*
+	 * Delete the entry from the by-size btree.
+	 */
+	if ((error = xfs_alloc_delete(cnt_cur, &i)))
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(i == 1);
+	/*
+	 * Add new by-size btree entry(s).
+	 */
+	if (nfbno1 != NULLAGBLOCK) {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		if ((error = xfs_alloc_insert(cnt_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	if (nfbno2 != NULLAGBLOCK) {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		if ((error = xfs_alloc_insert(cnt_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	/*
+	 * Fix up the by-block btree entry(s).
+	 */
+	if (nfbno1 == NULLAGBLOCK) {
+		/*
+		 * No remaining freespace, just delete the by-block tree entry.
+		 */
+		if ((error = xfs_alloc_delete(bno_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	} else {
+		/*
		 * Update the by-block entry to start later or be shorter.
+		 */
+		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
+			return error;
+	}
+	if (nfbno2 != NULLAGBLOCK) {
+		/*
+		 * 2 resulting free entries, need to add one.
+		 */
+		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		if ((error = xfs_alloc_insert(bno_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	return 0;
+}
+
+/*
+ * Read in the allocation group free block array.
+ */
+STATIC int				/* error */
+xfs_alloc_read_agfl(
+	xfs_mount_t	*mp,		/* mount point structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_buf_t	**bpp)		/* buffer for the ag free block array */
+{
+	xfs_buf_t	*bp;		/* return value */
+	xfs_daddr_t	d;		/* disk block address */
+	int		error;
+
+	ASSERT(agno != NULLAGNUMBER);
+	d = XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR);
+	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 1, 0, &bp)))
+		return error;
+	ASSERT(bp);
+	ASSERT(!XFS_BUF_GETERROR(bp));
+	XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
+	*bpp = bp;
+	return 0;
+}
+
+#if defined(XFS_ALLOC_TRACE)
+/*
+ * Add an allocation trace entry for an alloc call.
+ */
+STATIC void
+xfs_alloc_trace_alloc(
+	char		*name,		/* function tag string */
+	char		*str,		/* additional string */
+	xfs_alloc_arg_t *args,		/* allocation argument structure */
+	int		line)		/* source line number */
+{
+	ktrace_enter(xfs_alloc_trace_buf,
+		(void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)),
+		(void *)name,
+		(void *)str,
+		(void *)args->mp,
+		(void *)(__psunsigned_t)args->agno,
+		(void *)(__psunsigned_t)args->agbno,
+		(void *)(__psunsigned_t)args->minlen,
+		(void *)(__psunsigned_t)args->maxlen,
+		(void *)(__psunsigned_t)args->mod,
+		(void *)(__psunsigned_t)args->prod,
+		(void *)(__psunsigned_t)args->minleft,
+		(void *)(__psunsigned_t)args->total,
+		(void *)(__psunsigned_t)args->alignment,
+		(void *)(__psunsigned_t)args->len,
+		(void *)((((__psint_t)args->type) << 16) |
+			 (__psint_t)args->otype),
+		(void *)(__psint_t)((args->wasdel << 3) |
+				    (args->wasfromfl << 2) |
+				    (args->isfl << 1) |
+				    (args->userdata << 0)));
+}
+
+/*
+ * Add an allocation trace entry for a free call.
+ */
+STATIC void
+xfs_alloc_trace_free(
+	char		*name,		/* function tag string */
+	char		*str,		/* additional string */
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* a.g. relative block number */
+	xfs_extlen_t	len,		/* length of extent */
+	int		isfl,		/* set if is freelist allocation/free */
+	int		line)		/* source line number */
+{
+	ktrace_enter(xfs_alloc_trace_buf,
+		(void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)),
+		(void *)name,
+		(void *)str,
+		(void *)mp,
+		(void *)(__psunsigned_t)agno,
+		(void *)(__psunsigned_t)agbno,
+		(void *)(__psunsigned_t)len,
+		(void *)(__psint_t)isfl,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add an allocation trace entry for modifying an agf.
+ */
+STATIC void
+xfs_alloc_trace_modagf(
+	char		*name,		/* function tag string */
+	char		*str,		/* additional string */
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_agf_t	*agf,		/* new agf value */
+	int		flags,		/* logging flags for agf */
+	int		line)		/* source line number */
+{
+	ktrace_enter(xfs_alloc_trace_buf,
+		(void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)),
+		(void *)name,
+		(void *)str,
+		(void *)mp,
+		(void *)(__psint_t)flags,
+		(void *)(__psunsigned_t)INT_GET(agf->agf_seqno, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_length, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_BNO],
+						ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_CNT],
+						ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_BNO],
+						ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_CNT],
+						ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_flfirst, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_fllast, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_flcount, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_freeblks, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_longest, ARCH_CONVERT));
+}
+
+STATIC void
+xfs_alloc_trace_busy(
+	char		*name,		/* function tag string */
+	char		*str,		/* additional string */
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* a.g. relative block number */
+	xfs_extlen_t	len,		/* length of extent */
+	int		slot,		/* perag Busy slot */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		trtype,		/* type: add, delete, search */
+	int		line)		/* source line number */
+{
+	ktrace_enter(xfs_alloc_trace_buf,
+		(void *)(__psint_t)(trtype | (line << 16)),
+		(void *)name,
+		(void *)str,
+		(void *)mp,
+		(void *)(__psunsigned_t)agno,
+		(void *)(__psunsigned_t)agbno,
+		(void *)(__psunsigned_t)len,
+		(void *)(__psint_t)slot,
+		(void *)tp,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+}
+#endif	/* XFS_ALLOC_TRACE */
+
+/*
+ * Allocation group level functions.
+ */
+
+/*
+ * Allocate a variable extent in the allocation group agno.
+ * Type and bno are used to determine where in the allocation group the
+ * extent will start.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent(
+	xfs_alloc_arg_t *args)	/* argument structure for allocation */
+{
+	int		error=0;
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_ag_vextent";
+#endif
+
+	ASSERT(args->minlen > 0);
+	ASSERT(args->maxlen > 0);
+	ASSERT(args->minlen <= args->maxlen);
+	ASSERT(args->mod < args->prod);
+	ASSERT(args->alignment > 0);
+	/*
+	 * Branch to correct routine based on the type.
+	 */
+	args->wasfromfl = 0;
+	switch (args->type) {
+	case XFS_ALLOCTYPE_THIS_AG:
+		error = xfs_alloc_ag_vextent_size(args);
+		break;
+	case XFS_ALLOCTYPE_NEAR_BNO:
+		error = xfs_alloc_ag_vextent_near(args);
+		break;
+	case XFS_ALLOCTYPE_THIS_BNO:
+		error = xfs_alloc_ag_vextent_exact(args);
+		break;
+	default:
+		ASSERT(0);
+		/* NOTREACHED */
+	}
+	if (error)
+		return error;
+	/*
+	 * If the allocation worked, need to change the agf structure
+	 * (and log it), and the superblock.
+	 */
+	if (args->agbno != NULLAGBLOCK) {
+		xfs_agf_t	*agf;	/* allocation group freelist header */
+#ifdef XFS_ALLOC_TRACE
+		xfs_mount_t	*mp = args->mp;
+#endif
+		long		slen = (long)args->len;
+
+		ASSERT(args->len >= args->minlen && args->len <= args->maxlen);
+		ASSERT(!(args->wasfromfl) || !args->isfl);
+		ASSERT(args->agbno % args->alignment == 0);
+		if (!(args->wasfromfl)) {
+
+			agf = XFS_BUF_TO_AGF(args->agbp);
+			INT_MOD(agf->agf_freeblks, ARCH_CONVERT, -(args->len));
+			xfs_trans_agblocks_delta(args->tp,
+						 -((long)(args->len)));
+			args->pag->pagf_freeblks -= args->len;
+			ASSERT(INT_GET(agf->agf_freeblks, ARCH_CONVERT)
+				<= INT_GET(agf->agf_length, ARCH_CONVERT));
+			TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
+			xfs_alloc_log_agf(args->tp, args->agbp,
+						XFS_AGF_FREEBLKS);
+			/* search the busylist for these blocks */
+			xfs_alloc_search_busy(args->tp, args->agno,
+					args->agbno, args->len);
+		}
+		if (!args->isfl)
+			xfs_trans_mod_sb(args->tp,
+				args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
+					XFS_TRANS_SB_FDBLOCKS, -slen);
+		XFS_STATS_INC(xfsstats.xs_allocx);
+		XFS_STATS_ADD(xfsstats.xs_allocb, args->len);
+	}
+	return 0;
+}
+
+/*
+ * Allocate a variable extent at exactly agno/bno.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent_exact(
+	xfs_alloc_arg_t *args)	/* allocation argument structure */
+{
+	xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
+	xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
+	xfs_agblock_t	end;	/* end of allocated extent */
+	int		error;
+	xfs_agblock_t	fbno;	/* start block of found extent */
+	xfs_agblock_t	fend;	/* end block of found extent */
+	xfs_extlen_t	flen;	/* length of found extent */
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_ag_vextent_exact";
+#endif
+	int		i;	/* success/failure of operation */
+	xfs_agblock_t	maxend; /* end of maximal extent */
+	xfs_agblock_t	minend; /* end of minimal extent */
+	xfs_extlen_t	rlen;	/* length of returned extent */
+
+	ASSERT(args->alignment == 1);
+	/*
+	 * Allocate/initialize a cursor for the by-number freespace btree.
+	 */
+	bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO, 0, 0);
+	/*
+	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
+	 * Look for the closest free block <= bno, it must contain bno
+	 * if any free block does.
+	 */
+	if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i)))
+		goto error0;
+	if (!i) {
+		/*
+		 * Didn't find it, return null.
+		 */
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+		args->agbno = NULLAGBLOCK;
+		return 0;
+	}
+	/*
+	 * Grab the freespace record.
+	 */
+	if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i)))
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	ASSERT(fbno <= args->agbno);
+	minend = args->agbno + args->minlen;
+	maxend = args->agbno + args->maxlen;
+	fend = fbno + flen;
+	/*
+	 * Give up if the freespace isn't long enough for the minimum request.
+	 */
+	if (fend < minend) {
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+		args->agbno = NULLAGBLOCK;
+		return 0;
+	}
+	/*
+	 * End of extent will be smaller of the freespace end and the
+	 * maximal requested end.
+	 */
+	end = XFS_AGBLOCK_MIN(fend, maxend);
+	/*
+	 * Fix the length according to mod and prod if given.
+	 */
+	args->len = end - args->agbno;
+	xfs_alloc_fix_len(args);
+	if (!xfs_alloc_fix_minleft(args)) {
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+		return 0;
+	}
+	rlen = args->len;
+	ASSERT(args->agbno + rlen <= fend);
+	end = args->agbno + rlen;
+	/*
+	 * We are allocating rlen blocks starting at agbno, i.e. [agbno .. end).
+	 * Allocate/initialize a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT, 0, 0);
+	ASSERT(args->agbno + args->len <=
+		INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
+			ARCH_CONVERT));
+	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+			args->agbno, args->len, XFSA_FIXUP_BNO_OK))) {
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+		goto error0;
+	}
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	TRACE_ALLOC("normal", args);
+	args->wasfromfl = 0;
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	TRACE_ALLOC("error", args);
+	return error;
+}
+
+/*
+ * Allocate a variable extent near bno in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int				/* error */
+xfs_alloc_ag_vextent_near(
+	xfs_alloc_arg_t *args)		/* allocation argument structure */
+{
+	xfs_btree_cur_t *bno_cur_gt;	/* cursor for bno btree, right side */
+	xfs_btree_cur_t *bno_cur_lt;	/* cursor for bno btree, left side */
+	xfs_btree_cur_t *cnt_cur;	/* cursor for count btree */
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_ag_vextent_near";
+#endif
+	xfs_agblock_t	gtbno;		/* start bno of right side entry */
+	xfs_agblock_t	gtbnoa;		/* aligned ... */
+	xfs_extlen_t	gtdiff;		/* difference to right side entry */
+	xfs_extlen_t	gtlen;		/* length of right side entry */
+	xfs_extlen_t	gtlena;		/* aligned ... */
+	xfs_agblock_t	gtnew;		/* useful start bno of right side */
+	int		error;		/* error code */
+	int		i;		/* result code, temporary */
+	int		j;		/* result code, temporary */
+	xfs_agblock_t	ltbno;		/* start bno of left side entry */
+	xfs_agblock_t	ltbnoa;		/* aligned ... */
+	xfs_extlen_t	ltdiff;		/* difference to left side entry */
+	/*REFERENCED*/
+	xfs_agblock_t	ltend;		/* end bno of left side entry */
+	xfs_extlen_t	ltlen;		/* length of left side entry */
+	xfs_extlen_t	ltlena;		/* aligned ... */
+	xfs_agblock_t	ltnew;		/* useful start bno of left side */
+	xfs_extlen_t	rlen;		/* length of returned extent */
+#if defined(DEBUG) && defined(__KERNEL__)
+	/*
+	 * Randomly don't execute the first algorithm.
+	 */
+	static int	seed;		/* randomizing seed value */
+	int		dofirst;	/* set to do first algorithm */
+	timespec_t	now;		/* current time */
+
+	if (!seed) {
+		nanotime(&now);
+		seed = (int)now.tv_sec ^ (int)now.tv_nsec;
+	}
+	dofirst = random() & 1;
+#endif
+	/*
+	 * Get a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT, 0, 0);
+	ltlen = 0;
+	bno_cur_lt = bno_cur_gt = NULL;
+	/*
+	 * See if there are any free extents as big as maxlen.
+	 */
+	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
+		goto error0;
+	/*
+	 * If none, then pick up the last entry in the tree unless the
+	 * tree is empty.
+	 */
+	if (!i) {
+		if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
+				&ltlen, &i)))
+			goto error0;
+		if (i == 0 || ltlen == 0) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			return 0;
+		}
+		ASSERT(i == 1);
+	}
+	args->wasfromfl = 0;
+	/*
+	 * First algorithm.
+	 * If the requested extent is large wrt the freespaces available
+	 * in this a.g., then the cursor will be pointing to a btree entry
+	 * near the right edge of the tree.  If it's in the last btree leaf
+	 * block, then we just examine all the entries in that block
+	 * that are big enough, and pick the best one.
+	 * This is written as a while loop so we can break out of it,
+	 * but we never loop back to the top.
+	 */
+	while (xfs_btree_islastblock(cnt_cur, 0)) {
+		xfs_extlen_t	bdiff;
+		int		besti=0;
+		xfs_extlen_t	blen=0;
+		xfs_agblock_t	bnew=0;
+
+#if defined(DEBUG) && defined(__KERNEL__)
+		if (!dofirst)
+			break;
+#endif
+		/*
+		 * Start from the entry that lookup found, sequence through
+		 * all larger free blocks.  If we're actually pointing at a
+		 * record smaller than maxlen, go to the start of this block,
+		 * and skip all those smaller than minlen.
+		 */
+		if (ltlen || args->alignment > 1) {
+			cnt_cur->bc_ptrs[0] = 1;
+			do {
+				if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
+						&ltlen, &i)))
+					goto error0;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+				if (ltlen >= args->minlen)
+					break;
+				if ((error = xfs_alloc_increment(cnt_cur, 0, &i)))
+					goto error0;
+			} while (i);
+			ASSERT(ltlen >= args->minlen);
+			if (!i)
+				break;
+		}
+		i = cnt_cur->bc_ptrs[0];
+		for (j = 1, blen = 0, bdiff = 0;
+		     !error && j && (blen < args->maxlen || bdiff > 0);
+		     error = xfs_alloc_increment(cnt_cur, 0, &j)) {
+			/*
+			 * For each entry, decide if it's better than
+			 * the previous best entry.
+			 */
+			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if (!xfs_alloc_compute_aligned(ltbno, ltlen,
+					args->alignment, args->minlen,
+					&ltbnoa, &ltlena))
+				continue;
+			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			ASSERT(args->len >= args->minlen);
+			if (args->len < blen)
+				continue;
+			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+				args->alignment, ltbno, ltlen, &ltnew);
+			if (ltnew != NULLAGBLOCK &&
+			    (args->len > blen || ltdiff < bdiff)) {
+				bdiff = ltdiff;
+				bnew = ltnew;
+				blen = args->len;
+				besti = cnt_cur->bc_ptrs[0];
+			}
+		}
+		/*
+		 * It didn't work.  We COULD be in a case where
+		 * there's a good record somewhere, so try again.
+		 */
+		if (blen == 0)
+			break;
+		/*
+		 * Point at the best entry, and retrieve it again.
+		 */
+		cnt_cur->bc_ptrs[0] = besti;
+		if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		ltend = ltbno + ltlen;
+		ASSERT(ltend <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
+				ARCH_CONVERT));
+		args->len = blen;
+		if (!xfs_alloc_fix_minleft(args)) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			TRACE_ALLOC("nominleft", args);
+			return 0;
+		}
+		blen = args->len;
+		/*
+		 * We are allocating starting at bnew for blen blocks.
+		 */
+		args->agbno = bnew;
+		ASSERT(bnew >= ltbno);
+		ASSERT(bnew + blen <= ltend);
+		/*
+		 * Set up a cursor for the by-bno tree.
+		 */
+		bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp,
+			args->agbp, args->agno, XFS_BTNUM_BNO, 0, 0);
+		/*
+		 * Fix up the btree entries.
+		 */
+		if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
+				ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
+			goto error0;
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+		TRACE_ALLOC("first", args);
+		return 0;
+	}
+	/*
+	 * Second algorithm.
+	 * Search in the by-bno tree to the left and to the right
+	 * simultaneously, until in each case we find a space big enough,
+	 * or run into the edge of the tree.  When we run into the edge,
+	 * we deallocate that cursor.
+	 * If both searches succeed, we compare the two spaces and pick
+	 * the better one.
+	 * With alignment, it's possible for both to fail; the upper
+	 * level algorithm that picks allocation groups for allocations
+	 * is not supposed to do this.
+	 */
+	/*
+	 * Allocate and initialize the cursor for the leftward search.
+	 */
+	bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO, 0, 0);
+	/*
+	 * Lookup <= bno to find the leftward search's starting point.
+	 */
+	if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
+		goto error0;
+	if (!i) {
+		/*
+		 * Didn't find anything; use this cursor for the rightward
+		 * search.
+		 */
+		bno_cur_gt = bno_cur_lt;
+		bno_cur_lt = 0;
+	}
+	/*
+	 * Found something.  Duplicate the cursor for the rightward search.
+	 */
+	else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
+		goto error0;
+	/*
+	 * Increment the cursor, so we will point at the entry just right
+	 * of the leftward entry if any, or to the leftmost entry.
+	 */
+	if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+		goto error0;
+	if (!i) {
+		/*
+		 * It failed, there are no rightward entries.
+		 */
+		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
+		bno_cur_gt = NULL;
+	}
+	/*
+	 * Loop going left with the leftward cursor, right with the
+	 * rightward cursor, until either both directions give up or
+	 * we find an entry at least as big as minlen.
+	 */
+	do {
+		if (bno_cur_lt) {
+			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if (xfs_alloc_compute_aligned(ltbno, ltlen,
+					args->alignment, args->minlen,
+					&ltbnoa, &ltlena))
+				break;
+			if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
+				goto error0;
+			if (!i) {
+				xfs_btree_del_cursor(bno_cur_lt,
+						     XFS_BTREE_NOERROR);
+				bno_cur_lt = NULL;
+			}
+		}
+		if (bno_cur_gt) {
+			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if (xfs_alloc_compute_aligned(gtbno, gtlen,
+					args->alignment, args->minlen,
+					&gtbnoa, &gtlena))
+				break;
+			if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+				goto error0;
+			if (!i) {
+				xfs_btree_del_cursor(bno_cur_gt,
+						     XFS_BTREE_NOERROR);
+				bno_cur_gt = NULL;
+			}
+		}
+	} while (bno_cur_lt || bno_cur_gt);
+	/*
+	 * Both cursors are still active, so find the better entry.
+	 */
+	if (bno_cur_lt && bno_cur_gt) {
+		/*
+		 * Left side is long enough, look for a right side entry.
+		 */
+		if (ltlena >= args->minlen) {
+			/*
+			 * Fix up the length.
+			 */
+			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			rlen = args->len;
+			ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
+				args->alignment, ltbno, ltlen, &ltnew);
+			/*
+			 * Not perfect.
+			 */
+			if (ltdiff) {
+				/*
+				 * Look until we find a better one, run out of
+				 * space, or run off the end.
+				 */
+				while (bno_cur_lt && bno_cur_gt) {
+					if ((error = xfs_alloc_get_rec(
+							bno_cur_gt, &gtbno,
+							&gtlen, &i)))
+						goto error0;
+					XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+					xfs_alloc_compute_aligned(gtbno, gtlen,
+						args->alignment, args->minlen,
+						&gtbnoa, &gtlena);
+					/*
+					 * The left one is clearly better.
+					 */
+					if (gtbnoa >= args->agbno + ltdiff) {
+						xfs_btree_del_cursor(
+							bno_cur_gt,
+							XFS_BTREE_NOERROR);
+						bno_cur_gt = NULL;
+						break;
+					}
+					/*
+					 * If we reach a big enough entry,
+					 * compare the two and pick the best.
+					 */
+					if (gtlena >= args->minlen) {
+						args->len =
+							XFS_EXTLEN_MIN(gtlena,
+								args->maxlen);
+						xfs_alloc_fix_len(args);
+						rlen = args->len;
+						gtdiff = xfs_alloc_compute_diff(
+							args->agbno, rlen,
+							args->alignment,
+							gtbno, gtlen, &gtnew);
+						/*
+						 * Right side is better.
+						 */
+						if (gtdiff < ltdiff) {
+							xfs_btree_del_cursor(
+								bno_cur_lt,
+								XFS_BTREE_NOERROR);
+							bno_cur_lt = NULL;
+						}
+						/*
+						 * Left side is better.
+						 */
+						else {
+							xfs_btree_del_cursor(
+								bno_cur_gt,
+								XFS_BTREE_NOERROR);
+							bno_cur_gt = NULL;
+						}
+						break;
+					}
+					/*
+					 * Fell off the right end.
+					 */
+					if ((error = xfs_alloc_increment(
+							bno_cur_gt, 0, &i)))
+						goto error0;
+					if (!i) {
+						xfs_btree_del_cursor(
+							bno_cur_gt,
+							XFS_BTREE_NOERROR);
+						bno_cur_gt = NULL;
+						break;
+					}
+				}
+			}
+			/*
+			 * The left side is perfect, trash the right side.
+			 */
+			else {
+				xfs_btree_del_cursor(bno_cur_gt,
+						     XFS_BTREE_NOERROR);
+				bno_cur_gt = NULL;
+			}
+		}
+		/*
+		 * It's the right side that was found first, look left.
+		 */
+		else {
+			/*
+			 * Fix up the length.
+			 */
+			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			rlen = args->len;
+			gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
+				args->alignment, gtbno, gtlen, &gtnew);
+			/*
+			 * Right side entry isn't perfect.
+			 */
+			if (gtdiff) {
+				/*
+				 * Look until we find a better one, run out of
+				 * space, or run off the end.
+				 */
+				while (bno_cur_lt && bno_cur_gt) {
+					if ((error = xfs_alloc_get_rec(
+							bno_cur_lt, &ltbno,
+							&ltlen, &i)))
+						goto error0;
+					XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+					xfs_alloc_compute_aligned(ltbno, ltlen,
+						args->alignment, args->minlen,
+						&ltbnoa, &ltlena);
+					/*
+					 * The right one is clearly better.
+					 */
+					if (ltbnoa <= args->agbno - gtdiff) {
+						xfs_btree_del_cursor(
+							bno_cur_lt,
+							XFS_BTREE_NOERROR);
+						bno_cur_lt = NULL;
+						break;
+					}
+					/*
+					 * If we reach a big enough entry,
+					 * compare the two and pick the best.
+					 */
+					if (ltlena >= args->minlen) {
+						args->len = XFS_EXTLEN_MIN(
+							ltlena, args->maxlen);
+						xfs_alloc_fix_len(args);
+						rlen = args->len;
+						ltdiff = xfs_alloc_compute_diff(
+							args->agbno, rlen,
+							args->alignment,
+							ltbno, ltlen, &ltnew);
+						/*
+						 * Left side is better.
+						 */
+						if (ltdiff < gtdiff) {
+							xfs_btree_del_cursor(
+								bno_cur_gt,
+								XFS_BTREE_NOERROR);
+							bno_cur_gt = NULL;
+						}
+						/*
+						 * Right side is better.
+						 */
+						else {
+							xfs_btree_del_cursor(
+								bno_cur_lt,
+								XFS_BTREE_NOERROR);
+							bno_cur_lt = NULL;
+						}
+						break;
+					}
+					/*
+					 * Fell off the left end.
+					 */
+					if ((error = xfs_alloc_decrement(
+							bno_cur_lt, 0, &i)))
+						goto error0;
+					if (!i) {
+						xfs_btree_del_cursor(bno_cur_lt,
+							XFS_BTREE_NOERROR);
+						bno_cur_lt = NULL;
+						break;
+					}
+				}
+			}
+			/*
+			 * The right side is perfect, trash the left side.
+			 */
+			else {
+				xfs_btree_del_cursor(bno_cur_lt,
+					XFS_BTREE_NOERROR);
+				bno_cur_lt = NULL;
+			}
+		}
+	}
+	/*
+	 * If we couldn't get anything, give up.
+	 */
+	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+		TRACE_ALLOC("neither", args);
+		args->agbno = NULLAGBLOCK;
+		return 0;
+	}
+	/*
+	 * At this point we have selected a freespace entry, either to the
+	 * left or to the right.  If it's on the right, copy all the
+	 * useful variables to the "left" set so we only have one
+	 * copy of this code.
+	 */
+	if (bno_cur_gt) {
+		bno_cur_lt = bno_cur_gt;
+		bno_cur_gt = NULL;
+		ltbno = gtbno;
+		ltbnoa = gtbnoa;
+		ltlen = gtlen;
+		ltlena = gtlena;
+		j = 1;
+	} else
+		j = 0;
+	/*
+	 * Fix up the length and compute the useful address.
+	 */
+	ltend = ltbno + ltlen;
+	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+	xfs_alloc_fix_len(args);
+	if (!xfs_alloc_fix_minleft(args)) {
+		TRACE_ALLOC("nominleft", args);
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+		return 0;
+	}
+	rlen = args->len;
+	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
+		ltlen, &ltnew);
+	ASSERT(ltnew >= ltbno);
+	ASSERT(ltnew + rlen <= ltend);
+	ASSERT(ltnew + rlen <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
+		ARCH_CONVERT));
+	args->agbno = ltnew;
+	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
+			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
+		goto error0;
+	TRACE_ALLOC(j ? "gt" : "lt", args);
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+	return 0;
+
+ error0:
+	TRACE_ALLOC("error", args);
+	if (cnt_cur != NULL)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	if (bno_cur_lt != NULL)
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
+	if (bno_cur_gt != NULL)
+		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Allocate a variable extent anywhere in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int				/* error */
+xfs_alloc_ag_vextent_size(
+	xfs_alloc_arg_t *args)		/* allocation argument structure */
+{
+	xfs_btree_cur_t *bno_cur;	/* cursor for bno btree */
+	xfs_btree_cur_t *cnt_cur;	/* cursor for cnt btree */
+	int		error;		/* error result */
+	xfs_agblock_t	fbno;		/* start of found freespace */
+	xfs_extlen_t	flen;		/* length of found freespace */
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_ag_vextent_size";
+#endif
+	int		i;		/* temp status variable */
+	xfs_agblock_t	rbno;		/* returned block number */
+	xfs_extlen_t	rlen;		/* length of returned extent */
+
+	/*
+	 * Allocate and initialize a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT, 0, 0);
+	bno_cur = NULL;
+	/*
+	 * Look for an entry >= maxlen+alignment-1 blocks.
+	 */
+	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
+			args->maxlen + args->alignment - 1, &i)))
+		goto error0;
+	/*
+	 * If none, then pick up the last entry in the tree unless the
+	 * tree is empty.
+	 */
+	if (!i) {
+		if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno,
+				&flen, &i)))
+			goto error0;
+		if (i == 0 || flen == 0) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			TRACE_ALLOC("noentry", args);
+			return 0;
+		}
+		ASSERT(i == 1);
+	}
+	/*
+	 * There's a freespace as big as maxlen+alignment-1, get it.
+	 */
+	else {
+		if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	}
+	/*
+	 * In the first case above, we got the last entry in the
+	 * by-size btree.  Now we check to see if the space hits maxlen
+	 * once aligned; if not, we search left for something better.
+	 * This can't happen in the second case above.
+	 */
+	xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen,
+		&rbno, &rlen);
+	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+	XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+			(rlen <= flen && rbno + rlen <= fbno + flen), error0);
+	if (rlen < args->maxlen) {
+		xfs_agblock_t	bestfbno;
+		xfs_extlen_t	bestflen;
+		xfs_agblock_t	bestrbno;
+		xfs_extlen_t	bestrlen;
+
+		bestrlen = rlen;
+		bestrbno = rbno;
+		bestflen = flen;
+		bestfbno = fbno;
+		for (;;) {
+			if ((error = xfs_alloc_decrement(cnt_cur, 0, &i)))
+				goto error0;
+			if (i == 0)
+				break;
+			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
+					&i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if (flen < bestrlen)
+				break;
+			xfs_alloc_compute_aligned(fbno, flen, args->alignment,
+				args->minlen, &rbno, &rlen);
+			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+			XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+				(rlen <= flen && rbno + rlen <= fbno + flen),
+				error0);
+			if (rlen > bestrlen) {
+				bestrlen = rlen;
+				bestrbno = rbno;
+				bestflen = flen;
+				bestfbno = fbno;
+				if (rlen == args->maxlen)
+					break;
+			}
+		}
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
+				&i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		rlen = bestrlen;
+		rbno = bestrbno;
+		flen = bestflen;
+		fbno = bestfbno;
+	}
+	args->wasfromfl = 0;
+	/*
+	 * Fix up the length.
+	 */
+	args->len = rlen;
+	xfs_alloc_fix_len(args);
+	if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+		TRACE_ALLOC("nominleft", args);
+		args->agbno = NULLAGBLOCK;
+		return 0;
+	}
+	rlen = args->len;
+	XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
+	/*
+	 * Allocate and initialize a cursor for the by-block tree.
+	 */
+	bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO, 0, 0);
+	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+			rbno, rlen, XFSA_FIXUP_CNT_OK)))
+		goto error0;
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	cnt_cur = bno_cur = NULL;
+	args->len = rlen;
+	args->agbno = rbno;
+	XFS_WANT_CORRUPTED_GOTO(
+		args->agbno + args->len <=
+			INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
+			ARCH_CONVERT),
+		error0);
+	TRACE_ALLOC("normal", args);
+	return 0;
+
+error0:
+	TRACE_ALLOC("error", args);
+	if (cnt_cur)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	if (bno_cur)
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Deal with the case where only small freespaces remain.
+ * Either return the contents of the last freespace record,
+ * or allocate space from the freelist if there is nothing in the tree.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent_small(
+	xfs_alloc_arg_t *args,	/* allocation argument structure */
+	xfs_btree_cur_t *ccur,	/* by-size cursor */
+	xfs_agblock_t	*fbnop, /* result block number */
+	xfs_extlen_t	*flenp, /* result length */
+	int		*stat)	/* status: 0-freelist, 1-normal/none */
+{
+	int		error;
+	xfs_agblock_t	fbno;
+	xfs_extlen_t	flen;
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_ag_vextent_small";
+#endif
+	int		i;
+
+	if ((error = xfs_alloc_decrement(ccur, 0, &i)))
+		goto error0;
+	if (i) {
+		if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	}
+	/*
+	 * Nothing in the btree, try the freelist.  Make sure
+	 * to respect minleft even when pulling from the
+	 * freelist.
+	 */
+	else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+		 (INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_flcount,
+			ARCH_CONVERT) > args->minleft)) {
+		if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno)))
+			goto error0;
+		if (fbno != NULLAGBLOCK) {
+			if (args->userdata) {
+				xfs_buf_t	*bp;
+
+				bp = xfs_btree_get_bufs(args->mp, args->tp,
+					args->agno, fbno, 0);
+				xfs_trans_binval(args->tp, bp);
+			}
+			args->len = 1;
+			args->agbno = fbno;
+			XFS_WANT_CORRUPTED_GOTO(
+				args->agbno + args->len <=
+				INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
+					ARCH_CONVERT),
+				error0);
+			args->wasfromfl = 1;
+			TRACE_ALLOC("freelist", args);
+			*stat = 0;
+			return 0;
+		}
+		/*
+		 * Nothing in the freelist.
+		 */
+		else
+			flen = 0;
+	}
+	/*
+	 * Can't allocate from the freelist for some reason.
+	 */
+	else
+		flen = 0;
+	/*
+	 * Can't do the allocation, give up.
+	 */
+	if (flen < args->minlen) {
+		args->agbno = NULLAGBLOCK;
+		TRACE_ALLOC("notenough", args);
+		flen = 0;
+	}
+	*fbnop = fbno;
+	*flenp = flen;
+	*stat = 1;
+	TRACE_ALLOC("normal", args);
+	return 0;
+
+error0:
+	TRACE_ALLOC("error", args);
+	return error;
+}
+
+/*
+ * Free the extent starting at agno/bno for length.
+ */
+STATIC int			/* error */
+xfs_free_ag_extent(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*agbp,	/* buffer for a.g. freelist header */
+	xfs_agnumber_t	agno,	/* allocation group number */
+	xfs_agblock_t	bno,	/* starting block number */
+	xfs_extlen_t	len,	/* length of extent */
+	int		isfl)	/* set if is freelist blocks - no sb acctg */
+{
+	xfs_btree_cur_t *bno_cur;	/* cursor for by-block btree */
+	xfs_btree_cur_t *cnt_cur;	/* cursor for by-size btree */
+	int		error;		/* error return value */
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_free_ag_extent";
+#endif
+	xfs_agblock_t	gtbno;		/* start of right neighbor block */
+	xfs_extlen_t	gtlen;		/* length of right neighbor block */
+	int		haveleft;	/* have a left neighbor block */
+	int		haveright;	/* have a right neighbor block */
+	int		i;		/* temp, result code */
+	xfs_agblock_t	ltbno;		/* start of left neighbor block */
+	xfs_extlen_t	ltlen;		/* length of left neighbor block */
+	xfs_mount_t	*mp;		/* mount point struct for filesystem */
+	xfs_agblock_t	nbno;		/* new starting block of freespace */
+	xfs_extlen_t	nlen;		/* new length of freespace */
+
+	mp = tp->t_mountp;
+	/*
+	 * Allocate and initialize a cursor for the by-block btree.
+	 */
+	bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, 0,
+		0);
+	cnt_cur = NULL;
+	/*
+	 * Look for a neighboring block on the left (lower block numbers)
+	 * that is contiguous with this space.
+	 */
+	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
+		goto error0;
+	if (haveleft) {
+		/*
+		 * There is a block to our left.
+		 */
+		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * It's not contiguous, though.
+		 */
+		if (ltbno + ltlen < bno)
+			haveleft = 0;
+		else {
+			/*
+			 * If this failure happens the request to free this
+			 * space was invalid, it's (partly) already free.
+			 * Very bad.
+			 */
+			XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
+		}
+	}
+	/*
+	 * Look for a neighboring block on the right (higher block numbers)
+	 * that is contiguous with this space.
+	 */
+	if ((error = xfs_alloc_increment(bno_cur, 0, &haveright)))
+		goto error0;
+	if (haveright) {
+		/*
+		 * There is a block to our right.
+		 */
+		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * It's not contiguous, though.
+		 */
+		if (bno + len < gtbno)
+			haveright = 0;
+		else {
+			/*
+			 * If this failure happens the request to free this
+			 * space was invalid, it's (partly) already free.
+			 * Very bad.
+			 */
+			XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
+		}
+	}
+	/*
+	 * Now allocate and initialize a cursor for the by-size tree.
+	 */
+	cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, 0,
+		0);
+	/*
+	 * Have both left and right contiguous neighbors.
+	 * Merge all three into a single free block.
+	 */
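+	/*
+	 * Sketch: the adjacent free extents (ltbno, ltlen), (bno, len)
+	 * and (gtbno, gtlen) collapse into the single extent
+	 * (ltbno, ltlen + len + gtlen).
+	 */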
+	if (haveleft && haveright) {
+		/*
+		 * Delete the old by-size entry on the left.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Delete the old by-size entry on the right.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Delete the old by-block entry for the right block.
+		 */
+		if ((error = xfs_alloc_delete(bno_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Move the by-block cursor back to the left neighbor.
+		 */
+		if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+#ifdef DEBUG
+		/*
+		 * Check that this is the right record: delete didn't
+		 * mangle the cursor.
+		 */
+		{
+			xfs_agblock_t	xxbno;
+			xfs_extlen_t	xxlen;
+
+			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
+					&i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(
+				i == 1 && xxbno == ltbno && xxlen == ltlen,
+				error0);
+		}
+#endif
+		/*
+		 * Update remaining by-block entry to the new, joined block.
+		 */
+		nbno = ltbno;
+		nlen = len + ltlen + gtlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * Have only a left contiguous neighbor.
+	 * Merge it together with the new freespace.
+	 */
+	else if (haveleft) {
+		/*
+		 * Delete the old by-size entry on the left.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Back up the by-block cursor to the left neighbor, and
+		 * update its length.
+		 */
+		if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		nbno = ltbno;
+		nlen = len + ltlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * Have only a right contiguous neighbor.
+	 * Merge it together with the new freespace.
+	 */
+	else if (haveright) {
+		/*
+		 * Delete the old by-size entry on the right.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Update the starting block and length of the right
+		 * neighbor in the by-block tree.
+		 */
+		nbno = bno;
+		nlen = len + gtlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * No contiguous neighbors.
+	 * Insert the new freespace into the by-block tree.
+	 */
+	else {
+		nbno = bno;
+		nlen = len;
+		if ((error = xfs_alloc_insert(bno_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	}
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	bno_cur = NULL;
+	/*
+	 * In all cases we need to insert the new freespace in the by-size tree.
+	 */
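+	/*
+	 * The lookup is expected to fail (i == 0): that both verifies
+	 * that no duplicate record exists and positions the cursor
+	 * where the new record is to be inserted.
+	 */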
+	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
+	if ((error = xfs_alloc_insert(cnt_cur, &i)))
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	cnt_cur = NULL;
+	/*
+	 * Update the freespace totals in the ag and superblock.
+	 */
+	{
+		xfs_agf_t	*agf;
+		xfs_perag_t	*pag;		/* per allocation group data */
+
+		agf = XFS_BUF_TO_AGF(agbp);
+		pag = &mp->m_perag[agno];
+		INT_MOD(agf->agf_freeblks, ARCH_CONVERT, len);
+		xfs_trans_agblocks_delta(tp, len);
+		pag->pagf_freeblks += len;
+		XFS_WANT_CORRUPTED_GOTO(
+			INT_GET(agf->agf_freeblks, ARCH_CONVERT)
+				<= INT_GET(agf->agf_length, ARCH_CONVERT),
+			error0);
+		TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
+		xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+		if (!isfl)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
+		XFS_STATS_INC(xfsstats.xs_freex);
+		XFS_STATS_ADD(xfsstats.xs_freeb, len);
+	}
+	TRACE_FREE(haveleft ?
+			(haveright ? "both" : "left") :
+			(haveright ? "right" : "none"),
+		agno, bno, len, isfl);
+
+	/*
+	 * Since blocks move to the free list without the coordination
+	 * used in xfs_bmap_finish, we can't allow a block to be available
+	 * for reallocation and non-transaction writing (user data)
+	 * until we know that the transaction that moved it to the free
+	 * list is permanently on disk.  We track this by declaring these
+	 * blocks "busy"; the busy list is maintained on a per-ag basis
+	 * and each transaction records which entries should be removed
+	 * when the iclog commits to disk.  If a busy block is allocated,
+	 * the iclog is pushed up to the LSN that freed the block.
+	 */
+	xfs_alloc_mark_busy(tp, agno, bno, len);
+	return 0;
+
+ error0:
+	TRACE_FREE("error", agno, bno, len, isfl);
+	if (bno_cur)
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	if (cnt_cur)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Visible (exported) allocation/free functions.
+ * Some of these are used just by xfs_alloc_btree.c and this file.
+ */
+
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+	xfs_mount_t	*mp)	/* file system mount structure */
+{
+	int		level;
+	uint		maxblocks;
+	uint		maxleafents;
+	int		minleafrecs;
+	int		minnoderecs;
+
+	maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
+	minleafrecs = mp->m_alloc_mnr[0];
+	minnoderecs = mp->m_alloc_mnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	mp->m_ag_maxlevels = level;
+}
+
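+/*
+ * Worked example (illustrative numbers, not real geometry): with
+ * sb_agblocks = 1048575 we get maxleafents = 524288.  If a leaf holds
+ * at least 100 records and a node at least 50 pointers, the leaf level
+ * needs ceil(524288/100) = 5243 blocks, the next level ceil(5243/50) =
+ * 105, then 3, then 1 -- so m_ag_maxlevels ends up as 4.
+ */
+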
+/*
+ * Decide whether to use this allocation group for this allocation.
+ * If so, fix up the btree freelist's size.
+ * This is external so mkfs can call it, too.
+ */
+int				/* error */
+xfs_alloc_fix_freelist(
+	xfs_alloc_arg_t *args,	/* allocation argument structure */
+	int		flags)	/* XFS_ALLOC_FLAG_... */
+{
+	xfs_buf_t	*agbp;	/* agf buffer pointer */
+	xfs_agf_t	*agf;	/* a.g. freespace structure pointer */
+	xfs_buf_t	*agflbp;/* agfl buffer pointer */
+	xfs_agblock_t	bno;	/* freelist block */
+	xfs_extlen_t	delta;	/* new blocks needed in freelist */
+	int		error;	/* error result code */
+	xfs_extlen_t	longest;/* longest extent in allocation group */
+	xfs_mount_t	*mp;	/* file system mount point structure */
+	xfs_extlen_t	need;	/* total blocks needed in freelist */
+	xfs_perag_t	*pag;	/* per-ag information structure */
+	xfs_alloc_arg_t targs;	/* local allocation arguments */
+	xfs_trans_t	*tp;	/* transaction pointer */
+
+	mp = args->mp;
+
+	pag = args->pag;
+	tp = args->tp;
+	if (!pag->pagf_init) {
+		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+				&agbp)))
+			return error;
+		if (!pag->pagf_init) {
+			args->agbp = NULL;
+			return 0;
+		}
+	} else
+		agbp = NULL;
+
+	/*
+	 * If this is a metadata preferred pag and we are user data,
+	 * try somewhere else if we are not being asked to try harder
+	 * at this point.
+	 */
+	if (pag->pagf_metadata && args->userdata && flags) {
+		args->agbp = NULL;
+		return 0;
+	}
+
+	need = XFS_MIN_FREELIST_PAG(pag, mp);
+	delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
+	/*
+	 * If it looks like there isn't a long enough extent, or enough
+	 * total blocks, reject it.
+	 */
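+	/*
+	 * Note the fallback below: if fixing up the freelist would
+	 * consume the whole longest extent, "longest" degrades into a
+	 * 0/1 flag saying whether any free space is left at all,
+	 * counting freelist blocks.
+	 */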
+	longest = (pag->pagf_longest > delta) ?
+		(pag->pagf_longest - delta) :
+		(pag->pagf_flcount > 0 || pag->pagf_longest > 0);
+	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
+	    (args->minleft &&
+	     (int)(pag->pagf_freeblks + pag->pagf_flcount -
+		   need - args->total) <
+	     (int)args->minleft)) {
+		if (agbp)
+			xfs_trans_brelse(tp, agbp);
+		args->agbp = NULL;
+		return 0;
+	}
+	/*
+	 * Get the a.g. freespace buffer.
+	 * Can fail if we're not blocking on locks, and it's held.
+	 */
+	if (agbp == NULL) {
+		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+				&agbp)))
+			return error;
+		if (agbp == NULL) {
+			args->agbp = NULL;
+			return 0;
+		}
+	}
+	/*
+	 * Figure out how many blocks we should have in the freelist.
+	 */
+	agf = XFS_BUF_TO_AGF(agbp);
+	need = XFS_MIN_FREELIST(agf, mp);
+	delta = need > INT_GET(agf->agf_flcount, ARCH_CONVERT) ?
+		(need - INT_GET(agf->agf_flcount, ARCH_CONVERT)) : 0;
+	/*
+	 * If there isn't a long enough extent, or enough total blocks,
+	 * reject it.
+	 */
+	longest = INT_GET(agf->agf_longest, ARCH_CONVERT);
+	longest = (longest > delta) ? (longest - delta) :
+		(INT_GET(agf->agf_flcount, ARCH_CONVERT) > 0 || longest > 0);
+	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
+	     (args->minleft &&
+		(int)(INT_GET(agf->agf_freeblks, ARCH_CONVERT) +
+		   INT_GET(agf->agf_flcount, ARCH_CONVERT) - need - args->total) <
+	     (int)args->minleft)) {
+		xfs_trans_brelse(tp, agbp);
+		args->agbp = NULL;
+		return 0;
+	}
+	/*
+	 * Make the freelist shorter if it's too long.
+	 */
+	while (INT_GET(agf->agf_flcount, ARCH_CONVERT) > need) {
+		xfs_buf_t	*bp;
+
+		if ((error = xfs_alloc_get_freelist(tp, agbp, &bno)))
+			return error;
+		if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
+			return error;
+		bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
+		xfs_trans_binval(tp, bp);
+	}
+	/*
+	 * Initialize the args structure.
+	 */
+	targs.tp = tp;
+	targs.mp = mp;
+	targs.agbp = agbp;
+	targs.agno = args->agno;
+	targs.mod = targs.minleft = targs.wasdel = targs.userdata =
+		targs.minalignslop = 0;
+	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+	targs.type = XFS_ALLOCTYPE_THIS_AG;
+	targs.pag = pag;
+	if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
+		return error;
+	/*
+	 * Make the freelist longer if it's too short.
+	 */
+	while (INT_GET(agf->agf_flcount, ARCH_CONVERT) < need) {
+		targs.agbno = 0;
+		targs.maxlen = need - INT_GET(agf->agf_flcount, ARCH_CONVERT);
+		/*
+		 * Allocate as many blocks as possible at once.
+		 */
+		if ((error = xfs_alloc_ag_vextent(&targs)))
+			return error;
+		/*
+		 * Stop if we run out.	Won't happen if callers are obeying
+		 * the restrictions correctly.	Can happen for free calls
+		 * on a completely full ag.
+		 */
+		if (targs.agbno == NULLAGBLOCK)
+			break;
+		/*
+		 * Put each allocated block on the list.
+		 */
+		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
+			if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp,
+					bno)))
+				return error;
+		}
+	}
+	args->agbp = agbp;
+	return 0;
+}
+
+/*
+ * Get a block from the freelist.
+ * Returns with the buffer for the block gotten.
+ */
+int				/* error */
+xfs_alloc_get_freelist(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*agbp,	/* buffer containing the agf structure */
+	xfs_agblock_t	*bnop)	/* block address retrieved from freelist */
+{
+	xfs_agf_t	*agf;	/* a.g. freespace structure */
+	xfs_agfl_t	*agfl;	/* a.g. freelist structure */
+	xfs_buf_t	*agflbp;/* buffer for a.g. freelist structure */
+	xfs_agblock_t	bno;	/* block number returned */
+	int		error;
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_get_freelist";
+#endif
+	xfs_mount_t	*mp;	/* mount structure */
+	xfs_perag_t	*pag;	/* per allocation group data */
+
+	agf = XFS_BUF_TO_AGF(agbp);
+	/*
+	 * Freelist is empty, give up.
+	 */
+	if (INT_ISZERO(agf->agf_flcount, ARCH_CONVERT)) {
+		*bnop = NULLAGBLOCK;
+		return 0;
+	}
+	/*
+	 * Read the array of free blocks.
+	 */
+	mp = tp->t_mountp;
+	if ((error = xfs_alloc_read_agfl(mp, tp,
+			INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp)))
+		return error;
+	agfl = XFS_BUF_TO_AGFL(agflbp);
+	/*
+	 * Get the block number and update the data structures.
+	 */
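+	/*
+	 * The free block array is a circular buffer: agf_flfirst indexes
+	 * its head and agf_fllast its tail, each wrapping back to slot 0
+	 * on reaching XFS_AGFL_SIZE.
+	 */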
+	bno = INT_GET(agfl->agfl_bno[INT_GET(agf->agf_flfirst, ARCH_CONVERT)], ARCH_CONVERT);
+	INT_MOD(agf->agf_flfirst, ARCH_CONVERT, 1);
+	xfs_trans_brelse(tp, agflbp);
+	if (INT_GET(agf->agf_flfirst, ARCH_CONVERT) == XFS_AGFL_SIZE)
+		INT_ZERO(agf->agf_flfirst, ARCH_CONVERT);
+	pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)];
+	INT_MOD(agf->agf_flcount, ARCH_CONVERT, -1);
+	xfs_trans_agflist_delta(tp, -1);
+	pag->pagf_flcount--;
+	TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
+	*bnop = bno;
+
+	/*
+	 * As blocks are freed, they are added to the per-ag busy list
+	 * and remain there until the freeing transaction is committed to
+	 * disk.  Now that we have allocated blocks, this list must be
+	 * searched to see if a block is being reused.	If one is, then
+	 * the freeing transaction must be pushed to disk NOW by forcing
+	 * to disk all iclogs up to that transaction's LSN.
+	 */
+	xfs_alloc_search_busy(tp, INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1);
+	return 0;
+}
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*bp,	/* buffer for a.g. freelist header */
+	int		fields) /* mask of fields to be logged (XFS_AGF_...) */
+{
+	int	first;		/* first byte offset */
+	int	last;		/* last byte offset */
+	static const short	offsets[] = {
+		offsetof(xfs_agf_t, agf_magicnum),
+		offsetof(xfs_agf_t, agf_versionnum),
+		offsetof(xfs_agf_t, agf_seqno),
+		offsetof(xfs_agf_t, agf_length),
+		offsetof(xfs_agf_t, agf_roots[0]),
+		offsetof(xfs_agf_t, agf_levels[0]),
+		offsetof(xfs_agf_t, agf_flfirst),
+		offsetof(xfs_agf_t, agf_fllast),
+		offsetof(xfs_agf_t, agf_flcount),
+		offsetof(xfs_agf_t, agf_freeblks),
+		offsetof(xfs_agf_t, agf_longest),
+		sizeof(xfs_agf_t)
+	};
+
+	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
+	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
+}
+
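+/*
+ * Example: logging XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT (as the freelist
+ * get/put routines above do) makes xfs_btree_offsets() compute a byte
+ * range from the start of agf_flfirst through the last byte of
+ * agf_flcount, so all the dirty fields are logged as one contiguous
+ * span (including anything lying between them).
+ */
+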
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int					/* error */
+xfs_alloc_pagf_init(
+	xfs_mount_t		*mp,	/* file system mount structure */
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	int			flags)	/* XFS_ALLOC_FLAGS_... */
+{
+	xfs_buf_t		*bp;
+	int			error;
+
+	if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
+		return error;
+	if (bp)
+		xfs_trans_brelse(tp, bp);
+	return 0;
+}
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int					/* error */
+xfs_alloc_put_freelist(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_buf_t		*agbp,	/* buffer for a.g. freelist header */
+	xfs_buf_t		*agflbp,/* buffer for a.g. free block array */
+	xfs_agblock_t		bno)	/* block being freed */
+{
+	xfs_agf_t		*agf;	/* a.g. freespace structure */
+	xfs_agfl_t		*agfl;	/* a.g. free block array */
+	xfs_agblock_t		*blockp;/* pointer to array entry */
+	int			error;
+#ifdef XFS_ALLOC_TRACE
+	static char		fname[] = "xfs_alloc_put_freelist";
+#endif
+	xfs_mount_t		*mp;	/* mount structure */
+	xfs_perag_t		*pag;	/* per allocation group data */
+
+	agf = XFS_BUF_TO_AGF(agbp);
+	mp = tp->t_mountp;
+
+	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
+			INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp)))
+		return error;
+	agfl = XFS_BUF_TO_AGFL(agflbp);
+	INT_MOD(agf->agf_fllast, ARCH_CONVERT, 1);
+	if (INT_GET(agf->agf_fllast, ARCH_CONVERT) == XFS_AGFL_SIZE)
+		INT_ZERO(agf->agf_fllast, ARCH_CONVERT);
+	pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)];
+	INT_MOD(agf->agf_flcount, ARCH_CONVERT, 1);
+	xfs_trans_agflist_delta(tp, 1);
+	pag->pagf_flcount++;
+	ASSERT(INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE);
+	blockp = &agfl->agfl_bno[INT_GET(agf->agf_fllast, ARCH_CONVERT)];
+	INT_SET(*blockp, ARCH_CONVERT, bno);
+	TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
+	xfs_trans_log_buf(tp, agflbp,
+		(int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
+		(int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
+			sizeof(xfs_agblock_t) - 1));
+	/*
+	 * Since blocks move to the free list without the coordination
+	 * used in xfs_bmap_finish, we can't allow a block to be available
+	 * for reallocation and non-transaction writing (user data)
+	 * until we know that the transaction that moved it to the free
+	 * list is permanently on disk.  We track this by declaring these
+	 * blocks "busy"; the busy list is maintained on a per-ag basis
+	 * and each transaction records which entries should be removed
+	 * when the iclog commits to disk.  If a busy block is allocated,
+	 * the iclog is pushed up to the LSN that freed the block.
+	 */
+	xfs_alloc_mark_busy(tp, INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1);
+
+	return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int					/* error */
+xfs_alloc_read_agf(
+	xfs_mount_t	*mp,		/* mount point structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	int		flags,		/* XFS_ALLOC_FLAG_... */
+	xfs_buf_t	**bpp)		/* buffer for the ag freelist header */
+{
+	xfs_agf_t	*agf;		/* ag freelist header */
+	int		agf_ok;		/* set if agf is consistent */
+	xfs_buf_t	*bp;		/* return value */
+	xfs_daddr_t	d;		/* disk block address */
+	int		error;
+	xfs_perag_t	*pag;		/* per allocation group data */
+
+	ASSERT(agno != NULLAGNUMBER);
+	d = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR);
+	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 1,
+			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
+			&bp)))
+		return error;
+	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
+	if (!bp) {
+		*bpp = NULL;
+		return 0;
+	}
+	/*
+	 * Validate the magic number of the agf block.
+	 */
+	agf = XFS_BUF_TO_AGF(bp);
+	agf_ok =
+		INT_GET(agf->agf_magicnum, ARCH_CONVERT) == XFS_AGF_MAGIC &&
+		XFS_AGF_GOOD_VERSION(INT_GET(agf->agf_versionnum, ARCH_CONVERT)) &&
+		INT_GET(agf->agf_freeblks, ARCH_CONVERT) <=
+				INT_GET(agf->agf_length, ARCH_CONVERT) &&
+		INT_GET(agf->agf_flfirst, ARCH_CONVERT) < XFS_AGFL_SIZE &&
+		INT_GET(agf->agf_fllast,  ARCH_CONVERT) < XFS_AGFL_SIZE &&
+		INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE;
+	if (XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
+			XFS_RANDOM_ALLOC_READ_AGF)) {
+		xfs_trans_brelse(tp, bp);
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"xfs_alloc_read_agf: error in <%s> AG %d",
+			mp->m_fsname, agno);
+		if (INT_GET(agf->agf_magicnum, ARCH_CONVERT) != XFS_AGF_MAGIC)
+			cmn_err(CE_NOTE, "bad agf_magicnum 0x%x",
+				INT_GET(agf->agf_magicnum, ARCH_CONVERT));
+		if (!XFS_AGF_GOOD_VERSION(INT_GET(agf->agf_versionnum, ARCH_CONVERT)))
+			cmn_err(CE_NOTE, "Bad version number 0x%x",
+				INT_GET(agf->agf_versionnum, ARCH_CONVERT));
+		if (!(INT_GET(agf->agf_freeblks, ARCH_CONVERT) <=
+				INT_GET(agf->agf_length, ARCH_CONVERT)))
+			cmn_err(CE_NOTE, "Bad freeblks %d %d",
+				INT_GET(agf->agf_freeblks, ARCH_CONVERT),
+				INT_GET(agf->agf_length, ARCH_CONVERT));
+		if (!(INT_GET(agf->agf_flfirst, ARCH_CONVERT) < XFS_AGFL_SIZE))
+			cmn_err(CE_NOTE, "Bad flfirst %d",
+				INT_GET(agf->agf_flfirst, ARCH_CONVERT));
+		if (!(INT_GET(agf->agf_fllast, ARCH_CONVERT) < XFS_AGFL_SIZE))
+			cmn_err(CE_NOTE, "Bad fllast %d",
+				INT_GET(agf->agf_fllast, ARCH_CONVERT));
+		if (!(INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE))
+			cmn_err(CE_NOTE, "Bad flcount %d",
+				INT_GET(agf->agf_flcount, ARCH_CONVERT));
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	pag = &mp->m_perag[agno];
+	if (!pag->pagf_init) {
+		pag->pagf_freeblks = INT_GET(agf->agf_freeblks, ARCH_CONVERT);
+		pag->pagf_flcount = INT_GET(agf->agf_flcount, ARCH_CONVERT);
+		pag->pagf_longest = INT_GET(agf->agf_longest, ARCH_CONVERT);
+		pag->pagf_levels[XFS_BTNUM_BNOi] =
+			INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT);
+		pag->pagf_levels[XFS_BTNUM_CNTi] =
+			INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT);
+		spinlock_init(&pag->pagb_lock, "xfspagb");
+		pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
+					sizeof(xfs_perag_busy_t), KM_SLEEP);
+		pag->pagf_init = 1;
+	}
+#ifdef DEBUG
+	else if (!XFS_FORCED_SHUTDOWN(mp)) {
+		ASSERT(pag->pagf_freeblks == INT_GET(agf->agf_freeblks, ARCH_CONVERT));
+		ASSERT(pag->pagf_flcount == INT_GET(agf->agf_flcount, ARCH_CONVERT));
+		ASSERT(pag->pagf_longest == INT_GET(agf->agf_longest, ARCH_CONVERT));
+		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
+		       INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT));
+		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
+		       INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT));
+	}
+#endif
+	XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Allocate an extent (variable-size).
+ * Depending on the allocation type, we either look in a single allocation
+ * group or loop over the allocation groups to find the result.
+ */
+int				/* error */
+xfs_alloc_vextent(
+	xfs_alloc_arg_t *args)	/* allocation argument structure */
+{
+	xfs_agblock_t	agsize; /* allocation group size */
+	int		error;
+	int		flags;	/* XFS_ALLOC_FLAG_... locking flags */
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_vextent";
+#endif
+	xfs_extlen_t	minleft;/* minimum left value, temp copy */
+	xfs_mount_t	*mp;	/* mount structure pointer */
+	xfs_agnumber_t	sagno;	/* starting allocation group number */
+	xfs_alloctype_t type;	/* input allocation type */
+	int		bump_rotor = 0;
+	int		no_min = 0;
+
+	mp = args->mp;
+	type = args->otype = args->type;
+	args->agbno = NULLAGBLOCK;
+	/*
+	 * Just fix this up, for the case where the last a.g. is shorter
+	 * (or there's only one a.g.) and the caller couldn't easily figure
+	 * that out (xfs_bmap_alloc).
+	 */
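+	/*
+	 * An extent never spans allocation groups, so a maxlen larger
+	 * than sb_agblocks is quietly clamped to one full a.g. here.
+	 */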
+	agsize = mp->m_sb.sb_agblocks;
+	if (args->maxlen > agsize)
+		args->maxlen = agsize;
+	if (args->alignment == 0)
+		args->alignment = 1;
+	ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+	ASSERT(args->minlen <= args->maxlen);
+	ASSERT(args->minlen <= agsize);
+	ASSERT(args->mod < args->prod);
+	if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
+	    XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+	    args->minlen > args->maxlen || args->minlen > agsize ||
+	    args->mod >= args->prod) {
+		args->fsbno = NULLFSBLOCK;
+		TRACE_ALLOC("badargs", args);
+		return 0;
+	}
+	minleft = args->minleft;
+
+	switch (type) {
+	case XFS_ALLOCTYPE_THIS_AG:
+	case XFS_ALLOCTYPE_NEAR_BNO:
+	case XFS_ALLOCTYPE_THIS_BNO:
+		/*
+		 * These three force us into a single a.g.
+		 */
+		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+		down_read(&mp->m_peraglock);
+		args->pag = &mp->m_perag[args->agno];
+		args->minleft = 0;
+		error = xfs_alloc_fix_freelist(args, 0);
+		args->minleft = minleft;
+		if (error) {
+			TRACE_ALLOC("nofix", args);
+			goto error0;
+		}
+		if (!args->agbp) {
+			up_read(&mp->m_peraglock);
+			TRACE_ALLOC("noagbp", args);
+			break;
+		}
+		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+		if ((error = xfs_alloc_ag_vextent(args)))
+			goto error0;
+		up_read(&mp->m_peraglock);
+		break;
+	case XFS_ALLOCTYPE_START_BNO:
+		/*
+		 * Try near allocation first, then anywhere-in-ag after
+		 * the first a.g. fails.
+		 */
+		if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
+		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
+			args->fsbno = XFS_AGB_TO_FSB(mp, mp->m_agfrotor, 0);
+			bump_rotor = 1;
+		}
+		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+		args->type = XFS_ALLOCTYPE_NEAR_BNO;
+		/* FALLTHROUGH */
+	case XFS_ALLOCTYPE_ANY_AG:
+	case XFS_ALLOCTYPE_START_AG:
+	case XFS_ALLOCTYPE_FIRST_AG:
+		/*
+		 * Rotate through the allocation groups looking for a winner.
+		 */
+		if (type == XFS_ALLOCTYPE_ANY_AG) {
+			/*
+			 * Start with the last place we left off.
+			 */
+			args->agno = sagno = mp->m_agfrotor;
+			args->type = XFS_ALLOCTYPE_THIS_AG;
+			flags = XFS_ALLOC_FLAG_TRYLOCK;
+		} else if (type == XFS_ALLOCTYPE_FIRST_AG) {
+			/*
+			 * Start with allocation group given by bno.
+			 */
+			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+			args->type = XFS_ALLOCTYPE_THIS_AG;
+			sagno = 0;
+			flags = 0;
+		} else {
+			if (type == XFS_ALLOCTYPE_START_AG)
+				args->type = XFS_ALLOCTYPE_THIS_AG;
+			/*
+			 * Start with the given allocation group.
+			 */
+			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+			flags = XFS_ALLOC_FLAG_TRYLOCK;
+		}
+		/*
+		 * Loop over allocation groups twice; first time with
+		 * trylock set, second time without.  If both passes fail,
+		 * loop once more with minleft relaxed to zero.
+		 */
+		down_read(&mp->m_peraglock);
+		for (;;) {
+			args->pag = &mp->m_perag[args->agno];
+			if (no_min)
+				args->minleft = 0;
+			error = xfs_alloc_fix_freelist(args, flags);
+			args->minleft = minleft;
+			if (error) {
+				TRACE_ALLOC("nofix", args);
+				goto error0;
+			}
+			/*
+			 * If we get a buffer back then the allocation will fly.
+			 */
+			if (args->agbp) {
+				if ((error = xfs_alloc_ag_vextent(args)))
+					goto error0;
+				break;
+			}
+			TRACE_ALLOC("loopfailed", args);
+			/*
+			 * Didn't work, figure out the next iteration.
+			 */
+			if (args->agno == sagno &&
+			    type == XFS_ALLOCTYPE_START_BNO)
+				args->type = XFS_ALLOCTYPE_THIS_AG;
+			if (++(args->agno) == mp->m_sb.sb_agcount)
+				args->agno = 0;
+			/*
+			 * Reached the starting a.g., must either be done
+			 * or switch to non-trylock mode.
+			 */
+			if (args->agno == sagno) {
+				if (no_min == 1) {
+					args->agbno = NULLAGBLOCK;
+					TRACE_ALLOC("allfailed", args);
+					break;
+				}
+				if (flags == 0) {
+					no_min = 1;
+				} else {
+					flags = 0;
+					if (type == XFS_ALLOCTYPE_START_BNO) {
+						args->agbno = XFS_FSB_TO_AGBNO(mp,
+							args->fsbno);
+						args->type = XFS_ALLOCTYPE_NEAR_BNO;
+					}
+				}
+			}
+		}
+		up_read(&mp->m_peraglock);
+		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG))
+			mp->m_agfrotor = (args->agno + 1) % mp->m_sb.sb_agcount;
+		break;
+	default:
+		ASSERT(0);
+		/* NOTREACHED */
+	}
+	if (args->agbno == NULLAGBLOCK)
+		args->fsbno = NULLFSBLOCK;
+	else {
+		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+#ifdef DEBUG
+		ASSERT(args->len >= args->minlen);
+		ASSERT(args->len <= args->maxlen);
+		ASSERT(args->agbno % args->alignment == 0);
+		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
+			args->len);
+#endif
+	}
+	return 0;
+error0:
+	up_read(&mp->m_peraglock);
+	return error;
+}
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int				/* error */
+xfs_free_extent(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_fsblock_t	bno,	/* starting block number of extent */
+	xfs_extlen_t	len)	/* length of extent */
+{
+#ifdef DEBUG
+	xfs_agf_t	*agf;	/* a.g. freespace header */
+#endif
+	xfs_alloc_arg_t args;	/* allocation argument structure */
+	int		error;
+
+	ASSERT(len != 0);
+	args.tp = tp;
+	args.mp = tp->t_mountp;
+	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
+	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
+	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
+	args.alignment = 1;
+	args.minlen = args.minleft = args.minalignslop = 0;
+	down_read(&args.mp->m_peraglock);
+	args.pag = &args.mp->m_perag[args.agno];
+	if ((error = xfs_alloc_fix_freelist(&args, 0)))
+		goto error0;
+#ifdef DEBUG
+	ASSERT(args.agbp != NULL);
+	agf = XFS_BUF_TO_AGF(args.agbp);
+	ASSERT(args.agbno + len <= INT_GET(agf->agf_length, ARCH_CONVERT));
+#endif
+	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno,
+		len, 0);
+error0:
+	up_read(&args.mp->m_peraglock);
+	return error;
+}
+
+
+/*
+ * AG Busy list management
+ * The busy list contains block ranges that have been freed but whose
+ * transactions have not yet hit disk.  If any block listed in a busy
+ * list is reused, the transaction that freed it must be forced to disk
+ * before continuing to use the block.
+ *
+ * xfs_alloc_mark_busy - add to the per-ag busy list
+ * xfs_alloc_clear_busy - remove an item from the per-ag busy list
+ */
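+
+/*
+ * Lifecycle sketch: xfs_free_ag_extent() and xfs_alloc_put_freelist()
+ * call xfs_alloc_mark_busy() on every range they free; the commit path
+ * later calls xfs_alloc_clear_busy() for each slot the transaction
+ * recorded; and xfs_alloc_get_freelist() calls xfs_alloc_search_busy()
+ * so that reusing a still-busy block first forces the log up to the
+ * freeing transaction's LSN.
+ */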
+void
+xfs_alloc_mark_busy(xfs_trans_t *tp,
+		    xfs_agnumber_t agno,
+		    xfs_agblock_t bno,
+		    xfs_extlen_t len)
+{
+	xfs_mount_t		*mp;
+	xfs_perag_busy_t	*bsy;
+	int			n;
+	SPLDECL(s);
+
+	mp = tp->t_mountp;
+	s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
+
+	/* search pagb_list for an open slot */
+	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
+	     n < XFS_PAGB_NUM_SLOTS;
+	     bsy++, n++) {
+		if (bsy->busy_tp == NULL) {
+			break;
+		}
+	}
+
+	if (n < XFS_PAGB_NUM_SLOTS) {
+		bsy = &mp->m_perag[agno].pagb_list[n];
+		mp->m_perag[agno].pagb_count++;
+		TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp);
+		bsy->busy_start = bno;
+		bsy->busy_length = len;
+		bsy->busy_tp = tp;
+		xfs_trans_add_busy(tp, agno, n);
+	} else {
+		TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp);
+		/*
+		 * The busy list is full!  Since it is now not possible to
+		 * track the free block, make this a synchronous transaction
+		 * to ensure that the block is not reused before this
+		 * transaction commits.
+		 */
+		xfs_trans_set_sync(tp);
+	}
+
+	mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
+}
+
+void
+xfs_alloc_clear_busy(xfs_trans_t *tp,
+		     xfs_agnumber_t agno,
+		     int idx)
+{
+	xfs_mount_t		*mp;
+	xfs_perag_busy_t	*list;
+	SPLDECL(s);
+
+	mp = tp->t_mountp;
+
+	s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
+	list = mp->m_perag[agno].pagb_list;
+
+	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
+	if (list[idx].busy_tp == tp) {
+		TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp);
+		list[idx].busy_tp = NULL;
+		mp->m_perag[agno].pagb_count--;
+	} else {
+		TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp);
+	}
+
+	mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
+}
+
+
+/*
+ * Returns the pagb_list slot index if any part of (agno,bno):len is in
+ * a busy list, or -1 if the range is not busy.
+ */
+int
+xfs_alloc_search_busy(xfs_trans_t *tp,
+		    xfs_agnumber_t agno,
+		    xfs_agblock_t bno,
+		    xfs_extlen_t len)
+{
+	xfs_mount_t		*mp;
+	xfs_perag_busy_t	*bsy;
+	int			n;
+	xfs_agblock_t		uend, bend;
+	xfs_lsn_t		lsn;
+	int			cnt;
+	SPLDECL(s);
+
+	mp = tp->t_mountp;
+
+	s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
+	cnt = mp->m_perag[agno].pagb_count;
+
+	uend = bno + len;
+
+	/* search pagb_list for an overlapping busy extent, skipping open slots */
+	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
+	     cnt; bsy++, n++) {
+
+		/*
+		 * Does (bno, len) overlap the busy range
+		 * (busy_start, busy_length)?
+		 */
+		if (bsy->busy_tp != NULL) {
+			bend = bsy->busy_start + bsy->busy_length;
+			if ( (bno >= bsy->busy_start && bno <= bend) ||
+			     (uend >= bsy->busy_start && uend <= bend) ||
+			     (bno <= bsy->busy_start && uend >= bsy->busy_start) ) {
+				TRACE_BUSYSEARCH("xfs_alloc_search_busy",
+						 "found1", agno, bno, len, n,
+						 tp);
+				break;
+			}
+			cnt--;
+		}
+	}
+
+	/*
+	 * If a block was found, force the log through the LSN of the
+	 * transaction that freed the block
+	 * transaction that freed the block.
+	if (cnt) {
+		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp);
+		lsn = bsy->busy_tp->t_lsn;
+		mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
+		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
+	} else {
+		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp);
+		n = -1;
+		mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
+	}
+
+	return n;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_alloc.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_alloc.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_alloc.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_alloc.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ALLOC_H__
+#define __XFS_ALLOC_H__
+
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_trans;
+
+/*
+ * Freespace allocation types.	Argument to xfs_alloc_[v]extent.
+ */
+typedef enum xfs_alloctype
+{
+	XFS_ALLOCTYPE_ANY_AG,		/* allocate anywhere, use rotor */
+	XFS_ALLOCTYPE_FIRST_AG,		/* ... start at ag 0 */
+	XFS_ALLOCTYPE_START_AG,		/* anywhere, start in this a.g. */
+	XFS_ALLOCTYPE_THIS_AG,		/* anywhere in this a.g. */
+	XFS_ALLOCTYPE_START_BNO,	/* near this block else anywhere */
+	XFS_ALLOCTYPE_NEAR_BNO,		/* in this a.g. and near this block */
+	XFS_ALLOCTYPE_THIS_BNO		/* at exactly this block */
+} xfs_alloctype_t;
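+
+/*
+ * XFS_ALLOCTYPE_THIS_AG, _NEAR_BNO and _THIS_BNO confine the search to
+ * the single a.g. encoded in the fsbno argument; the other four make
+ * xfs_alloc_vextent() rotate through the allocation groups (with
+ * _START_BNO trying a near-block allocation in the starting a.g. first).
+ */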
+
+/*
+ * Flags for xfs_alloc_fix_freelist.
+ */
+#define XFS_ALLOC_FLAG_TRYLOCK	0x00000001  /* use trylock for buffer locking */
+
+/*
+ * Argument structure for xfs_alloc routines.
+ * This is turned into a structure to avoid having 20 arguments passed
+ * down several levels of the stack.
+ */
+typedef struct xfs_alloc_arg {
+	struct xfs_trans *tp;		/* transaction pointer */
+	struct xfs_mount *mp;		/* file system mount point */
+	struct xfs_buf	*agbp;		/* buffer for a.g. freelist header */
+	struct xfs_perag *pag;		/* per-ag struct for this agno */
+	xfs_fsblock_t	fsbno;		/* file system block number */
+	xfs_agnumber_t	agno;		/* allocation group number */
+	xfs_agblock_t	agbno;		/* allocation group-relative block # */
+	xfs_extlen_t	minlen;		/* minimum size of extent */
+	xfs_extlen_t	maxlen;		/* maximum size of extent */
+	xfs_extlen_t	mod;		/* mod value for extent size */
+	xfs_extlen_t	prod;		/* prod value for extent size */
+	xfs_extlen_t	minleft;	/* min blocks must be left after us */
+	xfs_extlen_t	total;		/* total blocks needed in xaction */
+	xfs_extlen_t	alignment;	/* align answer to multiple of this */
+	xfs_extlen_t	minalignslop;	/* slop for minlen+alignment calcs */
+	xfs_extlen_t	len;		/* output: actual size of extent */
+	xfs_alloctype_t type;		/* allocation type XFS_ALLOCTYPE_... */
+	xfs_alloctype_t otype;		/* original allocation type */
+	char		wasdel;		/* set if allocation was prev delayed */
+	char		wasfromfl;	/* set if allocation is from freelist */
+	char		isfl;		/* set if is freelist blocks - !actg */
+	char		userdata;	/* set if this is user data */
+} xfs_alloc_arg_t;
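+
+/*
+ * Typical call pattern for xfs_alloc_vextent(): fill in tp, mp and
+ * fsbno, set minlen/maxlen/mod/prod/alignment and an XFS_ALLOCTYPE_*
+ * type, then call it; on return fsbno is NULLFSBLOCK if nothing was
+ * found, otherwise the start of the extent, with len its actual size.
+ */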
+
+/*
+ * Defines for userdata
+ */
+#define XFS_ALLOC_USERDATA		1	/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA	2	/* special case start of file */
+
+
+#ifdef __KERNEL__
+
+/*
+ * Types for alloc tracing.
+ */
+#define XFS_ALLOC_KTRACE_ALLOC	1
+#define XFS_ALLOC_KTRACE_FREE	2
+#define XFS_ALLOC_KTRACE_MODAGF 3
+#define XFS_ALLOC_KTRACE_BUSY	4
+#define XFS_ALLOC_KTRACE_UNBUSY 5
+#define XFS_ALLOC_KTRACE_BUSYSEARCH	6
+
+
+/*
+ * Allocation tracing buffer size.
+ */
+#define XFS_ALLOC_TRACE_SIZE	4096
+
+#ifdef	XFS_ALL_TRACE
+#define XFS_ALLOC_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef	XFS_ALLOC_TRACE
+#endif
+
+/*
+ * Prototypes for visible xfs_alloc.c routines
+ */
+
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+	struct xfs_mount	*mp);	/* file system mount structure */
+
+/*
+ * Decide whether to use this allocation group for this allocation.
+ * If so, fix up the btree freelist's size.
+ * This is external so mkfs can call it, too.
+ */
+int				/* error */
+xfs_alloc_fix_freelist(
+	xfs_alloc_arg_t *args,	/* allocation argument structure */
+	int		flags); /* XFS_ALLOC_FLAG_... */
+
+/*
+ * Get a block from the freelist.
+ * Returns with the buffer for the block gotten.
+ */
+int				/* error */
+xfs_alloc_get_freelist(
+	struct xfs_trans *tp,	/* transaction pointer */
+	struct xfs_buf	*agbp,	/* buffer containing the agf structure */
+	xfs_agblock_t	*bnop); /* block address retrieved from freelist */
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+	struct xfs_trans *tp,	/* transaction pointer */
+	struct xfs_buf	*bp,	/* buffer for a.g. freelist header */
+	int		fields);/* mask of fields to be logged (XFS_AGF_...) */
+
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int				/* error */
+xfs_alloc_pagf_init(
+	struct xfs_mount *mp,	/* file system mount structure */
+	struct xfs_trans *tp,	/* transaction pointer */
+	xfs_agnumber_t	agno,	/* allocation group number */
+	int		flags); /* XFS_ALLOC_FLAGS_... */
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int				/* error */
+xfs_alloc_put_freelist(
+	struct xfs_trans *tp,	/* transaction pointer */
+	struct xfs_buf	*agbp,	/* buffer for a.g. freelist header */
+	struct xfs_buf	*agflbp,/* buffer for a.g. free block array */
+	xfs_agblock_t	bno);	/* block being freed */
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int					/* error  */
+xfs_alloc_read_agf(
+	struct xfs_mount *mp,		/* mount point structure */
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	int		flags,		/* XFS_ALLOC_FLAG_... */
+	struct xfs_buf	**bpp);		/* buffer for the ag freelist header */
+
+/*
+ * Allocate an extent (variable-size).
+ */
+int				/* error */
+xfs_alloc_vextent(
+	xfs_alloc_arg_t *args); /* allocation argument structure */
+
+/*
+ * Free an extent.
+ */
+int				/* error */
+xfs_free_extent(
+	struct xfs_trans *tp,	/* transaction pointer */
+	xfs_fsblock_t	bno,	/* starting block number of extent */
+	xfs_extlen_t	len);	/* length of extent */
+
+void
+xfs_alloc_mark_busy(xfs_trans_t *tp,
+		xfs_agnumber_t agno,
+		xfs_agblock_t bno,
+		xfs_extlen_t len);
+
+void
+xfs_alloc_clear_busy(xfs_trans_t *tp,
+		xfs_agnumber_t ag,
+		int idx);
+
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_ALLOC_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_alloc_btree.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_alloc_btree.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_alloc_btree.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_alloc_btree.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,2155 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Free space allocation for XFS.
+ */
+
+#include <xfs.h>
+
+/*
+ * Prototypes for internal functions.
+ */
+
+STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int);
+STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *);
+STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
+		xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
+STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
+
+/*
+ * Internal functions.
+ */
+
+/*
+ * Single level of the xfs_alloc_delete record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Set *stat to 0 if there was nothing to delete, 1 if done, or 2 to go
+ * on to the next level.
+ */
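+/*
+ * The caller is expected to walk back up the tree, invoking this
+ * routine once per level for as long as *stat comes back as 2.
+ */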
+STATIC int				/* error */
+xfs_alloc_delrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level removing record from */
+	int			*stat)	/* fail/done/go-on */
+{
+	xfs_agf_t		*agf;	/* allocation group freelist header */
+	xfs_alloc_block_t	*block; /* btree block record/key lives in */
+	xfs_agblock_t		bno;	/* btree block number */
+	xfs_buf_t		*bp;	/* buffer for block */
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_alloc_key_t		key;	/* kp points here if block is level 0 */
+	xfs_agblock_t		lbno;	/* left block's block number */
+	xfs_buf_t		*lbp;	/* left block's buffer pointer */
+	xfs_alloc_block_t	*left;	/* left btree block */
+	xfs_alloc_key_t		*lkp=NULL;	/* left block key pointer */
+	xfs_alloc_ptr_t		*lpp=NULL;	/* left block address pointer */
+	int			lrecs=0;	/* number of records in left block */
+	xfs_alloc_rec_t		*lrp;	/* left block record pointer */
+	xfs_mount_t		*mp;	/* mount structure */
+	int			ptr;	/* index in btree block for this rec */
+	xfs_agblock_t		rbno;	/* right block's block number */
+	xfs_buf_t		*rbp;	/* right block's buffer pointer */
+	xfs_alloc_block_t	*right; /* right btree block */
+	xfs_alloc_key_t		*rkp;	/* right block key pointer */
+	xfs_alloc_ptr_t		*rpp;	/* right block address pointer */
+	int			rrecs=0;	/* number of records in right block */
+	xfs_alloc_rec_t		*rrp;	/* right block record pointer */
+	xfs_btree_cur_t		*tcur;	/* temporary btree cursor */
+
+	/*
+	 * Get the index of the entry being deleted, check for nothing there.
+	 */
+	ptr = cur->bc_ptrs[level];
+	if (ptr == 0) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Get the buffer & block containing the record or key/ptr.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+		return error;
+#endif
+	/*
+	 * Fail if we're off the end of the block.
+	 */
+	if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 0;
+		return 0;
+	}
+	XFS_STATS_INC(xfsstats.xs_abt_delrec);
+	/*
+	 * It's a nonleaf.  Excise the key and ptr being deleted, by
+	 * sliding the entries past them down one.
+	 * Log the changed areas of the block.
+	 */
+	if (level > 0) {
+		lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
+		lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = ptr; i < INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+			ovbcopy(&lkp[ptr], &lkp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lkp)); /* INT_: mem copy */
+			ovbcopy(&lpp[ptr], &lpp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lpp)); /* INT_: mem copy */
+			xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+			xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+		}
+	}
+	/*
+	 * It's a leaf.	 Excise the record being deleted, by sliding the
+	 * entries past it down one.  Log the changed areas of the block.
+	 */
+	else {
+		lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+			ovbcopy(&lrp[ptr], &lrp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lrp));
+			xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+		}
+		/*
+		 * If it's the first record in the block, we'll need a key
+		 * structure to pass up to the next level (updkey).
+		 */
+		if (ptr == 1) {
+			key.ar_startblock = lrp->ar_startblock; /* INT_: direct copy */
+			key.ar_blockcount = lrp->ar_blockcount; /* INT_: direct copy */
+			lkp = &key;
+		}
+	}
+	/*
+	 * Decrement and log the number of entries in the block.
+	 */
+	INT_MOD(block->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
+	/*
+	 * See if the longest free extent in the allocation group was
+	 * changed by this operation.  True if it's the by-size btree, and
+	 * this is the leaf level, and there is no right sibling block,
+	 * and this was the last record.
+	 */
+	agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+	mp = cur->bc_mp;
+
+	if (level == 0 &&
+	    cur->bc_btnum == XFS_BTNUM_CNT &&
+	    INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
+	    ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		ASSERT(ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT) + 1);
+		/*
+		 * There are still records in the block.  Grab the size
+		 * from the last one.
+		 */
+		if (INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+			rrp = XFS_ALLOC_REC_ADDR(block, INT_GET(block->bb_numrecs, ARCH_CONVERT), cur);
+			INT_COPY(agf->agf_longest, rrp->ar_blockcount, ARCH_CONVERT);
+		}
+		/*
+		 * No free extents left.
+		 */
+		else
+			INT_ZERO(agf->agf_longest, ARCH_CONVERT);
+		mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest =
+			INT_GET(agf->agf_longest, ARCH_CONVERT);
+		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
+			XFS_AGF_LONGEST);
+	}
+	/*
+	 * Is this the root level?  If so, we're almost done.
+	 */
+	if (level == cur->bc_nlevels - 1) {
+		/*
+		 * If this is the root level,
+		 * and there's only one entry left,
+		 * and it's NOT the leaf level,
+		 * then we can get rid of this level.
+		 */
+		if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == 1 && level > 0) {
+			/*
+			 * lpp is still set to the first pointer in the block.
+			 * Make it the new root of the btree.
+			 */
+			bno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT);
+			INT_COPY(agf->agf_roots[cur->bc_btnum], *lpp, ARCH_CONVERT);
+			INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, -1);
+			mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_levels[cur->bc_btnum]--;
+			/*
+			 * Put this buffer/block on the ag's freelist.
+			 */
+			if ((error = xfs_alloc_put_freelist(cur->bc_tp,
+					cur->bc_private.a.agbp, NULL, bno)))
+				return error;
+			xfs_trans_agbtree_delta(cur->bc_tp, -1);
+			xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
+				XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+			/*
+			 * Update the cursor so there's one fewer level.
+			 */
+			xfs_btree_setbuf(cur, level, 0);
+			cur->bc_nlevels--;
+		} else if (level > 0 &&
+			   (error = xfs_alloc_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * If we deleted the leftmost entry in the block, update the
+	 * key values above us in the tree.
+	 */
+	if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
+		return error;
+	/*
+	 * If the number of records remaining in the block is at least
+	 * the minimum, we're done.
+	 */
+	if (INT_GET(block->bb_numrecs, ARCH_CONVERT) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
+		if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * Otherwise, we have to move some records around to keep the
+	 * tree balanced.  Look at the left and right sibling blocks to
+	 * see if we can re-balance by moving only one record.
+	 */
+	rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+	lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
+	bno = NULLAGBLOCK;
+	ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
+	/*
+	 * Duplicate the cursor so our btree manipulations here won't
+	 * disrupt the next level up.
+	 */
+	if ((error = xfs_btree_dup_cursor(cur, &tcur)))
+		return error;
+	/*
+	 * If there's a right sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (rbno != NULLAGBLOCK) {
+		/*
+		 * Move the temp cursor to the last entry in the next block.
+		 * Actually any entry but the first would suffice.
+		 */
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_increment(tcur, level, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Grab a pointer to the block.
+		 */
+		rbp = tcur->bc_bufs[level];
+		right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+			goto error0;
+#endif
+		/*
+		 * Grab the current block number, for future use.
+		 */
+		bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
+		/*
+		 * If right block is full enough so that removing one entry
+		 * won't make it too empty, and left-shifting an entry out
+		 * of right to us works, we're done.
+		 */
+		if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
+		     XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
+			if ((error = xfs_alloc_lshift(tcur, level, &i)))
+				goto error0;
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_ALLOC_BLOCK_MINRECS(level, cur));
+				xfs_btree_del_cursor(tcur,
+						     XFS_BTREE_NOERROR);
+				if (level > 0 &&
+				    (error = xfs_alloc_decrement(cur, level,
+					    &i)))
+					return error;
+				*stat = 1;
+				return 0;
+			}
+		}
+		/*
+		 * Otherwise, grab the number of records in right for
+		 * future reference, and fix up the temp cursor to point
+		 * to our block again (last record).
+		 */
+		rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
+		if (lbno != NULLAGBLOCK) {
+			i = xfs_btree_firstrec(tcur, level);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if ((error = xfs_alloc_decrement(tcur, level, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		}
+	}
+	/*
+	 * If there's a left sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (lbno != NULLAGBLOCK) {
+		/*
+		 * Move the temp cursor to the first entry in the
+		 * previous block.
+		 */
+		i = xfs_btree_firstrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_decrement(tcur, level, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		xfs_btree_firstrec(tcur, level);
+		/*
+		 * Grab a pointer to the block.
+		 */
+		lbp = tcur->bc_bufs[level];
+		left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+			goto error0;
+#endif
+		/*
+		 * Grab the current block number, for future use.
+		 */
+		bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
+		/*
+		 * If left block is full enough so that removing one entry
+		 * won't make it too empty, and right-shifting an entry out
+		 * of left to us works, we're done.
+		 */
+		if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
+		     XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
+			if ((error = xfs_alloc_rshift(tcur, level, &i)))
+				goto error0;
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_ALLOC_BLOCK_MINRECS(level, cur));
+				xfs_btree_del_cursor(tcur,
+						     XFS_BTREE_NOERROR);
+				if (level == 0)
+					cur->bc_ptrs[0]++;
+				*stat = 1;
+				return 0;
+			}
+		}
+		/*
+		 * Otherwise, grab the number of records in right for
+		 * future reference.
+		 */
+		lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	/*
+	 * Delete the temp cursor, we're done with it.
+	 */
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	/*
+	 * If here, we need to do a join to keep the tree balanced.
+	 */
+	ASSERT(bno != NULLAGBLOCK);
+	/*
+	 * See if we can join with the left neighbor block.
+	 */
+	if (lbno != NULLAGBLOCK &&
+	    lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * Set "right" to be the starting block,
+		 * "left" to be the left neighbor.
+		 */
+		rbno = bno;
+		right = block;
+		rbp = bp;
+		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+				cur->bc_private.a.agno, lbno, 0, &lbp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+		if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+			return error;
+	}
+	/*
+	 * If that won't work, see if we can join with the right neighbor block.
+	 */
+	else if (rbno != NULLAGBLOCK &&
+		 rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+		  XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * Set "left" to be the starting block,
+		 * "right" to be the right neighbor.
+		 */
+		lbno = bno;
+		left = block;
+		lbp = bp;
+		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+				cur->bc_private.a.agno, rbno, 0, &rbp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+		if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+			return error;
+	}
+	/*
+	 * Otherwise, we can't fix the imbalance.
+	 * Just return.	 This is probably a logic error, but it's not fatal.
+	 */
+	else {
+		if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * We're now going to join "left" and "right" by moving all the stuff
+	 * in "right" to "left" and deleting "right".
+	 */
+	if (level > 0) {
+		/*
+		 * It's a non-leaf.  Move keys and pointers.
+		 */
+		lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
+		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		bcopy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lkp)); /* INT_: structure copy */
+		bcopy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lpp)); /* INT_: structure copy */
+		xfs_alloc_log_keys(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_alloc_log_ptrs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	} else {
+		/*
+		 * It's a leaf.	 Move records.
+		 */
+		lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
+		bcopy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lrp));
+		xfs_alloc_log_recs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	}
+	/*
+	 * If we joined with the left neighbor, set the buffer in the
+	 * cursor to the left block, and fix up the index.
+	 */
+	if (bp != lbp) {
+		xfs_btree_setbuf(cur, level, lbp);
+		cur->bc_ptrs[level] += INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	/*
+	 * If we joined with the right neighbor and there's a level above
+	 * us, increment the cursor at that level.
+	 */
+	else if (level + 1 < cur->bc_nlevels &&
+		 (error = xfs_alloc_increment(cur, level + 1, &i)))
+		return error;
+	/*
+	 * Fix up the number of records in the surviving block.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	/*
+	 * Fix up the right block pointer in the surviving block, and log it.
+	 */
+	left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */
+	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+	/*
+	 * If there is a right sibling now, make it point to the
+	 * remaining block.
+	 */
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		xfs_alloc_block_t	*rrblock;
+		xfs_buf_t		*rrbp;
+
+		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+				cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
+				&rrbp, XFS_ALLOC_BTREE_REF)))
+			return error;
+		rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
+		if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
+			return error;
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
+		xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
+	}
+	/*
+	 * Free the deleting block by putting it on the freelist.
+	 */
+	if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+			NULL, rbno)))
+		return error;
+	xfs_trans_agbtree_delta(cur->bc_tp, -1);
+	/*
+	 * Adjust the current level's cursor so that we're left referring
+	 * to the correct node, after we're done.
+	 * If this leaves the ptr value 0, our caller will fix it up.
+	 */
+	if (level > 0)
+		cur->bc_ptrs[level]--;
+	/*
+	 * Return value means the next level up has something to do.
+	 */
+	*stat = 2;
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
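+
+/*
+ * Note on the *stat protocol of xfs_alloc_delrec(): 0 means the record
+ * could not be deleted, 1 means this level is done, and 2 means a join
+ * was performed here, so the caller must repeat the delete one level up;
+ * see the loop in xfs_alloc_delete() below.
+ */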
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int				/* error */
+xfs_alloc_insrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to insert record at */
+	xfs_agblock_t		*bnop,	/* i/o: block number inserted */
+	xfs_alloc_rec_t		*recp,	/* i/o: record data inserted */
+	xfs_btree_cur_t		**curp, /* output: new cursor replacing cur */
+	int			*stat)	/* output: success/failure */
+{
+	xfs_agf_t		*agf;	/* allocation group freelist header */
+	xfs_alloc_block_t	*block; /* btree block record/key lives in */
+	xfs_buf_t		*bp;	/* buffer for block */
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_alloc_key_t		key;	/* key value being inserted */
+	xfs_alloc_key_t		*kp;	/* pointer to btree keys */
+	xfs_agblock_t		nbno;	/* block number of allocated block */
+	xfs_btree_cur_t		*ncur;	/* new cursor to be used at next lvl */
+	xfs_alloc_key_t		nkey;	/* new key value, from split */
+	xfs_alloc_rec_t		nrec;	/* new record value, for caller */
+	int			optr;	/* old ptr value */
+	xfs_alloc_ptr_t		*pp;	/* pointer to btree addresses */
+	int			ptr;	/* index in btree block for this rec */
+	xfs_alloc_rec_t		*rp;	/* pointer to btree records */
+
+	ASSERT(INT_GET(recp->ar_blockcount, ARCH_CONVERT) > 0);
+	/*
+	 * If we made it to the root level, allocate a new root block
+	 * and we're done.
+	 */
+	if (level >= cur->bc_nlevels) {
+		XFS_STATS_INC(xfsstats.xs_abt_insrec);
+		if ((error = xfs_alloc_newroot(cur, &i)))
+			return error;
+		*bnop = NULLAGBLOCK;
+		*stat = i;
+		return 0;
+	}
+	/*
+	 * Make a key out of the record data to be inserted, and save it.
+	 */
+	key.ar_startblock = recp->ar_startblock; /* INT_: direct copy */
+	key.ar_blockcount = recp->ar_blockcount; /* INT_: direct copy */
+	optr = ptr = cur->bc_ptrs[level];
+	/*
+	 * If we're off the left edge, return failure.
+	 */
+	if (ptr == 0) {
+		*stat = 0;
+		return 0;
+	}
+	XFS_STATS_INC(xfsstats.xs_abt_insrec);
+	/*
+	 * Get pointers to the btree buffer and block.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+		return error;
+	/*
+	 * Check that the new entry is being inserted in the right place.
+	 */
+	if (ptr <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		if (level == 0) {
+			rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
+			xfs_btree_check_rec(cur->bc_btnum, recp, rp);
+		} else {
+			kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
+			xfs_btree_check_key(cur->bc_btnum, &key, kp);
+		}
+	}
+#endif
+	nbno = NULLAGBLOCK;
+	ncur = (xfs_btree_cur_t *)0;
+	/*
+	 * If the block is full, we can't insert the new entry until we
+	 * make the block un-full.
+	 */
+	if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * First, try shifting an entry to the right neighbor.
+		 */
+		if ((error = xfs_alloc_rshift(cur, level, &i)))
+			return error;
+		if (i) {
+			/* nothing */
+		}
+		/*
+		 * Next, try shifting an entry to the left neighbor.
+		 */
+		else {
+			if ((error = xfs_alloc_lshift(cur, level, &i)))
+				return error;
+			if (i)
+				optr = ptr = cur->bc_ptrs[level];
+			else {
+				/*
+				 * Next, try splitting the current block in
+				 * half. If this works we have to re-set our
+				 * variables because we could be in a
+				 * different block now.
+				 */
+				if ((error = xfs_alloc_split(cur, level, &nbno,
+						&nkey, &ncur, &i)))
+					return error;
+				if (i) {
+					bp = cur->bc_bufs[level];
+					block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+					if ((error =
+						xfs_btree_check_sblock(cur,
+							block, level, bp)))
+						return error;
+#endif
+					ptr = cur->bc_ptrs[level];
+					nrec.ar_startblock = nkey.ar_startblock; /* INT_: direct copy */
+					nrec.ar_blockcount = nkey.ar_blockcount; /* INT_: direct copy */
+				}
+				/*
+				 * Otherwise the insert fails.
+				 */
+				else {
+					*stat = 0;
+					return 0;
+				}
+			}
+		}
+	}
+	/*
+	 * At this point we know there's room for our new entry in the block
+	 * we're pointing at.
+	 */
+	if (level > 0) {
+		/*
+		 * It's a non-leaf entry.  Make a hole for the new data
+		 * in the key and ptr regions of the block.
+		 */
+		kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
+		pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = INT_GET(block->bb_numrecs, ARCH_CONVERT); i >= ptr; i--) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		ovbcopy(&kp[ptr - 1], &kp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*kp)); /* INT_: copy */
+		ovbcopy(&pp[ptr - 1], &pp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*pp)); /* INT_: copy */
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
+			return error;
+#endif
+		/*
+		 * Now stuff the new data in, bump numrecs and log the new data.
+		 */
+		kp[ptr - 1] = key;
+		INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
+		INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
+		xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+		xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+#ifdef DEBUG
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT))
+			xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
+				kp + ptr);
+#endif
+	} else {
+		/*
+		 * It's a leaf entry.  Make a hole for the new record.
+		 */
+		rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
+		ovbcopy(&rp[ptr - 1], &rp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*rp));
+		/*
+		 * Now stuff the new record in, bump numrecs
+		 * and log the new data.
+		 */
+		rp[ptr - 1] = *recp; /* INT_: struct copy */
+		INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
+		xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+#ifdef DEBUG
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT))
+			xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
+				rp + ptr);
+#endif
+	}
+	/*
+	 * Log the new number of records in the btree header.
+	 */
+	xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
+	/*
+	 * If we inserted at the start of a block, update the parents' keys.
+	 */
+	if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
+		return error;
+	/*
+	 * Look to see if the longest extent in the allocation group
+	 * needs to be updated.
+	 */
+	agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+	if (level == 0 &&
+	    cur->bc_btnum == XFS_BTNUM_CNT &&
+	    INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
+	    INT_GET(recp->ar_blockcount, ARCH_CONVERT) > INT_GET(agf->agf_longest, ARCH_CONVERT)) {
+		/*
+		 * If this is a leaf in the by-size btree and there
+		 * is no right sibling block and this block is bigger
+		 * than the previous longest block, update it.
+		 */
+		INT_COPY(agf->agf_longest, recp->ar_blockcount, ARCH_CONVERT);
+		cur->bc_mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest
+			= INT_GET(recp->ar_blockcount, ARCH_CONVERT);
+		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
+			XFS_AGF_LONGEST);
+	}
+	/*
+	 * Return the new block number, if any.
+	 * If there is one, give back a record value and a cursor too.
+	 */
+	*bnop = nbno;
+	if (nbno != NULLAGBLOCK) {
+		*recp = nrec; /* INT_: struct copy */
+		*curp = ncur; /* INT_: struct copy */
+	}
+	*stat = 1;
+	return 0;
+}
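+
+/*
+ * How the insert pieces fit together (illustrative summary): when a
+ * block is full, xfs_alloc_insrec() tries a right shift, then a left
+ * shift, then a split.  Only a split hands back a non-NULLAGBLOCK
+ * *bnop, which makes the loop in xfs_alloc_insert() below call back
+ * in at the next level up.
+ */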
+
+/*
+ * Log header fields from a btree block.
+ */
+STATIC void
+xfs_alloc_log_block(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			fields) /* mask of fields: XFS_BB_... */
+{
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	static const short	offsets[] = {	/* table of offsets */
+		offsetof(xfs_alloc_block_t, bb_magic),
+		offsetof(xfs_alloc_block_t, bb_level),
+		offsetof(xfs_alloc_block_t, bb_numrecs),
+		offsetof(xfs_alloc_block_t, bb_leftsib),
+		offsetof(xfs_alloc_block_t, bb_rightsib),
+		sizeof(xfs_alloc_block_t)
+	};
+
+	xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
+	xfs_trans_log_buf(tp, bp, first, last);
+}
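+
+/*
+ * Note: xfs_btree_offsets() turns the field bitmask into one contiguous
+ * byte range, from the first byte of the lowest-numbered field to the
+ * last byte of the highest; any fields lying in between are logged as
+ * well, which is harmless.
+ */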
+
+/*
+ * Log keys from a btree block (nonleaf).
+ */
+STATIC void
+xfs_alloc_log_keys(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			kfirst, /* index of first key to log */
+	int			klast)	/* index of last key to log */
+{
+	xfs_alloc_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	xfs_alloc_key_t		*kp;	/* key pointer in btree block */
+	int			last;	/* last byte offset logged */
+
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+	kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_alloc_log_ptrs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			pfirst, /* index of first pointer to log */
+	int			plast)	/* index of last pointer to log */
+{
+	xfs_alloc_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	xfs_alloc_ptr_t		*pp;	/* block-pointer pointer in btree blk */
+
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+	pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Log records from a btree block (leaf).
+ */
+STATIC void
+xfs_alloc_log_recs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			rfirst, /* index of first record to log */
+	int			rlast)	/* index of last record to log */
+{
+	xfs_alloc_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	xfs_alloc_rec_t		*rp;	/* record pointer for btree block */
+
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+	rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
+#ifdef DEBUG
+	{
+		xfs_agf_t	*agf;
+		xfs_alloc_rec_t *p;
+
+		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+		for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
+			ASSERT(INT_GET(p->ar_startblock, ARCH_CONVERT) + INT_GET(p->ar_blockcount, ARCH_CONVERT) <=
+			       INT_GET(agf->agf_length, ARCH_CONVERT));
+	}
+#endif
+	first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Look up the record.  The cursor is made to point to it, based on dir.
+ * *stat is set to 0 if no such record is found, 1 for success.
+ */
+STATIC int				/* error */
+xfs_alloc_lookup(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_lookup_t		dir,	/* <=, ==, or >= */
+	int			*stat)	/* success/failure */
+{
+	xfs_agblock_t		agbno;	/* a.g. relative btree block number */
+	xfs_agnumber_t		agno;	/* allocation group number */
+	xfs_alloc_block_t	*block=NULL;	/* current btree block */
+	int			diff;	/* difference for the current key */
+	int			error;	/* error return value */
+	int			keyno=0;	/* current key number */
+	int			level;	/* level in the btree */
+	xfs_mount_t		*mp;	/* file system mount point */
+
+	XFS_STATS_INC(xfsstats.xs_abt_lookup);
+	/*
+	 * Get the allocation group header, and the root block number.
+	 */
+	mp = cur->bc_mp;
+
+	{
+		xfs_agf_t	*agf;	/* a.g. freespace header */
+
+		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+		agno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
+		agbno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT);
+	}
+	/*
+	 * Iterate over each level in the btree, starting at the root.
+	 * For each level above the leaves, find the key we need, based
+	 * on the lookup record, then follow the corresponding block
+	 * pointer down to the next level.
+	 */
+	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+		xfs_buf_t	*bp;	/* buffer pointer for btree block */
+		xfs_daddr_t	d;	/* disk address of btree block */
+
+		/*
+		 * Get the disk address we're looking for.
+		 */
+		d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+		/*
+		 * If the old buffer at this level is for a different block,
+		 * throw it away, otherwise just use it.
+		 */
+		bp = cur->bc_bufs[level];
+		if (bp && XFS_BUF_ADDR(bp) != d)
+			bp = (xfs_buf_t *)0;
+		if (!bp) {
+			/*
+			 * Need to get a new buffer.  Read it, then
+			 * set it in the cursor, releasing the old one.
+			 */
+			if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
+					agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
+				return error;
+			xfs_btree_setbuf(cur, level, bp);
+			/*
+			 * Point to the btree block, now that we have the buffer
+			 */
+			block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+			if ((error = xfs_btree_check_sblock(cur, block, level,
+					bp)))
+				return error;
+		} else
+			block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+		/*
+		 * If we already had a key match at a higher level, we know
+		 * we need to use the first entry in this block.
+		 */
+		if (diff == 0)
+			keyno = 1;
+		/*
+		 * Otherwise we need to search this block.  Do a binary search.
+		 */
+		else {
+			int		high;	/* high entry number */
+			xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
+			xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
+			int		low;	/* low entry number */
+
+			/*
+			 * Get a pointer to keys or records.
+			 */
+			if (level > 0)
+				kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
+			else
+				krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
+			/*
+			 * Set low and high entry numbers, 1-based.
+			 */
+			low = 1;
+			if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
+				/*
+				 * If the block is empty, the tree must
+				 * be an empty leaf.
+				 */
+				ASSERT(level == 0 && cur->bc_nlevels == 1);
+				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+				*stat = 0;
+				return 0;
+			}
+			/*
+			 * Binary search the block.
+			 */
+			while (low <= high) {
+				xfs_extlen_t	blockcount;	/* key value */
+				xfs_agblock_t	startblock;	/* key value */
+
+				XFS_STATS_INC(xfsstats.xs_abt_compare);
+				/*
+				 * keyno is average of low and high.
+				 */
+				keyno = (low + high) >> 1;
+				/*
+				 * Get startblock & blockcount.
+				 */
+				if (level > 0) {
+					xfs_alloc_key_t *kkp;
+
+					kkp = kkbase + keyno - 1;
+					startblock = INT_GET(kkp->ar_startblock, ARCH_CONVERT);
+					blockcount = INT_GET(kkp->ar_blockcount, ARCH_CONVERT);
+				} else {
+					xfs_alloc_rec_t *krp;
+
+					krp = krbase + keyno - 1;
+					startblock = INT_GET(krp->ar_startblock, ARCH_CONVERT);
+					blockcount = INT_GET(krp->ar_blockcount, ARCH_CONVERT);
+				}
+				/*
+				 * Compute difference to get next direction.
+				 */
+				if (cur->bc_btnum == XFS_BTNUM_BNO)
+					diff = (int)startblock -
+					       (int)cur->bc_rec.a.ar_startblock;
+				else if (!(diff = (int)blockcount -
+					    (int)cur->bc_rec.a.ar_blockcount))
+					diff = (int)startblock -
+					    (int)cur->bc_rec.a.ar_startblock;
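+				/*
+				 * Note: the by-bno tree is ordered on
+				 * startblock alone, while the by-size
+				 * tree orders on blockcount and breaks
+				 * ties on startblock, hence the two-step
+				 * compare above for the cnt btree case.
+				 */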
+				/*
+				 * Less than, move right.
+				 */
+				if (diff < 0)
+					low = keyno + 1;
+				/*
+				 * Greater than, move left.
+				 */
+				else if (diff > 0)
+					high = keyno - 1;
+				/*
+				 * Equal, we're done.
+				 */
+				else
+					break;
+			}
+		}
+		/*
+		 * If there are more levels, set up for the next level
+		 * by getting the block number and filling in the cursor.
+		 */
+		if (level > 0) {
+			/*
+			 * If we moved left, need the previous key number,
+			 * unless there isn't one.
+			 */
+			if (diff > 0 && --keyno < 1)
+				keyno = 1;
+			agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, keyno, cur), ARCH_CONVERT);
+#ifdef DEBUG
+			if ((error = xfs_btree_check_sptr(cur, agbno, level)))
+				return error;
+#endif
+			cur->bc_ptrs[level] = keyno;
+		}
+	}
+	/*
+	 * Done with the search.
+	 * See if we need to adjust the results.
+	 */
+	if (dir != XFS_LOOKUP_LE && diff < 0) {
+		keyno++;
+		/*
+		 * If ge search and we went off the end of the block, but it's
+		 * not the last block, we're in the wrong block.
+		 */
+		if (dir == XFS_LOOKUP_GE &&
+		    keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
+		    INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			int	i;
+
+			cur->bc_ptrs[0] = keyno;
+			if ((error = xfs_alloc_increment(cur, 0, &i)))
+				return error;
+			XFS_WANT_CORRUPTED_RETURN(i == 1);
+			*stat = 1;
+			return 0;
+		}
+	}
+	else if (dir == XFS_LOOKUP_LE && diff > 0)
+		keyno--;
+	cur->bc_ptrs[0] = keyno;
+	/*
+	 * Return if we succeeded or not.
+	 */
+	if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT))
+		*stat = 0;
+	else
+		*stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
+	return 0;
+}
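+
+/*
+ * Note: on return cur->bc_ptrs[0] holds the 1-based index of the entry
+ * in the leaf block; a value of 0, or greater than bb_numrecs, means
+ * the cursor is off the edge of the block, hence the *stat = 0 cases
+ * above.
+ */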
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int				/* error */
+xfs_alloc_lshift(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to shift record on */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+#ifdef DEBUG
+	int			i;	/* loop index */
+#endif
+	xfs_alloc_key_t		key;	/* key value for leaf level upward */
+	xfs_buf_t		*lbp;	/* buffer for left neighbor block */
+	xfs_alloc_block_t	*left;	/* left neighbor btree block */
+	int			nrec;	/* new number of left block entries */
+	xfs_buf_t		*rbp;	/* buffer for right (current) block */
+	xfs_alloc_block_t	*right; /* right (current) btree block */
+	xfs_alloc_key_t		*rkp=NULL;	/* key pointer for right block */
+	xfs_alloc_ptr_t		*rpp=NULL;	/* address pointer for right block */
+	xfs_alloc_rec_t		*rrp=NULL;	/* record pointer for right block */
+
+	/*
+	 * Set up variables for this block as "right".
+	 */
+	rbp = cur->bc_bufs[level];
+	right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+		return error;
+#endif
+	/*
+	 * If we've got no left sibling then we can't shift an entry left.
+	 */
+	if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	if (cur->bc_ptrs[level] <= 1) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Set up the left neighbor as "left".
+	 */
+	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.a.agno, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, &lbp,
+			XFS_ALLOC_BTREE_REF)))
+		return error;
+	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+	/*
+	 * If it's full, it can't take another entry.
+	 */
+	if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+		*stat = 0;
+		return 0;
+	}
+	nrec = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
+	/*
+	 * If non-leaf, copy a key and a ptr to the left block.
+	 */
+	if (level > 0) {
+		xfs_alloc_key_t *lkp;	/* key pointer for left block */
+		xfs_alloc_ptr_t *lpp;	/* address pointer for left block */
+
+		lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur);
+		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
+		*lkp = *rkp;
+		xfs_alloc_log_keys(cur, lbp, nrec, nrec);
+		lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
+		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, INT_GET(*rpp, ARCH_CONVERT), level)))
+			return error;
+#endif
+		*lpp = *rpp; /* INT_: copy */
+		xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
+		xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
+	}
+	/*
+	 * If leaf, copy a record to the left block.
+	 */
+	else {
+		xfs_alloc_rec_t *lrp;	/* record pointer for left block */
+
+		lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur);
+		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
+		*lrp = *rrp;
+		xfs_alloc_log_recs(cur, lbp, nrec, nrec);
+		xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
+	}
+	/*
+	 * Bump and log left's numrecs, decrement and log right's numrecs.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, +1);
+	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
+	INT_MOD(right->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
+	/*
+	 * Slide the contents of right down one entry.
+	 */
+	if (level > 0) {
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
+					level)))
+				return error;
+		}
+#endif
+		ovbcopy(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		ovbcopy(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+		xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	} else {
+		ovbcopy(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
+		key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
+		rkp = &key;
+	}
+	/*
+	 * Update the parent key values of right.
+	 */
+	if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
+		return error;
+	/*
+	 * Slide the cursor value left one.
+	 */
+	cur->bc_ptrs[level]--;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int				/* error */
+xfs_alloc_newroot(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+	xfs_agblock_t		lbno;	/* left block number */
+	xfs_buf_t		*lbp;	/* left btree buffer */
+	xfs_alloc_block_t	*left;	/* left btree block */
+	xfs_mount_t		*mp;	/* mount structure */
+	xfs_agblock_t		nbno;	/* new block number */
+	xfs_buf_t		*nbp;	/* new (root) buffer */
+	xfs_alloc_block_t	*new;	/* new (root) btree block */
+	int			nptr;	/* new value for key index, 1 or 2 */
+	xfs_agblock_t		rbno;	/* right block number */
+	xfs_buf_t		*rbp;	/* right btree buffer */
+	xfs_alloc_block_t	*right; /* right btree block */
+
+	mp = cur->bc_mp;
+
+	ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp));
+	/*
+	 * Get a buffer from the freelist blocks, for the new root.
+	 */
+	if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+			&nbno)))
+		return error;
+	/*
+	 * None available, we fail.
+	 */
+	if (nbno == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	xfs_trans_agbtree_delta(cur->bc_tp, 1);
+	nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
+		0);
+	new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
+	/*
+	 * Set the root data in the a.g. freespace structure.
+	 */
+	{
+		xfs_agf_t	*agf;	/* a.g. freespace header */
+		xfs_agnumber_t	seqno;
+
+		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+		INT_SET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT, nbno);
+		INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, 1);
+		seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
+		mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
+		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
+			XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+	}
+	/*
+	 * At the previous root level there are now two blocks: the old
+	 * root, and the new block generated when it was split.
+	 * We don't know which one the cursor is pointing at, so we
+	 * set up variables "left" and "right" for each case.
+	 */
+	lbp = cur->bc_bufs[cur->bc_nlevels - 1];
+	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp)))
+		return error;
+#endif
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		/*
+		 * Our block is left, pick up the right block.
+		 */
+		lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
+		rbno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+				cur->bc_private.a.agno, rbno, 0, &rbp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+		if ((error = xfs_btree_check_sblock(cur, right,
+				cur->bc_nlevels - 1, rbp)))
+			return error;
+		nptr = 1;
+	} else {
+		/*
+		 * Our block is right, pick up the left block.
+		 */
+		rbp = lbp;
+		right = left;
+		rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
+		lbno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+				cur->bc_private.a.agno, lbno, 0, &lbp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+		if ((error = xfs_btree_check_sblock(cur, left,
+				cur->bc_nlevels - 1, lbp)))
+			return error;
+		nptr = 2;
+	}
+	/*
+	 * Fill in the new block's btree header and log it.
+	 */
+	INT_SET(new->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
+	INT_SET(new->bb_level, ARCH_CONVERT, (__uint16_t)cur->bc_nlevels);
+	INT_SET(new->bb_numrecs, ARCH_CONVERT, 2);
+	INT_SET(new->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
+	INT_SET(new->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
+	xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
+	ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
+	/*
+	 * Fill in the key data in the new root.
+	 */
+	{
+		xfs_alloc_key_t		*kp;	/* btree key pointer */
+
+		kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
+		if (INT_GET(left->bb_level, ARCH_CONVERT) > 0) {
+			kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */
+			kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */
+		} else {
+			xfs_alloc_rec_t *rp;	/* btree record pointer */
+
+			rp = XFS_ALLOC_REC_ADDR(left, 1, cur);
+			kp[0].ar_startblock = rp->ar_startblock; /* INT_: direct copy */
+			kp[0].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */
+			rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
+			kp[1].ar_startblock = rp->ar_startblock; /* INT_: direct copy */
+			kp[1].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */
+		}
+	}
+	xfs_alloc_log_keys(cur, nbp, 1, 2);
+	/*
+	 * Fill in the pointer data in the new root.
+	 */
+	{
+		xfs_alloc_ptr_t		*pp;	/* btree address pointer */
+
+		pp = XFS_ALLOC_PTR_ADDR(new, 1, cur);
+		INT_SET(pp[0], ARCH_CONVERT, lbno);
+		INT_SET(pp[1], ARCH_CONVERT, rbno);
+	}
+	xfs_alloc_log_ptrs(cur, nbp, 1, 2);
+	/*
+	 * Fix up the cursor.
+	 */
+	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+	cur->bc_ptrs[cur->bc_nlevels] = nptr;
+	cur->bc_nlevels++;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int				/* error */
+xfs_alloc_rshift(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to shift record on */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_alloc_key_t		key;	/* key value for leaf level upward */
+	xfs_buf_t		*lbp;	/* buffer for left (current) block */
+	xfs_alloc_block_t	*left;	/* left (current) btree block */
+	xfs_buf_t		*rbp;	/* buffer for right neighbor block */
+	xfs_alloc_block_t	*right; /* right neighbor btree block */
+	xfs_alloc_key_t		*rkp;	/* key pointer for right block */
+	xfs_btree_cur_t		*tcur;	/* temporary cursor */
+
+	/*
+	 * Set up variables for this block as "left".
+	 */
+	lbp = cur->bc_bufs[level];
+	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+#endif
+	/*
+	 * If we've got no right sibling then we can't shift an entry right.
+	 */
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Set up the right neighbor as "right".
+	 */
+	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rbp,
+			XFS_ALLOC_BTREE_REF)))
+		return error;
+	right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+	if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+		return error;
+	/*
+	 * If it's full, it can't take another entry.
+	 */
+	if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Make a hole at the start of the right neighbor block, then
+	 * copy the last left block entry to the hole.
+	 */
+	if (level > 0) {
+		xfs_alloc_key_t *lkp;	/* key pointer for left block */
+		xfs_alloc_ptr_t *lpp;	/* address pointer for left block */
+		xfs_alloc_ptr_t *rpp;	/* address pointer for right block */
+
+		lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
+		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		ovbcopy(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		ovbcopy(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, INT_GET(*lpp, ARCH_CONVERT), level)))
+			return error;
+#endif
+		*rkp = *lkp; /* INT_: copy */
+		*rpp = *lpp; /* INT_: copy */
+		xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
+	} else {
+		xfs_alloc_rec_t *lrp;	/* record pointer for left block */
+		xfs_alloc_rec_t *rrp;	/* record pointer for right block */
+
+		lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
+		ovbcopy(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		*rrp = *lrp;
+		xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
+		key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
+		rkp = &key;
+		xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
+	}
+	/*
+	 * Decrement and log left's numrecs, bump and log right's numrecs.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
+	INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+	xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
+	/*
+	 * Using a temporary cursor, update the parent key values of the
+	 * block on the right.
+	 */
+	if ((error = xfs_btree_dup_cursor(cur, &tcur)))
+		return error;
+	i = xfs_btree_lastrec(tcur, level);
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	if ((error = xfs_alloc_increment(tcur, level, &i)) ||
+	    (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
+		goto error0;
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	*stat = 1;
+	return 0;
+error0:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
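+
+/*
+ * Note: the temporary cursor above is needed because the parent key
+ * that must be updated belongs to the right neighbor's path, not ours;
+ * duplicating cur and incrementing the copy reaches it without
+ * disturbing the caller's cursor.
+ */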
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and its first record (to be inserted into parent).
+ */
+STATIC int				/* error */
+xfs_alloc_split(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to split */
+	xfs_agblock_t		*bnop,	/* output: block number allocated */
+	xfs_alloc_key_t		*keyp,	/* output: first key of new block */
+	xfs_btree_cur_t		**curp, /* output: new cursor */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+	int			i;	/* loop index/record number */
+	xfs_agblock_t		lbno;	/* left (current) block number */
+	xfs_buf_t		*lbp;	/* buffer for left block */
+	xfs_alloc_block_t	*left;	/* left (current) btree block */
+	xfs_agblock_t		rbno;	/* right (new) block number */
+	xfs_buf_t		*rbp;	/* buffer for right block */
+	xfs_alloc_block_t	*right; /* right (new) btree block */
+
+	/*
+	 * Allocate the new block from the freelist.
+	 * If we can't do it, we're toast.  Give up.
+	 */
+	if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+			&rbno)))
+		return error;
+	if (rbno == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	xfs_trans_agbtree_delta(cur->bc_tp, 1);
+	rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
+		rbno, 0);
+	/*
+	 * Set up the new block as "right".
+	 */
+	right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+	/*
+	 * "Left" is the current (according to the cursor) block.
+	 */
+	lbp = cur->bc_bufs[level];
+	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+#endif
+	/*
+	 * Fill in the btree header for the new block.
+	 */
+	INT_SET(right->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
+	right->bb_level = left->bb_level; /* INT_: direct copy */
+	INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
+	/*
+	 * Make sure that if there's an odd number of entries now, that
+	 * each new block will have the same number of entries.
+	 */
+	if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
+	    cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
+		INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+	i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
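+	/*
+	 * Worked example (illustrative): with 9 entries and the cursor
+	 * at entry 3, right->bb_numrecs becomes 4 + 1 = 5 and i becomes
+	 * 9 - 5 + 1 = 5, so entries 5 through 9 move right while 1
+	 * through 4 stay left; the pending insert then brings the left
+	 * block back up to 5 entries as well.
+	 */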
+	/*
+	 * For non-leaf blocks, copy keys and addresses over to the new block.
+	 */
+	if (level > 0) {
+		xfs_alloc_key_t *lkp;	/* left btree key pointer */
+		xfs_alloc_ptr_t *lpp;	/* left btree address pointer */
+		xfs_alloc_key_t *rkp;	/* right btree key pointer */
+		xfs_alloc_ptr_t *rpp;	/* right btree address pointer */
+
+		lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
+		lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
+		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
+		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		bcopy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); /* INT_: copy */
+		bcopy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));/* INT_: copy */
+		xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		*keyp = *rkp;
+	}
+	/*
+	 * For leaf blocks, copy records over to the new block.
+	 */
+	else {
+		xfs_alloc_rec_t *lrp;	/* left btree record pointer */
+		xfs_alloc_rec_t *rrp;	/* right btree record pointer */
+
+		lrp = XFS_ALLOC_REC_ADDR(left, i, cur);
+		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
+		bcopy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		keyp->ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
+		keyp->ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
+	}
+	/*
+	 * Find the left block number by looking in the buffer.
+	 * Adjust numrecs, sibling pointers.
+	 */
+	lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
+	right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
+	INT_SET(left->bb_rightsib, ARCH_CONVERT, rbno);
+	INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
+	xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
+	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+	/*
+	 * If there's a block to the new block's right, make that block
+	 * point back to right instead of to left.
+	 */
+	if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		xfs_alloc_block_t	*rrblock;	/* rr btree block */
+		xfs_buf_t		*rrbp;		/* buffer for rrblock */
+
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.a.agno, INT_GET(right->bb_rightsib, ARCH_CONVERT), 0,
+				&rrbp, XFS_ALLOC_BTREE_REF)))
+			return error;
+		rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
+		if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
+			return error;
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, rbno);
+		xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
+	}
+	/*
+	 * If the cursor is really in the right block, move it there.
+	 * If it's just pointing past the last entry in left, then we'll
+	 * insert there, so don't change anything in that case.
+	 */
+	if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
+		xfs_btree_setbuf(cur, level, rbp);
+		cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	/*
+	 * If there are more levels, we'll need another cursor which refers to
+	 * the right block, no matter where this cursor was.
+	 */
+	if (level + 1 < cur->bc_nlevels) {
+		if ((error = xfs_btree_dup_cursor(cur, curp)))
+			return error;
+		(*curp)->bc_ptrs[level + 1]++;
+	}
+	*bnop = rbno;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int				/* error */
+xfs_alloc_updkey(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_alloc_key_t		*keyp,	/* new key value to update to */
+	int			level)	/* starting level for update */
+{
+	int			ptr;	/* index of key in block */
+
+	/*
+	 * Go up the tree from this level toward the root.
+	 * At each level, update the key value to the value input.
+	 * Stop when we reach a level where the cursor isn't pointing
+	 * at the first entry in the block.
+	 */
+	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+		xfs_alloc_block_t	*block; /* btree block */
+		xfs_buf_t		*bp;	/* buffer for block */
+#ifdef DEBUG
+		int			error;	/* error return value */
+#endif
+		xfs_alloc_key_t		*kp;	/* ptr to btree block keys */
+
+		bp = cur->bc_bufs[level];
+		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+			return error;
+#endif
+		ptr = cur->bc_ptrs[level];
+		kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
+		*kp = *keyp;
+		xfs_alloc_log_keys(cur, bp, ptr, ptr);
+	}
+	return 0;
+}
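+
+/*
+ * Example: when a record is inserted at entry 1 of a leaf, the leaf's
+ * smallest key has changed, so xfs_alloc_updkey() rewrites the parent
+ * keys level by level until it reaches an entry that is not the first
+ * in its block (ptr != 1); above that point the ancestors' keys are
+ * already correct.
+ */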
+
+/*
+ * Externally visible routines.
+ */
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_alloc_decrement(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat)	/* success/failure */
+{
+	xfs_alloc_block_t	*block; /* btree block */
+	int			error;	/* error return value */
+	int			lev;	/* btree level */
+
+	ASSERT(level < cur->bc_nlevels);
+	/*
+	 * Read-ahead to the left at this level.
+	 */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+	/*
+	 * Decrement the ptr at this level.  If we're still in the block
+	 * then we're done.
+	 */
+	if (--cur->bc_ptrs[level] > 0) {
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * Get a pointer to the btree block.
+	 */
+	block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level,
+			cur->bc_bufs[level])))
+		return error;
+#endif
+	/*
+	 * If we just went off the left edge of the tree, return failure.
+	 */
+	if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * March up the tree decrementing pointers.
+	 * Stop when we don't go off the left edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		if (--cur->bc_ptrs[lev] > 0)
+			break;
+		/*
+		 * Read-ahead the left block, we're going to read it
+		 * in the next loop.
+		 */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+	}
+	/*
+	 * If we went off the root then we are seriously confused.
+	 */
+	ASSERT(lev < cur->bc_nlevels);
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
+		xfs_agblock_t	agbno;	/* block number of btree block */
+		xfs_buf_t	*bp;	/* buffer pointer for block */
+
+		agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.a.agno, agbno, 0, &bp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
+		cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * On return, the cursor refers to the place where the record was
+ * (i.e. where it could be re-inserted).
+ */
+int					/* error */
+xfs_alloc_delete(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	int		*stat)		/* success/failure */
+{
+	int		error;		/* error return value */
+	int		i;		/* result code */
+	int		level;		/* btree level */
+
+	/*
+	 * Go up the tree, starting at leaf level.
+	 * If 2 is returned then a join was done; go to the next level.
+	 * Otherwise we are done.
+	 */
+	for (level = 0, i = 2; i == 2; level++) {
+		if ((error = xfs_alloc_delrec(cur, level, &i)))
+			return error;
+	}
+	if (i == 0) {
+		for (level = 1; level < cur->bc_nlevels; level++) {
+			if (cur->bc_ptrs[level] == 0) {
+				if ((error = xfs_alloc_decrement(cur, level, &i)))
+					return error;
+				break;
+			}
+		}
+	}
+	*stat = i;
+	return 0;
+}
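+
+/*
+ * Note: the bc_ptrs[level] == 0 pass above is the fix-up promised in
+ * xfs_alloc_delrec(): a join can leave a level's cursor index at 0,
+ * and the decrement repositions it onto a real entry.
+ */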
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_alloc_get_rec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_agblock_t		*bno,	/* output: starting block of extent */
+	xfs_extlen_t		*len,	/* output: length of extent */
+	int			*stat)	/* output: success/failure */
+{
+	xfs_alloc_block_t	*block; /* btree block */
+#ifdef DEBUG
+	int			error;	/* error return value */
+#endif
+	int			ptr;	/* record number */
+
+	ptr = cur->bc_ptrs[0];
+	block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
+		return error;
+#endif
+	/*
+	 * Off the right end or left end, return failure.
+	 */
+	if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Point to the record and extract its data.
+	 */
+	{
+		xfs_alloc_rec_t		*rec;	/* record data */
+
+		rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
+		*bno = INT_GET(rec->ar_startblock, ARCH_CONVERT);
+		*len = INT_GET(rec->ar_blockcount, ARCH_CONVERT);
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_alloc_increment(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat)	/* success/failure */
+{
+	xfs_alloc_block_t	*block; /* btree block */
+	xfs_buf_t		*bp;	/* tree block buffer */
+	int			error;	/* error return value */
+	int			lev;	/* btree level */
+
+	ASSERT(level < cur->bc_nlevels);
+	/*
+	 * Read-ahead to the right at this level.
+	 */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+	/*
+	 * Get a pointer to the btree block.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+		return error;
+#endif
+	/*
+	 * Increment the ptr at this level.  If we're still in the block
+	 * then we're done.
+	 */
+	if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * If we just went off the right edge of the tree, return failure.
+	 */
+	if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * March up the tree incrementing pointers.
+	 * Stop when we don't go off the right edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		bp = cur->bc_bufs[lev];
+		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
+#endif
+		if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
+			break;
+		/*
+		 * Read-ahead the right block, we're going to read it
+		 * in the next loop.
+		 */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+	}
+	/*
+	 * If we went off the root then we are seriously confused.
+	 */
+	ASSERT(lev < cur->bc_nlevels);
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+	     lev > level; ) {
+		xfs_agblock_t	agbno;	/* block number of btree block */
+
+		agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.a.agno, agbno, 0, &bp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
+		cur->bc_ptrs[lev] = 1;
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Insert the current record at the point referenced by cur.
+ * The cursor may be inconsistent on return if splits have been done.
+ */
+int					/* error */
+xfs_alloc_insert(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	int		*stat)		/* success/failure */
+{
+	int		error;		/* error return value */
+	int		i;		/* result value, 0 for failure */
+	int		level;		/* current level number in btree */
+	xfs_agblock_t	nbno;		/* new block number (split result) */
+	xfs_btree_cur_t *ncur;		/* new cursor (split result) */
+	xfs_alloc_rec_t nrec;		/* record being inserted this level */
+	xfs_btree_cur_t *pcur;		/* previous level's cursor */
+
+	level = 0;
+	nbno = NULLAGBLOCK;
+	INT_SET(nrec.ar_startblock, ARCH_CONVERT, cur->bc_rec.a.ar_startblock);
+	INT_SET(nrec.ar_blockcount, ARCH_CONVERT, cur->bc_rec.a.ar_blockcount);
+	ncur = (xfs_btree_cur_t *)0;
+	pcur = cur;
+	/*
+	 * Loop going up the tree, starting at the leaf level.
+	 * Stop when we don't get a split block, that must mean that
+	 * the insert is finished with this level.
+	 */
+	do {
+		/*
+		 * Insert nrec/nbno into this level of the tree.
+		 * Note if we fail, nbno will be null.
+		 */
+		if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
+				&i))) {
+			if (pcur != cur)
+				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+			return error;
+		}
+		/*
+		 * See if the cursor we just used is trash.
+		 * We can't trash the caller's cursor, but otherwise we
+		 * should trash it if ncur is a new cursor or we're about
+		 * to be done.
+		 */
+		if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
+			cur->bc_nlevels = pcur->bc_nlevels;
+			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+		}
+		/*
+		 * If we got a new cursor, switch to it.
+		 */
+		if (ncur) {
+			pcur = ncur;
+			ncur = (xfs_btree_cur_t *)0;
+		}
+	} while (nbno != NULLAGBLOCK);
+	*stat = i;
+	return 0;
+}
+
+/*
+ * Look up the record equal to [bno, len] in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_eq(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agblock_t	bno,		/* starting block of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Look up the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_ge(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agblock_t	bno,		/* starting block of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Look up the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_le(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agblock_t	bno,		/* starting block of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
+}
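+
+/*
+ * A typical caller pattern for the lookup/get_rec pair (sketch only;
+ * fbno and flen are the caller's own locals):
+ *
+ *	if ((error = xfs_alloc_lookup_ge(cur, bno, len, &i)))
+ *		return error;
+ *	if (i && (error = xfs_alloc_get_rec(cur, &fbno, &flen, &i)))
+ *		return error;
+ */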
+
+/*
+ * Update the record referred to by cur, to the value given by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+int					/* error */
+xfs_alloc_update(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len)	/* length of extent */
+{
+	xfs_alloc_block_t	*block; /* btree block to update */
+	int			error;	/* error return value */
+	int			ptr;	/* current record number (updating) */
+
+	ASSERT(len > 0);
+	/*
+	 * Pick up the a.g. freelist struct and the current block.
+	 */
+	block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
+		return error;
+#endif
+	/*
+	 * Get the address of the rec to be updated.
+	 */
+	ptr = cur->bc_ptrs[0];
+	{
+		xfs_alloc_rec_t		*rp;	/* pointer to updated record */
+
+		rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
+		/*
+		 * Fill in the new contents and log them.
+		 */
+		INT_SET(rp->ar_startblock, ARCH_CONVERT, bno);
+		INT_SET(rp->ar_blockcount, ARCH_CONVERT, len);
+		xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
+	}
+	/*
+	 * If it's the by-size btree and it's the last leaf block and
+	 * it's the last record... then update the size of the longest
+	 * extent in the a.g., which we cache in the a.g. freelist header.
+	 */
+	if (cur->bc_btnum == XFS_BTNUM_CNT &&
+	    INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
+	    ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		xfs_agf_t	*agf;	/* a.g. freespace header */
+		xfs_agnumber_t	seqno;
+
+		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+		seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
+		cur->bc_mp->m_perag[seqno].pagf_longest = len;
+		INT_SET(agf->agf_longest, ARCH_CONVERT, len);
+		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
+			XFS_AGF_LONGEST);
+	}
+	/*
+	 * Updating first record in leaf. Pass new key value up to our parent.
+	 */
+	if (ptr == 1) {
+		xfs_alloc_key_t key;	/* key containing [bno, len] */
+
+		INT_SET(key.ar_startblock, ARCH_CONVERT, bno);
+		INT_SET(key.ar_blockcount, ARCH_CONVERT, len);
+		if ((error = xfs_alloc_updkey(cur, &key, 1)))
+			return error;
+	}
+	return 0;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_alloc_btree.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_alloc_btree.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_alloc_btree.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_alloc_btree.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ALLOC_BTREE_H__
+#define __XFS_ALLOC_BTREE_H__
+
+/*
+ * Freespace on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_btree_sblock;
+struct xfs_mount;
+
+/*
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno.  All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define XFS_ABTB_MAGIC	0x41425442	/* 'ABTB' for bno tree */
+#define XFS_ABTC_MAGIC	0x41425443	/* 'ABTC' for cnt tree */
+
+/*
+ * Data record/key structure
+ */
+typedef struct xfs_alloc_rec
+{
+	xfs_agblock_t	ar_startblock;	/* starting block number */
+	xfs_extlen_t	ar_blockcount;	/* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
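+
+/*
+ * Note that the key and the record share one layout: the freespace
+ * btrees index directly on the record's own fields, so no separate
+ * key structure is needed.
+ */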
+
+typedef xfs_agblock_t xfs_alloc_ptr_t;	/* btree pointer type */
+					/* btree block header type */
+typedef struct xfs_btree_sblock xfs_alloc_block_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_ALLOC_BLOCK)
+xfs_alloc_block_t *xfs_buf_to_alloc_block(struct xfs_buf *bp);
+#define XFS_BUF_TO_ALLOC_BLOCK(bp)	xfs_buf_to_alloc_block(bp)
+#else
+#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)(XFS_BUF_PTR(bp)))
+#endif
+
+/*
+ * Real block structures have a size equal to the disk block size.
+ */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_SIZE)
+int xfs_alloc_block_size(int lev, struct xfs_btree_cur *cur);
+#define XFS_ALLOC_BLOCK_SIZE(lev,cur)	xfs_alloc_block_size(lev,cur)
+#else
+#define XFS_ALLOC_BLOCK_SIZE(lev,cur)	(1 << (cur)->bc_blocklog)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MAXRECS)
+int xfs_alloc_block_maxrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur)	xfs_alloc_block_maxrecs(lev,cur)
+#else
+#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur)	\
+	((cur)->bc_mp->m_alloc_mxr[lev != 0])
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MINRECS)
+int xfs_alloc_block_minrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_ALLOC_BLOCK_MINRECS(lev,cur)	xfs_alloc_block_minrecs(lev,cur)
+#else
+#define XFS_ALLOC_BLOCK_MINRECS(lev,cur)	\
+	((cur)->bc_mp->m_alloc_mnr[lev != 0])
+#endif
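+
+/*
+ * Note: m_alloc_mxr[] and m_alloc_mnr[] are computed at mount time from
+ * the filesystem block size; index 0 is for leaf blocks (records only),
+ * index 1 for non-leaf blocks (keys plus pointers), and the minimum is
+ * half the maximum so that joins and splits keep blocks at least half
+ * full.
+ */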
+
+/*
+ * Minimum and maximum blocksize.
+ * The blocksize upper limit is pretty much arbitrary.
+ */
+#define XFS_MIN_BLOCKSIZE_LOG	9	/* i.e. 512 bytes */
+#define XFS_MAX_BLOCKSIZE_LOG	16	/* i.e. 65536 bytes */
+#define XFS_MIN_BLOCKSIZE	(1 << XFS_MIN_BLOCKSIZE_LOG)
+#define XFS_MAX_BLOCKSIZE	(1 << XFS_MAX_BLOCKSIZE_LOG)
+
+/*
+ * block numbers in the AG; SB is BB 0, AGF is BB 1, AGI is BB 2, AGFL is BB 3
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BNO_BLOCK)
+xfs_agblock_t xfs_bno_block(struct xfs_mount *mp);
+#define XFS_BNO_BLOCK(mp)	xfs_bno_block(mp)
+#else
+#define XFS_BNO_BLOCK(mp)	((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CNT_BLOCK)
+xfs_agblock_t xfs_cnt_block(struct xfs_mount *mp);
+#define XFS_CNT_BLOCK(mp)	xfs_cnt_block(mp)
+#else
+#define XFS_CNT_BLOCK(mp)	((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+#endif
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_REC_ADDR)
+xfs_alloc_rec_t *xfs_alloc_rec_addr(xfs_alloc_block_t *bb, int i,
+				    struct xfs_btree_cur *cur);
+#define XFS_ALLOC_REC_ADDR(bb,i,cur)	xfs_alloc_rec_addr(bb,i,cur)
+#else
+#define XFS_ALLOC_REC_ADDR(bb,i,cur)	\
+	XFS_BTREE_REC_ADDR(XFS_ALLOC_BLOCK_SIZE(0,cur), xfs_alloc, bb, i, \
+		XFS_ALLOC_BLOCK_MAXRECS(0, cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_KEY_ADDR)
+xfs_alloc_key_t *xfs_alloc_key_addr(xfs_alloc_block_t *bb, int i,
+				    struct xfs_btree_cur *cur);
+#define XFS_ALLOC_KEY_ADDR(bb,i,cur)	xfs_alloc_key_addr(bb,i,cur)
+#else
+#define XFS_ALLOC_KEY_ADDR(bb,i,cur)	\
+	XFS_BTREE_KEY_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \
+		XFS_ALLOC_BLOCK_MAXRECS(1, cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_PTR_ADDR)
+xfs_alloc_ptr_t *xfs_alloc_ptr_addr(xfs_alloc_block_t *bb, int i,
+				    struct xfs_btree_cur *cur);
+#define XFS_ALLOC_PTR_ADDR(bb,i,cur)	xfs_alloc_ptr_addr(bb,i,cur)
+#else
+#define XFS_ALLOC_PTR_ADDR(bb,i,cur)	\
+	XFS_BTREE_PTR_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \
+		XFS_ALLOC_BLOCK_MAXRECS(1, cur))
+#endif
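+/*
+ * Rough sketch of an alloc btree block (illustrative; see xfs_btree.h
+ * for the authoritative layout): an xfs_btree_sblock header, followed
+ * by records (for level-0 leaf blocks) or by a key array and then a
+ * pointer array (for level > 0 node blocks).  That is why the REC
+ * macro sizes things with level 0 while the KEY/PTR macros use level 1,
+ * and why the PTR macro needs MAXRECS to step over the key array.
+ * Record/key/pointer indices here are 1-based.
+ */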
+
+/*
+ * Prototypes for externally visible routines.
+ */
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_alloc_decrement(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat); /* success/failure */
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (i.e. where it could
+ * be re-inserted) when the operation returns.
+ */
+int					/* error */
+xfs_alloc_delete(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			*stat); /* success/failure */
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_alloc_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		*bno,	/* output: starting block of extent */
+	xfs_extlen_t		*len,	/* output: length of extent */
+	int			*stat); /* output: success/failure */
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_alloc_increment(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat); /* success/failure */
+
+/*
+ * Insert the current record at the point referenced by cur.
+ * The cursor may be inconsistent on return if splits have been done.
+ */
+int					/* error */
+xfs_alloc_insert(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_eq(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_ge(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_le(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat); /* success/failure */
+
+/*
+ * Update the record referred to by cur, to the value given by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+int					/* error */
+xfs_alloc_update(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len);	/* length of extent */
+
+#endif	/* __XFS_ALLOC_BTREE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_arch.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_arch.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_arch.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_arch.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ARCH_H__
+#define __XFS_ARCH_H__
+
+#ifndef XFS_BIG_FILESYSTEMS
+#error XFS_BIG_FILESYSTEMS must be defined true or false
+#endif
+
+#ifdef __KERNEL__
+
+#include <asm/byteorder.h>
+
+#ifdef __LITTLE_ENDIAN
+# define __BYTE_ORDER	__LITTLE_ENDIAN
+#endif
+#ifdef __BIG_ENDIAN
+# define __BYTE_ORDER	__BIG_ENDIAN
+#endif
+
+#endif	/* __KERNEL__ */
+
+/* do we need conversion? */
+
+#define ARCH_NOCONVERT 1
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define ARCH_CONVERT   0
+#else
+#define ARCH_CONVERT   ARCH_NOCONVERT
+#endif
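+/*
+ * In other words: the XFS on-disk format is big-endian, so on a
+ * little-endian host ARCH_CONVERT (0) differs from ARCH_NOCONVERT (1)
+ * and the INT_* macros below will byte-swap, while on a big-endian
+ * host the two are equal and the macros collapse to plain accesses.
+ * Typical use looks like
+ *
+ *	magic = INT_GET(leaf->hdr.info.magic, ARCH_CONVERT);
+ *
+ * (illustrative; see the callers later in this patch).
+ */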
+
+/* generic swapping macros */
+
+#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var))))
+#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var))))
+#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var))))
+
+#define INT_SWAP(type, var) \
+    ((sizeof(type) == 8) ? INT_SWAP64(type,var) : \
+    ((sizeof(type) == 4) ? INT_SWAP32(type,var) : \
+    ((sizeof(type) == 2) ? INT_SWAP16(type,var) : \
+    (var))))
+
+#define INT_SWAP_UNALIGNED_32(from,to) \
+    { \
+	((__u8*)(to))[0] = ((__u8*)(from))[3]; \
+	((__u8*)(to))[1] = ((__u8*)(from))[2]; \
+	((__u8*)(to))[2] = ((__u8*)(from))[1]; \
+	((__u8*)(to))[3] = ((__u8*)(from))[0]; \
+    }
+
+#define INT_SWAP_UNALIGNED_64(from,to) \
+    { \
+	INT_SWAP_UNALIGNED_32( ((__u8*)(from)) + 4, ((__u8*)(to))); \
+	INT_SWAP_UNALIGNED_32( ((__u8*)(from)), ((__u8*)(to)) + 4); \
+    }
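+/*
+ * Net effect, for reference: INT_SWAP_UNALIGNED_64 reverses all eight
+ * bytes -- the two 32-bit halves are copied crosswise, and each half
+ * is itself byte-reversed by INT_SWAP_UNALIGNED_32.
+ */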
+
+/*
+ * get and set integers from potentially unaligned locations
+ */
+
+#define INT_GET_UNALIGNED_16_LE(pointer) \
+   ((__u16)((((__u8*)(pointer))[0]	) | (((__u8*)(pointer))[1] << 8 )))
+#define INT_GET_UNALIGNED_16_BE(pointer) \
+   ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1])))
+#define INT_SET_UNALIGNED_16_LE(pointer,value) \
+    { \
+	((__u8*)(pointer))[0] = (((value)     ) & 0xff); \
+	((__u8*)(pointer))[1] = (((value) >> 8) & 0xff); \
+    }
+#define INT_SET_UNALIGNED_16_BE(pointer,value) \
+    { \
+	((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \
+	((__u8*)(pointer))[1] = (((value)     ) & 0xff); \
+    }
+
+#define INT_GET_UNALIGNED_32_LE(pointer) \
+   ((__u32)((((__u8*)(pointer))[0]	) | (((__u8*)(pointer))[1] << 8 ) \
+	   |(((__u8*)(pointer))[2] << 16) | (((__u8*)(pointer))[3] << 24)))
+#define INT_GET_UNALIGNED_32_BE(pointer) \
+   ((__u32)((((__u8*)(pointer))[0] << 24) | (((__u8*)(pointer))[1] << 16) \
+	   |(((__u8*)(pointer))[2] << 8)  | (((__u8*)(pointer))[3]	)))
+
+#define INT_GET_UNALIGNED_64_LE(pointer) \
+   (((__u64)(INT_GET_UNALIGNED_32_LE(((__u8*)(pointer))+4)) << 32 ) \
+   |((__u64)(INT_GET_UNALIGNED_32_LE(((__u8*)(pointer))	 ))	  ))
+#define INT_GET_UNALIGNED_64_BE(pointer) \
+   (((__u64)(INT_GET_UNALIGNED_32_BE(((__u8*)(pointer))	 )) << 32  ) \
+   |((__u64)(INT_GET_UNALIGNED_32_BE(((__u8*)(pointer))+4))	   ))
+
+/*
+ * now pick the right ones for our MACHINE ARCHITECTURE
+ */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define INT_GET_UNALIGNED_16(pointer)	    INT_GET_UNALIGNED_16_LE(pointer)
+#define INT_SET_UNALIGNED_16(pointer,value) INT_SET_UNALIGNED_16_LE(pointer,value)
+#define INT_GET_UNALIGNED_32(pointer)	    INT_GET_UNALIGNED_32_LE(pointer)
+#define INT_GET_UNALIGNED_64(pointer)	    INT_GET_UNALIGNED_64_LE(pointer)
+#else
+#define INT_GET_UNALIGNED_16(pointer)	    INT_GET_UNALIGNED_16_BE(pointer)
+#define INT_SET_UNALIGNED_16(pointer,value) INT_SET_UNALIGNED_16_BE(pointer,value)
+#define INT_GET_UNALIGNED_32(pointer)	    INT_GET_UNALIGNED_32_BE(pointer)
+#define INT_GET_UNALIGNED_64(pointer)	    INT_GET_UNALIGNED_64_BE(pointer)
+#endif
+
+/* define generic INT_ macros */
+
+#define INT_GET(reference,arch) \
+    (((arch) == ARCH_NOCONVERT) \
+	? \
+	    (reference) \
+	: \
+	    INT_SWAP((reference),(reference)) \
+    )
+
+/* does not return a value */
+#define INT_SET(reference,arch,valueref) \
+    (__builtin_constant_p(valueref) ? \
+	(void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \
+	(void)( \
+	    ((reference) = (valueref)), \
+	    ( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \
+	) \
+    )
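+/*
+ * Note on INT_SET: for compile-time-constant values the swap is applied
+ * to the constant itself, so it can fold away at compile time; for
+ * variable values the store happens first and the field is then
+ * swapped in place.  E.g. (illustrative, names hypothetical)
+ *
+ *	INT_SET(hdr->count, ARCH_CONVERT, numrecs);
+ *
+ * stores numrecs in on-disk byte order.
+ */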
+
+/* does not return a value */
+#define INT_MOD_EXPR(reference,arch,code) \
+    (void)(((arch) == ARCH_NOCONVERT) \
+	? \
+	    ((reference) code) \
+	: \
+	    ( \
+		(reference) = INT_GET((reference),arch) , \
+		((reference) code), \
+		INT_SET(reference, arch, reference) \
+	    ) \
+    )
+
+/* does not return a value */
+#define INT_MOD(reference,arch,delta) \
+    (void)( \
+	INT_MOD_EXPR(reference,arch,+=(delta)) \
+    )
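+/*
+ * E.g. (illustrative) INT_MOD(hdr->count, ARCH_CONVERT, -1) decrements
+ * an on-disk counter in place: convert to native order, apply
+ * "+= (-1)", convert back.  When no conversion is needed it reduces to
+ * the bare "+=".
+ */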
+
+/*
+ * INT_COPY - copy a value between two locations with the
+ *	      _same architecture_ but _potentially different sizes_
+ *
+ *	    if the two parameters are the same size, or the data is
+ *		already in the native architecture, a simple copy is done
+ *
+ *	    otherwise, architecture conversions are done
+ *
+ */
+
+/* does not return a value */
+#define INT_COPY(dst,src,arch) \
+    (void)( \
+	((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \
+	    ? \
+		((dst) = (src)) \
+	    : \
+		INT_SET(dst, arch, INT_GET(src, arch)) \
+    )
+
+/*
+ * INT_XLATE - copy a value in either direction between two locations
+ *	       with different architectures
+ *
+ *		    dir < 0	- copy from memory to buffer (native to arch)
+ *		    dir > 0	- copy from buffer to memory (arch to native)
+ */
+
+/* does not return a value */
+#define INT_XLATE(buf,mem,dir,arch) {\
+    ASSERT(dir); \
+    if (dir>0) { \
+	(mem)=INT_GET(buf, arch); \
+    } else { \
+	INT_SET(buf, arch, mem); \
+    } \
+}
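+/*
+ * E.g. (illustrative, field names hypothetical) one translator routine
+ * can copy a field in either direction off a single flag:
+ *
+ *	INT_XLATE(dsb->sb_icount, sb->sb_icount, dir, ARCH_CONVERT);
+ *
+ * where dir > 0 pulls the on-disk value into memory and dir < 0 pushes
+ * the in-memory value out to the buffer.
+ */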
+
+#define INT_ISZERO(reference,arch) \
+    ((reference) == 0)
+
+#define INT_ZERO(reference,arch) \
+    ((reference) = 0)
+
+#define INT_GET_UNALIGNED_16_ARCH(pointer,arch) \
+    ( ((arch) == ARCH_NOCONVERT) \
+	? \
+	    (INT_GET_UNALIGNED_16(pointer)) \
+	: \
+	    (INT_GET_UNALIGNED_16_BE(pointer)) \
+    )
+#define INT_SET_UNALIGNED_16_ARCH(pointer,value,arch) \
+    if ((arch) == ARCH_NOCONVERT) { \
+	INT_SET_UNALIGNED_16(pointer,value); \
+    } else { \
+	INT_SET_UNALIGNED_16_BE(pointer,value); \
+    }
+
+#define DIRINO4_GET_ARCH(pointer,arch) \
+    ( ((arch) == ARCH_NOCONVERT) \
+	? \
+	    (INT_GET_UNALIGNED_32(pointer)) \
+	: \
+	    (INT_GET_UNALIGNED_32_BE(pointer)) \
+    )
+
+#if XFS_BIG_FILESYSTEMS
+#define DIRINO_GET_ARCH(pointer,arch) \
+    ( ((arch) == ARCH_NOCONVERT) \
+	? \
+	    (INT_GET_UNALIGNED_64(pointer)) \
+	: \
+	    (INT_GET_UNALIGNED_64_BE(pointer)) \
+    )
+#else
+/* MACHINE ARCHITECTURE dependent */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define DIRINO_GET_ARCH(pointer,arch) \
+    DIRINO4_GET_ARCH((((__u8*)pointer)+4),arch)
+#else
+#define DIRINO_GET_ARCH(pointer,arch) \
+    DIRINO4_GET_ARCH(pointer,arch)
+#endif
+#endif
+
+#define DIRINO_COPY_ARCH(from,to,arch) \
+    if ((arch) == ARCH_NOCONVERT) { \
+	bcopy(from,to,sizeof(xfs_ino_t)); \
+    } else { \
+	INT_SWAP_UNALIGNED_64(from,to); \
+    }
+#define DIRINO4_COPY_ARCH(from,to,arch) \
+    if ((arch) == ARCH_NOCONVERT) { \
+	bcopy((((__u8*)from+4)),to,sizeof(xfs_dir2_ino4_t)); \
+    } else { \
+	INT_SWAP_UNALIGNED_32(from,to); \
+    }
+
+#endif	/* __XFS_ARCH_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,2294 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+/*
+ * xfs_attr.c
+ *
+ * Provide the external interfaces to manage attribute lists.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute list fits inside the inode.
+ */
+STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
+
+/*
+ * Internal routines when attribute list is one block.
+ */
+STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
+
+/*
+ * Internal routines when attribute list is more than one block.
+ */
+STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
+STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
+STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
+
+/*
+ * Routines to manipulate out-of-line attribute values.
+ */
+STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args);
+STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args);
+STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
+
+#define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
+#define ATTR_RMTVALUE_TRANSBLKS 8	/* max # of blks in a transaction */
+
+#if defined(DEBUG)
+ktrace_t *xfs_attr_trace_buf;
+#endif
+
+
+
+/*========================================================================
+ * Overall external interface routines.
+ *========================================================================*/
+
+/*ARGSUSED*/
+int								/* error */
+xfs_attr_get(bhv_desc_t *bdp, char *name, char *value, int *valuelenp,
+	     int flags, struct cred *cred)
+{
+	xfs_da_args_t	args;
+	int		error;
+	int		namelen;
+	xfs_inode_t	*ip = XFS_BHVTOI(bdp);
+
+	if (!name)
+		return EINVAL;
+	ASSERT(MAXNAMELEN-1 <= 0xff);	/* length is stored in uint8 */
+	namelen = strlen(name);
+	if (namelen >= MAXNAMELEN)
+		return EFAULT;		/* match IRIX behaviour */
+	XFS_STATS_INC(xfsstats.xs_attr_get);
+
+	if (XFS_IFORK_Q(ip) == 0)
+		return ENOATTR;
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return (EIO);
+
+	/*
+	 * Do we answer them, or ignore them?
+	 */
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	if ((error = xfs_iaccess(XFS_BHVTOI(bdp), IREAD, cred))) {
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return(XFS_ERROR(error));
+	}
+
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	bzero((char *)&args, sizeof(args));
+	args.name = name;
+	args.namelen = namelen;
+	args.value = value;
+	args.valuelen = *valuelenp;
+	args.flags = flags;
+	args.hashval = xfs_da_hashname(args.name, args.namelen);
+	args.dp = ip;
+	args.whichfork = XFS_ATTR_FORK;
+	args.trans = NULL;
+
+	/*
+	 * Decide on what work routines to call based on the attribute
+	 * fork format and size.
+	 */
+	if (XFS_IFORK_Q(ip) == 0 ||
+	    (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     ip->i_d.di_anextents == 0)) {
+		error = XFS_ERROR(ENOATTR);
+	} else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_attr_shortform_getvalue(&args);
+	} else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_get(&args);
+	} else {
+		error = xfs_attr_node_get(&args);
+	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	/*
+	 * Return the number of bytes in the value to the caller.
+	 */
+	*valuelenp = args.valuelen;
+
+	if (error == EEXIST)
+		error = 0;
+	return(error);
+}
+
+/*ARGSUSED*/
+int								/* error */
+xfs_attr_set(bhv_desc_t *bdp, char *name, char *value, int valuelen, int flags,
+		     struct cred *cred)
+{
+	xfs_da_args_t	args;
+	xfs_inode_t	*dp;
+	xfs_fsblock_t	firstblock;
+	xfs_bmap_free_t flist;
+	int		error, err2, committed;
+	int		local, size;
+	uint		nblks;
+	xfs_mount_t	*mp;
+	int		rsvd = (flags & ATTR_ROOT) != 0;
+	int		namelen;
+
+	ASSERT(MAXNAMELEN-1 <= 0xff); /* length is stored in uint8 */
+	namelen = strlen(name);
+	if (namelen >= MAXNAMELEN)
+		return EFAULT;		/* match IRIX behaviour */
+
+	XFS_STATS_INC(xfsstats.xs_attr_set);
+	/*
+	 * Do we answer them, or ignore them?
+	 */
+	dp = XFS_BHVTOI(bdp);
+	mp = dp->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return (EIO);
+
+	xfs_ilock(dp, XFS_ILOCK_SHARED);
+	if ((error = xfs_iaccess(dp, IWRITE, cred))) {
+		xfs_iunlock(dp, XFS_ILOCK_SHARED);
+		return(XFS_ERROR(error));
+	}
+	xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+	/*
+	 * Attach the dquots to the inode.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if ((error = xfs_qm_dqattach(dp, 0)))
+			return (error);
+	}
+
+	/*
+	 * If the inode doesn't have an attribute fork, add one.
+	 * (inode must not be locked when we call this routine)
+	 */
+	if (XFS_IFORK_Q(dp) == 0) {
+		error = xfs_bmap_add_attrfork(dp, rsvd);
+		if (error)
+			return(error);
+	}
+
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	bzero((char *)&args, sizeof(args));
+	args.name = name;
+	args.namelen = namelen;
+	args.value = value;
+	args.valuelen = valuelen;
+	args.flags = flags;
+	args.hashval = xfs_da_hashname(args.name, args.namelen);
+	args.dp = dp;
+	args.firstblock = &firstblock;
+	args.flist = &flist;
+	args.whichfork = XFS_ATTR_FORK;
+	args.oknoent = 1;
+
+	/* Determine the space the new attribute will use, and whether it
+	 * will be inline or out of line.
+	 */
+	size = xfs_attr_leaf_newentsize(&args, mp->m_sb.sb_blocksize, &local);
+
+	nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+	if (local) {
+		if (size > (mp->m_sb.sb_blocksize >> 1)) {
+			/* Double split possible */
+			nblks <<= 1;
+		}
+	} else {
+		uint	dblocks = XFS_B_TO_FSB(mp, valuelen);
+		/* Out of line attribute, cannot double split, but make
+		 * room for the attribute value itself.
+		 */
+		nblks += dblocks;
+		nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+	}
+
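+	/*
+	 * Illustrative arithmetic: on a filesystem with 4 KiB blocks, a
+	 * 64 KiB out-of-line value would add XFS_B_TO_FSB(mp, 65536) =
+	 * 16 data blocks here, plus whatever XFS_NEXTENTADD_SPACE_RES
+	 * allows for growing the bmap btree by those extents, on top of
+	 * the da-btree reservation.
+	 */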
+	/* Size is now blocks for attribute data */
+	args.total = nblks;
+
+	/*
+	 * Start our first transaction of the day.
+	 *
+	 * All future transactions during this code must be "chained" off
+	 * this one via the trans_dup() call.  All transactions will contain
+	 * the inode, and the inode will always be marked with trans_ihold().
+	 * Since the inode will be locked in all transactions, we must log
+	 * the inode in every transaction to let it float upward through
+	 * the log.
+	 */
+	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+
+	/*
+	 * Root fork attributes can use reserved data blocks for this
+	 * operation if necessary
+	 */
+
+	if (rsvd)
+		args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+	if ((error = xfs_trans_reserve(args.trans, (uint) nblks,
+				      XFS_ATTRSET_LOG_RES(mp, nblks),
+				      0, XFS_TRANS_PERM_LOG_RES,
+				      XFS_ATTRSET_LOG_COUNT))) {
+		xfs_trans_cancel(args.trans, 0);
+		return(error);
+	}
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (rsvd) {
+			error = xfs_trans_reserve_blkquota_force(args.trans,
+					dp, nblks);
+		} else {
+			error = xfs_trans_reserve_blkquota(args.trans,
+					dp, nblks);
+		}
+		if (error) {
+			xfs_iunlock(dp, XFS_ILOCK_EXCL);
+			xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+			return (error);
+		}
+	}
+
+	xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(args.trans, dp);
+
+	/*
+	 * If the attribute list is non-existent or a shortform list,
+	 * upgrade it to a single-leaf-block attribute list.
+	 */
+	if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
+	    ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) &&
+	     (dp->i_d.di_anextents == 0))) {
+
+		/*
+		 * Build initial attribute list (if required).
+		 */
+		if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+			(void)xfs_attr_shortform_create(&args);
+
+		/*
+		 * Try to add the attr to the attribute list in
+		 * the inode.
+		 */
+		error = xfs_attr_shortform_addname(&args);
+		if (error != ENOSPC) {
+			/*
+			 * Commit the shortform mods, and we're done.
+			 * NOTE: this is also the error path (EEXIST, etc).
+			 */
+			ASSERT(args.trans != NULL);
+
+			/*
+			 * If this is a synchronous mount, make sure that
+			 * the transaction goes to disk before returning
+			 * to the user.
+			 */
+			if (mp->m_flags & XFS_MOUNT_WSYNC) {
+				xfs_trans_set_sync(args.trans);
+			}
+			err2 = xfs_trans_commit(args.trans,
+						 XFS_TRANS_RELEASE_LOG_RES,
+						 NULL);
+			xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+			/*
+			 * Hit the inode change time.
+			 */
+			if (!error && (flags & ATTR_KERNOTIME) == 0) {
+				xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
+			}
+			return(error == 0 ? err2 : error);
+		}
+
+		/*
+		 * It won't fit in the shortform, transform to a leaf block.
+		 * GROT: another possible req'mt for a double-split btree op.
+		 */
+		XFS_BMAP_INIT(args.flist, args.firstblock);
+		error = xfs_attr_shortform_to_leaf(&args);
+		if (!error) {
+			error = xfs_bmap_finish(&args.trans, args.flist,
+						*args.firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args.trans = NULL;
+			xfs_bmap_cancel(&flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args.trans, dp);
+		}
+
+		/*
+		 * Commit the leaf transformation.  We'll need another (linked)
+		 * transaction to add the new attribute to the leaf.
+		 */
+		if ((error = xfs_attr_rolltrans(&args.trans, dp)))
+			goto out;
+
+	}
+
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_addname(&args);
+	} else {
+		error = xfs_attr_node_addname(&args);
+	}
+	if (error) {
+		goto out;
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(args.trans);
+	}
+
+	/*
+	 * Commit the last in the sequence of transactions.
+	 */
+	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
+				 NULL);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * Hit the inode change time.
+	 */
+	if (!error && (flags & ATTR_KERNOTIME) == 0) {
+		xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
+	}
+
+	return(error);
+
+out:
+	if (args.trans)
+		xfs_trans_cancel(args.trans,
+			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return(error);
+}
+
+/*
+ * Generic handler routine to remove a name from an attribute list.
+ * Transitions attribute list from Btree to shortform as necessary.
+ */
+/*ARGSUSED*/
+int								/* error */
+xfs_attr_remove(bhv_desc_t *bdp, char *name, int flags, struct cred *cred)
+{
+	xfs_da_args_t	    args;
+	xfs_inode_t	    *dp;
+	xfs_fsblock_t	    firstblock;
+	xfs_bmap_free_t	    flist;
+	int		    error;
+	xfs_mount_t	    *mp;
+	int		    namelen;
+
+	ASSERT(MAXNAMELEN-1 <= 0xff);	/* length is stored in uint8 */
+	namelen = strlen(name);
+	if (namelen >= MAXNAMELEN)
+		return EFAULT;		/* match IRIX behaviour */
+
+	XFS_STATS_INC(xfsstats.xs_attr_remove);
+
+	/*
+	 * Do we answer them, or ignore them?
+	 */
+	dp = XFS_BHVTOI(bdp);
+	mp = dp->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return (EIO);
+
+	xfs_ilock(dp, XFS_ILOCK_SHARED);
+	if ((error = xfs_iaccess(dp, IWRITE, cred))) {
+		xfs_iunlock(dp, XFS_ILOCK_SHARED);
+		return(XFS_ERROR(error));
+	} else if (XFS_IFORK_Q(dp) == 0 ||
+		   (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+		    dp->i_d.di_anextents == 0)) {
+		xfs_iunlock(dp, XFS_ILOCK_SHARED);
+		return(XFS_ERROR(ENOATTR));
+	}
+	xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	bzero((char *)&args, sizeof(args));
+	args.name = name;
+	args.namelen = namelen;
+	args.flags = flags;
+	args.hashval = xfs_da_hashname(args.name, args.namelen);
+	args.dp = dp;
+	args.firstblock = &firstblock;
+	args.flist = &flist;
+	args.total = 0;
+	args.whichfork = XFS_ATTR_FORK;
+
+	/*
+	 * Attach the dquots to the inode.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (XFS_NOT_DQATTACHED(mp, dp)) {
+			if ((error = xfs_qm_dqattach(dp, 0)))
+				return (error);
+		}
+	}
+	/*
+	 * Start our first transaction of the day.
+	 *
+	 * All future transactions during this code must be "chained" off
+	 * this one via the trans_dup() call.  All transactions will contain
+	 * the inode, and the inode will always be marked with trans_ihold().
+	 * Since the inode will be locked in all transactions, we must log
+	 * the inode in every transaction to let it float upward through
+	 * the log.
+	 */
+	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
+
+	/*
+	 * Root fork attributes can use reserved data blocks for this
+	 * operation if necessary
+	 */
+
+	if (flags & ATTR_ROOT)
+		args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+	if ((error = xfs_trans_reserve(args.trans,
+				      XFS_ATTRRM_SPACE_RES(mp),
+				      XFS_ATTRRM_LOG_RES(mp),
+				      0, XFS_TRANS_PERM_LOG_RES,
+				      XFS_ATTRRM_LOG_COUNT))) {
+		xfs_trans_cancel(args.trans, 0);
+		return(error);
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+	/*
+	 * No need to make quota reservations here. We expect to release some
+	 * blocks, not allocate, in the common case.
+	 */
+	xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(args.trans, dp);
+
+	/*
+	 * Decide on what work routines to call based on the attribute
+	 * fork format and size.
+	 */
+	if (XFS_IFORK_Q(dp) == 0 ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+		error = XFS_ERROR(ENOATTR);
+		goto out;
+	}
+	if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+		error = xfs_attr_shortform_remove(&args);
+		if (error) {
+			goto out;
+		}
+	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_removename(&args);
+	} else {
+		error = xfs_attr_node_removename(&args);
+	}
+	if (error) {
+		goto out;
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(args.trans);
+	}
+
+	/*
+	 * Commit the last in the sequence of transactions.
+	 */
+	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
+				 NULL);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * Hit the inode change time.
+	 */
+	if (!error && (flags & ATTR_KERNOTIME) == 0) {
+		xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
+	}
+
+	return(error);
+
+out:
+	if (args.trans)
+		xfs_trans_cancel(args.trans,
+			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return(error);
+}
+
+/*
+ * Generate a list of extended attribute names and optionally
+ * also value lengths.	A positive return value follows the XFS
+ * convention of being an error; a zero or negative return code
+ * is the (negated) length of the buffer returned, indicating
+ * success.
+ */
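+/*
+ * Usage sketch: with ATTR_KERNAMELS a caller treats a positive return
+ * as an errno and a return <= 0 as minus the number of bytes used; in
+ * the normal buffered case the attrlist_t header in "buffer" carries
+ * the entry count instead.
+ */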
+int
+xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags,
+		      attrlist_cursor_kern_t *cursor, struct cred *cred)
+{
+	xfs_attr_list_context_t context;
+	xfs_inode_t *dp;
+	int error;
+
+	XFS_STATS_INC(xfsstats.xs_attr_list);
+
+	/*
+	 * Validate the cursor.
+	 */
+	if (cursor->pad1 || cursor->pad2)
+		return(XFS_ERROR(EINVAL));
+	if ((cursor->initted == 0) &&
+	    (cursor->hashval || cursor->blkno || cursor->offset))
+		return(XFS_ERROR(EINVAL));
+
+	/*
+	 * Check for a properly aligned buffer.
+	 */
+	if (((long)buffer) & (sizeof(int)-1))
+		return(XFS_ERROR(EFAULT));
+	if (flags & ATTR_KERNOVAL)
+		bufsize = 0;
+
+	/*
+	 * Initialize the output buffer.
+	 */
+	context.dp = dp = XFS_BHVTOI(bdp);
+	context.cursor = cursor;
+	context.count = 0;
+	context.dupcnt = 0;
+	context.resynch = 1;
+	context.flags = flags;
+	if (!(flags & ATTR_KERNAMELS)) {
+		context.bufsize = (bufsize & ~(sizeof(int)-1));	 /* align */
+		context.firstu = context.bufsize;
+		context.alist = (attrlist_t *)buffer;
+		context.alist->al_count = 0;
+		context.alist->al_more = 0;
+		context.alist->al_offset[0] = context.bufsize;
+	} else {
+		context.bufsize = bufsize;
+		context.firstu = context.bufsize;
+		context.alist = (attrlist_t *)buffer;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return (EIO);
+	/*
+	 * Do they have permission?
+	 */
+	xfs_ilock(dp, XFS_ILOCK_SHARED);
+	if ((error = xfs_iaccess(dp, IREAD, cred))) {
+		xfs_iunlock(dp, XFS_ILOCK_SHARED);
+		return(XFS_ERROR(error));
+	}
+
+	/*
+	 * Decide on what work routines to call based on the attribute
+	 * fork format and size.
+	 */
+	xfs_attr_trace_l_c("syscall start", &context);
+	if (XFS_IFORK_Q(dp) == 0 ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+		error = 0;
+	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_attr_shortform_list(&context);
+	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_list(&context);
+	} else {
+		error = xfs_attr_node_list(&context);
+	}
+	xfs_iunlock(dp, XFS_ILOCK_SHARED);
+	xfs_attr_trace_l_c("syscall end", &context);
+
+	if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) {
+		ASSERT(error >= 0);
+	} else {	/* must return negated buffer size or the error */
+		if (context.count < 0)
+			error = XFS_ERROR(ERANGE);
+		else
+			error = -context.count;
+	}
+
+	return(error);
+}
+
+int								/* error */
+xfs_attr_inactive(xfs_inode_t *dp)
+{
+	xfs_trans_t *trans;
+	xfs_mount_t *mp;
+	int error;
+
+	mp = dp->i_mount;
+	ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
+
+	/* XXXsup - why on earth are we taking ILOCK_EXCL here??? */
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+	if ((XFS_IFORK_Q(dp) == 0) ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+		return(0);
+	}
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * Start our first transaction of the day.
+	 *
+	 * All future transactions during this code must be "chained" off
+	 * this one via the trans_dup() call.  All transactions will contain
+	 * the inode, and the inode will always be marked with trans_ihold().
+	 * Since the inode will be locked in all transactions, we must log
+	 * the inode in every transaction to let it float upward through
+	 * the log.
+	 */
+	trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
+	if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0,
+				      XFS_TRANS_PERM_LOG_RES,
+				      XFS_ATTRINVAL_LOG_COUNT))) {
+		xfs_trans_cancel(trans, 0);
+		return(error);
+	}
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * No need to make quota reservations here. We expect to release some
+	 * blocks, not allocate, in the common case.
+	 */
+	xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(trans, dp);
+
+	/*
+	 * Decide on what work routines to call based on the attribute
+	 * fork format and size.
+	 */
+	if ((XFS_IFORK_Q(dp) == 0) ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+		error = 0;
+		goto out;
+	}
+	error = xfs_attr_root_inactive(&trans, dp);
+	if (error)
+		goto out;
+	/*
+	 * Signal synchronous inactive transactions unless this is a
+	 * synchronous mount filesystem, in which case we know we're
+	 * here because we've been called out of xfs_inactive; the
+	 * last reference is gone and the unlink transaction has
+	 * already hit the disk, so async inactive transactions
+	 * are safe.
+	 */
+	if ((error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK,
+				(!(mp->m_flags & XFS_MOUNT_WSYNC)
+				 ? 1 : 0))))
+		goto out;
+
+	/*
+	 * Commit the last in the sequence of transactions.
+	 */
+	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+	error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES,
+				 NULL);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	return(error);
+
+out:
+	xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return(error);
+}
+
+
+
+/*========================================================================
+ * External routines when attribute list is inside the inode
+ *========================================================================*/
+
+/*
+ * Add a name to the shortform attribute list structure
+ * This is the external routine.
+ */
+STATIC int
+xfs_attr_shortform_addname(xfs_da_args_t *args)
+{
+	int newsize, retval;
+
+	retval = xfs_attr_shortform_lookup(args);
+	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+		return(retval);
+	} else if (retval == EEXIST) {
+		if (args->flags & ATTR_CREATE)
+			return(retval);
+		retval = xfs_attr_shortform_remove(args);
+		ASSERT(retval == 0);
+	}
+
+	newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
+	newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+	if ((newsize <= XFS_IFORK_ASIZE(args->dp)) &&
+	    (args->namelen < XFS_ATTR_SF_ENTSIZE_MAX) &&
+	    (args->valuelen < XFS_ATTR_SF_ENTSIZE_MAX)) {
+		retval = xfs_attr_shortform_add(args);
+		ASSERT(retval == 0);
+	} else {
+		return(XFS_ERROR(ENOSPC));
+	}
+	return(0);
+}
+
+
+/*========================================================================
+ * External routines when attribute list is one block
+ *========================================================================*/
+
+/*
+ * Add a name to the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value; we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+int
+xfs_attr_leaf_addname(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int retval, error, committed;
+
+	/*
+	 * Read the (only) block in the attribute list in.
+	 */
+	dp = args->dp;
+	args->blkno = 0;
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+
+	/*
+	 * Look up the given attribute in the leaf block.  Figure out if
+	 * the given flags produce an error or call for an atomic rename.
+	 */
+	retval = xfs_attr_leaf_lookup_int(bp, args);
+	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+		xfs_da_brelse(args->trans, bp);
+		return(retval);
+	} else if (retval == EEXIST) {
+		if (args->flags & ATTR_CREATE) {	/* pure create op */
+			xfs_da_brelse(args->trans, bp);
+			return(retval);
+		}
+		args->rename = 1;			/* an atomic rename */
+		args->blkno2 = args->blkno;		/* set 2nd entry info*/
+		args->index2 = args->index;
+		args->rmtblkno2 = args->rmtblkno;
+		args->rmtblkcnt2 = args->rmtblkcnt;
+	}
+
+	/*
+	 * Add the attribute to the leaf block, transitioning to a Btree
+	 * if required.
+	 */
+	retval = xfs_attr_leaf_add(bp, args);
+	xfs_da_buf_done(bp);
+	if (retval == ENOSPC) {
+		/*
+		 * Promote the attribute list to the Btree format, then
+		 * Commit that transaction so that the node_addname() call
+		 * can manage its own transactions.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_attr_leaf_to_node(args);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+
+		/*
+		 * Commit the current trans (including the inode) and start
+		 * a new one.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			return (error);
+
+		/*
+		 * Fob the whole rest of the problem off on the Btree code.
+		 */
+		error = xfs_attr_node_addname(args);
+		return(error);
+	}
+
+	/*
+	 * Commit the transaction that added the attr name so that
+	 * later routines can manage their own transactions.
+	 */
+	if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+		return (error);
+
+	/*
+	 * If there was an out-of-line value, allocate the blocks we
+	 * identified for its storage and copy the value.  This is done
+	 * after we create the attribute so that we don't overflow the
+	 * maximum size of a transaction and/or hit a deadlock.
+	 */
+	if (args->rmtblkno > 0) {
+		error = xfs_attr_rmtval_set(args);
+		if (error)
+			return(error);
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the
+	 * incomplete flags on the "new" and "old" attribute/value pairs
+	 * so that one disappears and one appears atomically.  Then we
+	 * must remove the "old" attribute/value pair.
+	 */
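+	/*
+	 * Sketch of the ordering: the "new" entry was added above with
+	 * its INCOMPLETE flag set, lookups skip INCOMPLETE entries,
+	 * and the flipflags call below exchanges the flags on "old"
+	 * and "new" in one transaction -- so a crash at any point
+	 * should leave at most one complete copy of the attribute
+	 * visible.
+	 */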
+	if (args->rename) {
+		/*
+		 * In a separate transaction, set the incomplete flag on the
+		 * "old" attr and clear the incomplete flag on the "new" attr.
+		 */
+		error = xfs_attr_leaf_flipflags(args);
+		if (error)
+			return(error);
+
+		/*
+		 * Dismantle the "old" attribute/value pair by removing
+		 * a "remote" value (if it exists).
+		 */
+		args->index = args->index2;
+		args->blkno = args->blkno2;
+		args->rmtblkno = args->rmtblkno2;
+		args->rmtblkcnt = args->rmtblkcnt2;
+		if (args->rmtblkno) {
+			error = xfs_attr_rmtval_remove(args);
+			if (error)
+				return(error);
+		}
+
+		/*
+		 * Read in the block containing the "old" attr, then
+		 * remove the "old" attr from that block (neat, huh!)
+		 */
+		error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
+						     &bp, XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		ASSERT(bp != NULL);
+		(void)xfs_attr_leaf_remove(bp, args);
+
+		/*
+		 * If the result is small enough, shrink it all into the inode.
+		 */
+		if (xfs_attr_shortform_allfit(bp, dp)) {
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_attr_leaf_to_shortform(bp, args);
+			/* bp is gone due to xfs_da_shrink_inode */
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				return(error);
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+		} else
+			xfs_da_buf_done(bp);
+
+		/*
+		 * Commit the remove and start the next trans in series.
+		 */
+		error = xfs_attr_rolltrans(&args->trans, dp);
+
+	} else if (args->rmtblkno > 0) {
+		/*
+		 * Added a "remote" value, just clear the incomplete flag.
+		 */
+		error = xfs_attr_leaf_clearflag(args);
+	}
+	return(error);
+}
+
+/*
+ * Remove a name from the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value; we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_removename(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int committed;
+	int error;
+
+	/*
+	 * Remove the attribute.
+	 */
+	dp = args->dp;
+	args->blkno = 0;
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+
+	ASSERT(bp != NULL);
+	error = xfs_attr_leaf_lookup_int(bp, args);
+	if (error == ENOATTR) {
+		xfs_da_brelse(args->trans, bp);
+		return(error);
+	}
+
+	(void)xfs_attr_leaf_remove(bp, args);
+
+	/*
+	 * If the result is small enough, shrink it all into the inode.
+	 */
+	if (xfs_attr_shortform_allfit(bp, dp)) {
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_attr_leaf_to_shortform(bp, args);
+		/* bp is gone due to xfs_da_shrink_inode */
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+	} else
+		xfs_da_buf_done(bp);
+	return(0);
+}
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ *
+ * This leaf block cannot have a "remote" value; we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+int
+xfs_attr_leaf_get(xfs_da_args_t *args)
+{
+	xfs_dabuf_t *bp;
+	int error;
+
+	args->blkno = 0;
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+
+	error = xfs_attr_leaf_lookup_int(bp, args);
+	if (error != EEXIST)  {
+		xfs_da_brelse(args->trans, bp);
+		return(error);
+	}
+	error = xfs_attr_leaf_getvalue(bp, args);
+	xfs_da_brelse(args->trans, bp);
+	if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
+		error = xfs_attr_rmtval_get(args);
+	}
+	return(error);
+}
+
+/*
+ * Copy out attribute entries for attr_list(), for leaf attribute lists.
+ */
+STATIC int
+xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+{
+	xfs_attr_leafblock_t *leaf;
+	int error;
+	xfs_dabuf_t *bp;
+
+	context->cursor->blkno = 0;
+	error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						!= XFS_ATTR_LEAF_MAGIC) {
+		xfs_da_brelse(NULL, bp);
+		return(XFS_ERROR(EFSCORRUPTED));
+	}
+
+	(void)xfs_attr_leaf_list_int(bp, context);
+	xfs_da_brelse(NULL, bp);
+	return(0);
+}
+
+
+/*========================================================================
+ * External routines when attribute list size > XFS_LBSIZE(mp).
+ *========================================================================*/
+
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ *
+ * "Remote" attribute values confuse the issue and atomic rename operations
+ * add a whole extra layer of confusion on top of that.
+ */
+STATIC int
+xfs_attr_node_addname(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_inode_t *dp;
+	xfs_mount_t *mp;
+	int committed, retval, error;
+
+	/*
+	 * Fill in bucket of arguments/results/context to carry around.
+	 */
+	dp = args->dp;
+	mp = dp->i_mount;
+restart:
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = mp;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name already exists, and get back a pointer
+	 * to where it should go.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error)
+		goto out;
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+		goto out;
+	} else if (retval == EEXIST) {
+		if (args->flags & ATTR_CREATE)
+			goto out;
+		args->rename = 1;			/* atomic rename op */
+		args->blkno2 = args->blkno;		/* set 2nd entry info*/
+		args->index2 = args->index;
+		args->rmtblkno2 = args->rmtblkno;
+		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtblkno = 0;
+		args->rmtblkcnt = 0;
+	}
+
+	retval = xfs_attr_leaf_add(blk->bp, state->args);
+	if (retval == ENOSPC) {
+		if (state->path.active == 1) {
+			/*
+			 * It's really a single leaf node, but it had
+			 * out-of-line values so it looked like it *might*
+			 * have been a b-tree.
+			 */
+			xfs_da_state_free(state);
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_attr_leaf_to_node(args);
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+
+			/*
+			 * Commit the node conversion and start the next
+			 * trans in the chain.
+			 */
+			if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+				goto out;
+
+			goto restart;
+		}
+
+		/*
+		 * Split as many Btree elements as required.
+		 * This code tracks the new and old attr's location
+		 * in the index/blkno/rmtblkno/rmtblkcnt fields and
+		 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_da_split(state);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+	} else {
+		/*
+		 * Addition succeeded, update Btree hashvals.
+		 */
+		xfs_da_fixhashpath(state, &state->path);
+	}
+
+	/*
+	 * Kill the state structure, we're done with it and need to
+	 * allow the buffers to come back later.
+	 */
+	xfs_da_state_free(state);
+	state = NULL;
+
+	/*
+	 * Commit the leaf addition or btree split and start the next
+	 * trans in the chain.
+	 */
+	if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+		goto out;
+
+	/*
+	 * If there was an out-of-line value, allocate the blocks we
+	 * identified for its storage and copy the value.  This is done
+	 * after we create the attribute so that we don't overflow the
+	 * maximum size of a transaction and/or hit a deadlock.
+	 */
+	if (args->rmtblkno > 0) {
+		error = xfs_attr_rmtval_set(args);
+		if (error)
+			return(error);
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the
+	 * incomplete flags on the "new" and "old" attribute/value pairs
+	 * so that one disappears and one appears atomically.  Then we
+	 * must remove the "old" attribute/value pair.
+	 */
+	if (args->rename) {
+		/*
+		 * In a separate transaction, set the incomplete flag on the
+		 * "old" attr and clear the incomplete flag on the "new" attr.
+		 */
+		error = xfs_attr_leaf_flipflags(args);
+		if (error)
+			goto out;
+
+		/*
+		 * Dismantle the "old" attribute/value pair by removing
+		 * a "remote" value (if it exists).
+		 */
+		args->index = args->index2;
+		args->blkno = args->blkno2;
+		args->rmtblkno = args->rmtblkno2;
+		args->rmtblkcnt = args->rmtblkcnt2;
+		if (args->rmtblkno) {
+			error = xfs_attr_rmtval_remove(args);
+			if (error)
+				return(error);
+		}
+
+		/*
+		 * Re-find the "old" attribute entry after any split ops.
+		 * The INCOMPLETE flag means that we will find the "old"
+		 * attr, not the "new" one.
+		 */
+		args->flags |= XFS_ATTR_INCOMPLETE;
+		state = xfs_da_state_alloc();
+		state->args = args;
+		state->mp = mp;
+		state->blocksize = state->mp->m_sb.sb_blocksize;
+		state->inleaf = 0;
+		error = xfs_da_node_lookup_int(state, &retval);
+		if (error)
+			goto out;
+
+		/*
+		 * Remove the name and update the hashvals in the tree.
+		 */
+		blk = &state->path.blk[ state->path.active-1 ];
+		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+		error = xfs_attr_leaf_remove(blk->bp, args);
+		xfs_da_fixhashpath(state, &state->path);
+
+		/*
+		 * Check to see if the tree needs to be collapsed.
+		 */
+		if (retval && (state->path.active > 1)) {
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_da_join(state);
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+		}
+
+		/*
+		 * Commit and start the next trans in the chain.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			goto out;
+
+	} else if (args->rmtblkno > 0) {
+		/*
+		 * Added a "remote" value, just clear the incomplete flag.
+		 */
+		error = xfs_attr_leaf_clearflag(args);
+		if (error)
+			goto out;
+	}
+	retval = error = 0;
+
+out:
+	if (state)
+		xfs_da_state_free(state);
+	if (error)
+		return(error);
+	return(retval);
+}
+
+/*
+ * Remove a name from a B-tree attribute list.
+ *
+ * This will involve walking down the Btree, and may involve joining
+ * leaf nodes and even joining intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_attr_node_removename(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int retval, error, committed;
+
+	/*
+	 * Tie a string around our finger to remind us where we are.
+	 */
+	dp = args->dp;
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error || (retval != EEXIST)) {
+		if (error == 0)
+			error = retval;
+		goto out;
+	}
+
+	/*
+	 * If there is an out-of-line value, de-allocate the blocks.
+	 * This is done before we remove the attribute so that we don't
+	 * overflow the maximum size of a transaction and/or hit a deadlock.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->bp != NULL);
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	if (args->rmtblkno > 0) {
+		/*
+		 * Fill in disk block numbers in the state structure
+		 * so that we can get the buffers back after we commit
+		 * several transactions in the following calls.
+		 */
+		error = xfs_attr_fillstate(state);
+		if (error)
+			goto out;
+
+		/*
+		 * Mark the attribute as INCOMPLETE, then bunmapi() the
+		 * remote value.
+		 */
+		error = xfs_attr_leaf_setflag(args);
+		if (error)
+			goto out;
+		error = xfs_attr_rmtval_remove(args);
+		if (error)
+			goto out;
+
+		/*
+		 * Refill the state structure with buffers, the prior calls
+		 * released our buffers.
+		 */
+		error = xfs_attr_refillstate(state);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * Remove the name and update the hashvals in the tree.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	retval = xfs_attr_leaf_remove(blk->bp, args);
+	xfs_da_fixhashpath(state, &state->path);
+
+	/*
+	 * Check to see if the tree needs to be collapsed.
+	 */
+	if (retval && (state->path.active > 1)) {
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_da_join(state);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+
+		/*
+		 * Commit the Btree join operation and start a new trans.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			goto out;
+	}
+
+	/*
+	 * If the result is small enough, push it all into the inode.
+	 */
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		/*
+		 * Have to get rid of the copy of this dabuf in the state.
+		 */
+		ASSERT(state->path.active == 1);
+		ASSERT(state->path.blk[0].bp);
+		xfs_da_buf_done(state->path.blk[0].bp);
+		state->path.blk[0].bp = NULL;
+
+		error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+						     XFS_ATTR_FORK);
+		if (error)
+			goto out;
+		ASSERT(INT_GET(((xfs_attr_leafblock_t *)
+				      bp->data)->hdr.info.magic, ARCH_CONVERT)
+						       == XFS_ATTR_LEAF_MAGIC);
+
+		if (xfs_attr_shortform_allfit(bp, dp)) {
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_attr_leaf_to_shortform(bp, args);
+			/* bp is gone due to xfs_da_shrink_inode */
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+		} else
+			xfs_da_brelse(args->trans, bp);
+	}
+	error = 0;
+
+out:
+	xfs_da_state_free(state);
+	return(error);
+}
+
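+/*
+ * A short sketch of how the next two routines pair up around a chain
+ * of transaction commits (this is the pattern used in the remove path
+ * above; error handling elided):
+ *
+ *	error = xfs_attr_fillstate(state);	stash blknos, drop bufs
+ *	... commit one or more transactions ...
+ *	error = xfs_attr_refillstate(state);	re-read bufs by blkno
+ */
+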
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits has released these buffers.
+ */
+STATIC int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+	xfs_da_state_path_t *path;
+	xfs_da_state_blk_t *blk;
+	int level;
+
+	/*
+	 * Roll down the "path" in the state structure, storing the on-disk
+	 * block number for those buffers in the "path".
+	 */
+	path = &state->path;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->bp) {
+			blk->disk_blkno = xfs_da_blkno(blk->bp);
+			xfs_da_buf_done(blk->bp);
+			blk->bp = NULL;
+		} else {
+			blk->disk_blkno = 0;
+		}
+	}
+
+	/*
+	 * Roll down the "altpath" in the state structure, storing the on-disk
+	 * block number for those buffers in the "altpath".
+	 */
+	path = &state->altpath;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->bp) {
+			blk->disk_blkno = xfs_da_blkno(blk->bp);
+			xfs_da_buf_done(blk->bp);
+			blk->bp = NULL;
+		} else {
+			blk->disk_blkno = 0;
+		}
+	}
+
+	return(0);
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits has released those
+ * buffers from our grip.
+ */
+STATIC int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+	xfs_da_state_path_t *path;
+	xfs_da_state_blk_t *blk;
+	int level, error;
+
+	/*
+	 * Roll down the "path" in the state structure, reattaching the
+	 * buffers using the on-disk block numbers stored in the "path".
+	 */
+	path = &state->path;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->disk_blkno) {
+			error = xfs_da_read_buf(state->args->trans,
+						state->args->dp,
+						blk->blkno, blk->disk_blkno,
+						&blk->bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+		} else {
+			blk->bp = NULL;
+		}
+	}
+
+	/*
+	 * Roll down the "altpath" in the state structure, reattaching the
+	 * buffers using the on-disk block numbers stored in the "altpath".
+	 */
+	path = &state->altpath;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->disk_blkno) {
+			error = xfs_da_read_buf(state->args->trans,
+						state->args->dp,
+						blk->blkno, blk->disk_blkno,
+						&blk->bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+		} else {
+			blk->bp = NULL;
+		}
+	}
+
+	return(0);
+}
+
+/*
+ * Look up a filename in a node attribute list.
+ *
+ * This routine gets called for any attribute fork that has more than one
+ * block, i.e. both true Btree attr lists and single-leaf-blocks with
+ * "remote" values taking up more blocks.
+ */
+int
+xfs_attr_node_get(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	int error, retval;
+	int i;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error) {
+		retval = error;
+	} else if (retval == EEXIST) {
+		blk = &state->path.blk[ state->path.active-1 ];
+		ASSERT(blk->bp != NULL);
+		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+		/*
+		 * Get the value, local or "remote"
+		 */
+		retval = xfs_attr_leaf_getvalue(blk->bp, args);
+		if (!retval && (args->rmtblkno > 0)
+		    && !(args->flags & ATTR_KERNOVAL)) {
+			retval = xfs_attr_rmtval_get(args);
+		}
+	}
+
+	/*
+	 * If not in a transaction, we have to release all the buffers.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+
+	xfs_da_state_free(state);
+	return(retval);
+}
+
+STATIC int							/* error */
+xfs_attr_node_list(xfs_attr_list_context_t *context)
+{
+	attrlist_cursor_kern_t *cursor;
+	xfs_attr_leafblock_t *leaf;
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	int error, i;
+	xfs_dabuf_t *bp;
+
+	cursor = context->cursor;
+	cursor->initted = 1;
+
+	/*
+	 * Do all sorts of validation on the passed-in cursor structure.
+	 * If anything is amiss, ignore the cursor and look up the hashval
+	 * starting from the btree root.
+	 */
+	bp = NULL;
+	if (cursor->blkno > 0) {
+		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+					      &bp, XFS_ATTR_FORK);
+		if ((error != 0) && (error != EFSCORRUPTED))
+			return(error);
+		if (bp) {
+			node = bp->data;
+			switch (INT_GET(node->hdr.info.magic, ARCH_CONVERT)) {
+			case XFS_DA_NODE_MAGIC:
+				xfs_attr_trace_l_cn("wrong blk", context, node);
+				xfs_da_brelse(NULL, bp);
+				bp = NULL;
+				break;
+			case XFS_ATTR_LEAF_MAGIC:
+				leaf = bp->data;
+				if (cursor->hashval >
+				    INT_GET(leaf->entries[
+					 INT_GET(leaf->hdr.count,
+						ARCH_CONVERT)-1].hashval,
+							ARCH_CONVERT)) {
+					xfs_attr_trace_l_cl("wrong blk",
+							   context, leaf);
+					xfs_da_brelse(NULL, bp);
+					bp = NULL;
+				} else if (cursor->hashval <=
+					     INT_GET(leaf->entries[0].hashval,
+							ARCH_CONVERT)) {
+					xfs_attr_trace_l_cl("maybe wrong blk",
+							   context, leaf);
+					xfs_da_brelse(NULL, bp);
+					bp = NULL;
+				}
+				break;
+			default:
+				xfs_attr_trace_l_c("wrong blk - ??", context);
+				xfs_da_brelse(NULL, bp);
+				bp = NULL;
+			}
+		}
+	}
+
+	/*
+	 * We did not find what we expected given the cursor's contents,
+	 * so we start from the top and work down based on the hash value.
+	 * Note that the start of a node block is the same as the start
+	 * of a leaf block.
+	 */
+	if (bp == NULL) {
+		cursor->blkno = 0;
+		for (;;) {
+			error = xfs_da_read_buf(NULL, context->dp,
+						      cursor->blkno, -1, &bp,
+						      XFS_ATTR_FORK);
+			if (error)
+				return(error);
+			if (bp == NULL)
+				return(XFS_ERROR(EFSCORRUPTED));
+			node = bp->data;
+			if (INT_GET(node->hdr.info.magic, ARCH_CONVERT)
+							== XFS_ATTR_LEAF_MAGIC)
+				break;
+			if (INT_GET(node->hdr.info.magic, ARCH_CONVERT)
+							!= XFS_DA_NODE_MAGIC) {
+				xfs_da_brelse(NULL, bp);
+				return(XFS_ERROR(EFSCORRUPTED));
+			}
+			btree = node->btree;
+			for (i = 0;
+				i < INT_GET(node->hdr.count, ARCH_CONVERT);
+								btree++, i++) {
+				if (cursor->hashval
+						<= INT_GET(btree->hashval,
+							    ARCH_CONVERT)) {
+					cursor->blkno = INT_GET(btree->before, ARCH_CONVERT);
+					xfs_attr_trace_l_cb("descending",
+							    context, btree);
+					break;
+				}
+			}
+			if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
+				xfs_da_brelse(NULL, bp);
+				return(0);
+			}
+			xfs_da_brelse(NULL, bp);
+		}
+	}
+	ASSERT(bp != NULL);
+
+	/*
+	 * Roll upward through the blocks, processing each leaf block in
+	 * order.  As long as there is space in the result buffer, keep
+	 * adding the information.
+	 */
+	for (;;) {
+		leaf = bp->data;
+		if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						!= XFS_ATTR_LEAF_MAGIC) {
+			xfs_da_brelse(NULL, bp);
+			return(XFS_ERROR(EFSCORRUPTED));
+		}
+		error = xfs_attr_leaf_list_int(bp, context);
+		if (error || (INT_ISZERO(leaf->hdr.info.forw, ARCH_CONVERT)))
+			break;	/* not really an error, buffer full or EOF */
+		cursor->blkno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT);
+		xfs_da_brelse(NULL, bp);
+		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+					      &bp, XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		if (bp == NULL)
+			return(XFS_ERROR(EFSCORRUPTED));
+	}
+	xfs_da_brelse(NULL, bp);
+	return(0);
+}
+
+
+/*========================================================================
+ * External routines for manipulating out-of-line attribute values.
+ *========================================================================*/
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ */
+STATIC int
+xfs_attr_rmtval_get(xfs_da_args_t *args)
+{
+	xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
+	xfs_mount_t *mp;
+	xfs_daddr_t dblkno;
+	xfs_caddr_t dst;
+	xfs_buf_t *bp;
+	int nmap, error, tmp, valuelen, blkcnt, i;
+	xfs_dablk_t lblkno;
+
+	ASSERT(!(args->flags & ATTR_KERNOVAL));
+
+	mp = args->dp->i_mount;
+	dst = args->value;
+	valuelen = args->valuelen;
+	lblkno = args->rmtblkno;
+	while (valuelen > 0) {
+		nmap = ATTR_RMTVALUE_MAPSIZE;
+		error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
+				  args->rmtblkcnt,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				  NULL, 0, map, &nmap, NULL);
+		if (error)
+			return(error);
+		ASSERT(nmap >= 1);
+
+		for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+			ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+			       (map[i].br_startblock != HOLESTARTBLOCK));
+			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+			blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
+					     blkcnt, XFS_BUF_LOCK, &bp);
+			if (error)
+				return(error);
+
+			tmp = (valuelen < XFS_BUF_SIZE(bp))
+				? valuelen : XFS_BUF_SIZE(bp);
+			xfs_biomove(bp, 0, tmp, dst, XFS_B_READ);
+			xfs_buf_relse(bp);
+			dst += tmp;
+			valuelen -= tmp;
+
+			lblkno += map[i].br_blockcount;
+		}
+	}
+	ASSERT(valuelen == 0);
+	return(0);
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+STATIC int
+xfs_attr_rmtval_set(xfs_da_args_t *args)
+{
+	xfs_mount_t *mp;
+	xfs_fileoff_t lfileoff;
+	xfs_inode_t *dp;
+	xfs_bmbt_irec_t map;
+	xfs_daddr_t dblkno;
+	xfs_caddr_t src;
+	xfs_buf_t *bp;
+	xfs_dablk_t lblkno;
+	int blkcnt, valuelen, nmap, error, tmp, committed;
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	src = args->value;
+
+	/*
+	 * Find a "hole" in the attribute address space large enough for
+	 * us to drop the new attribute's value into.
+	 */
+	blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+	lfileoff = 0;
+	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+						   XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+	args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+	args->rmtblkcnt = blkcnt;
+
+	/*
+	 * Roll through the "value", allocating blocks on disk as required.
+	 */
+	while (blkcnt > 0) {
+		/*
+		 * Allocate a single extent, up to the size of the value.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
+				  blkcnt,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
+							XFS_BMAPI_WRITE,
+				  args->firstblock, args->total, &map, &nmap,
+				  args->flist);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+
+		/*
+		 * Start the next trans in the chain.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			return (error);
+	}
+
+	/*
+	 * Roll through the "value", copying the attribute value to the
+	 * already-allocated blocks.  Blocks are written synchronously
+	 * so that we can know they are all on disk before we turn off
+	 * the INCOMPLETE flag.
+	 */
+	lblkno = args->rmtblkno;
+	valuelen = args->valuelen;
+	while (valuelen > 0) {
+		/*
+		 * Try to remember where we decided to put the value.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
+				  args->rmtblkcnt,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				  args->firstblock, 0, &map, &nmap, NULL);
+		if (error) {
+			return(error);
+		}
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
+							blkcnt, XFS_BUF_LOCK);
+		ASSERT(bp);
+		ASSERT(!XFS_BUF_GETERROR(bp));
+
+		tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
+							XFS_BUF_SIZE(bp);
+		xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE);
+		if (tmp < XFS_BUF_SIZE(bp))
+			xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
+		if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
+			return (error);
+		}
+		src += tmp;
+		valuelen -= tmp;
+
+		lblkno += map.br_blockcount;
+	}
+	ASSERT(valuelen == 0);
+	return(0);
+}
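+
+/*
+ * A note on the ordering above: the remote blocks are written
+ * synchronously before the caller turns off the INCOMPLETE flag, so
+ * a lookup never sees a partially written remote value.
+ */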
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+STATIC int
+xfs_attr_rmtval_remove(xfs_da_args_t *args)
+{
+	xfs_mount_t *mp;
+	xfs_bmbt_irec_t map;
+	xfs_buf_t *bp;
+	xfs_daddr_t dblkno;
+	xfs_dablk_t lblkno;
+	int valuelen, blkcnt, nmap, error, done, committed;
+
+	mp = args->dp->i_mount;
+
+	/*
+	 * Roll through the "value", invalidating the attribute value's
+	 * blocks.  Note that "valuelen" counts fsblocks here, not bytes.
+	 */
+	lblkno = args->rmtblkno;
+	valuelen = args->rmtblkcnt;
+	while (valuelen > 0) {
+		/*
+		 * Look up the blocks where we stored the value.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
+					args->rmtblkcnt,
+					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+					args->firstblock, 0, &map, &nmap,
+					args->flist);
+		if (error) {
+			return(error);
+		}
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		/*
+		 * If the "remote" value is in the cache, remove it.
+		 */
+		bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, 1);
+		if (bp) {
+			XFS_BUF_STALE(bp);
+			XFS_BUF_UNDELAYWRITE(bp);
+			xfs_buf_relse(bp);
+			bp = NULL;
+		}
+
+		valuelen -= map.br_blockcount;
+
+		lblkno += map.br_blockcount;
+	}
+
+	/*
+	 * Keep de-allocating extents until the remote-value region is gone.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	done = 0;
+	while (!done) {
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
+				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				    1, args->firstblock, args->flist, &done);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, args->dp);
+		}
+
+		/*
+		 * Close out trans and start the next one in the chain.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, args->dp)))
+			return (error);
+	}
+	return(0);
+}
+
+#if defined(XFS_ATTR_TRACE)
+/*
+ * Add a trace buffer entry for an attr_list context structure.
+ */
+void
+xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context)
+{
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where,
+		(__psunsigned_t)context->dp,
+		(__psunsigned_t)context->cursor->hashval,
+		(__psunsigned_t)context->cursor->blkno,
+		(__psunsigned_t)context->cursor->offset,
+		(__psunsigned_t)context->alist,
+		(__psunsigned_t)context->bufsize,
+		(__psunsigned_t)context->count,
+		(__psunsigned_t)context->firstu,
+		(__psunsigned_t)
+			(context->count > 0)
+				? (ATTR_ENTRY(context->alist,
+					      context->count-1)->a_valuelen)
+				: 0,
+		(__psunsigned_t)context->dupcnt,
+		(__psunsigned_t)context->flags,
+		(__psunsigned_t)NULL,
+		(__psunsigned_t)NULL,
+		(__psunsigned_t)NULL);
+}
+
+/*
+ * Add a trace buffer entry for a context structure and a Btree node.
+ */
+void
+xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
+			 struct xfs_da_intnode *node)
+{
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where,
+		(__psunsigned_t)context->dp,
+		(__psunsigned_t)context->cursor->hashval,
+		(__psunsigned_t)context->cursor->blkno,
+		(__psunsigned_t)context->cursor->offset,
+		(__psunsigned_t)context->alist,
+		(__psunsigned_t)context->bufsize,
+		(__psunsigned_t)context->count,
+		(__psunsigned_t)context->firstu,
+		(__psunsigned_t)
+			(context->count > 0)
+				? (ATTR_ENTRY(context->alist,
+					      context->count-1)->a_valuelen)
+				: 0,
+		(__psunsigned_t)context->dupcnt,
+		(__psunsigned_t)context->flags,
+		(__psunsigned_t)INT_GET(node->hdr.count, ARCH_CONVERT),
+		(__psunsigned_t)INT_GET(node->btree[0].hashval, ARCH_CONVERT),
+		(__psunsigned_t)INT_GET(node->btree[INT_GET(node->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
+}
+
+/*
+ * Add a trace buffer entry for a context structure and a Btree element.
+ */
+void
+xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
+			  struct xfs_da_node_entry *btree)
+{
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where,
+		(__psunsigned_t)context->dp,
+		(__psunsigned_t)context->cursor->hashval,
+		(__psunsigned_t)context->cursor->blkno,
+		(__psunsigned_t)context->cursor->offset,
+		(__psunsigned_t)context->alist,
+		(__psunsigned_t)context->bufsize,
+		(__psunsigned_t)context->count,
+		(__psunsigned_t)context->firstu,
+		(__psunsigned_t)
+			(context->count > 0)
+				? (ATTR_ENTRY(context->alist,
+					      context->count-1)->a_valuelen)
+				: 0,
+		(__psunsigned_t)context->dupcnt,
+		(__psunsigned_t)context->flags,
+		(__psunsigned_t)INT_GET(btree->hashval, ARCH_CONVERT),
+		(__psunsigned_t)INT_GET(btree->before, ARCH_CONVERT),
+		(__psunsigned_t)NULL);
+}
+
+/*
+ * Add a trace buffer entry for a context structure and a leaf block.
+ */
+void
+xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
+			      struct xfs_attr_leafblock *leaf)
+{
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where,
+		(__psunsigned_t)context->dp,
+		(__psunsigned_t)context->cursor->hashval,
+		(__psunsigned_t)context->cursor->blkno,
+		(__psunsigned_t)context->cursor->offset,
+		(__psunsigned_t)context->alist,
+		(__psunsigned_t)context->bufsize,
+		(__psunsigned_t)context->count,
+		(__psunsigned_t)context->firstu,
+		(__psunsigned_t)
+			(context->count > 0)
+				? (ATTR_ENTRY(context->alist,
+					      context->count-1)->a_valuelen)
+				: 0,
+		(__psunsigned_t)context->dupcnt,
+		(__psunsigned_t)context->flags,
+		(__psunsigned_t)INT_GET(leaf->hdr.count, ARCH_CONVERT),
+		(__psunsigned_t)INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
+		(__psunsigned_t)INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
+}
+
+/*
+ * Add a trace buffer entry for the arguments given to the routine,
+ * generic form.
+ */
+void
+xfs_attr_trace_enter(int type, char *where,
+			 __psunsigned_t a2, __psunsigned_t a3,
+			 __psunsigned_t a4, __psunsigned_t a5,
+			 __psunsigned_t a6, __psunsigned_t a7,
+			 __psunsigned_t a8, __psunsigned_t a9,
+			 __psunsigned_t a10, __psunsigned_t a11,
+			 __psunsigned_t a12, __psunsigned_t a13,
+			 __psunsigned_t a14, __psunsigned_t a15)
+{
+	ASSERT(xfs_attr_trace_buf);
+	ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type),
+					 (void *)where,
+					 (void *)a2,  (void *)a3,  (void *)a4,
+					 (void *)a5,  (void *)a6,  (void *)a7,
+					 (void *)a8,  (void *)a9,  (void *)a10,
+					 (void *)a11, (void *)a12, (void *)a13,
+					 (void *)a14, (void *)a15);
+}
+#endif	/* XFS_ATTR_TRACE */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ATTR_H__
+#define __XFS_ATTR_H__
+
+/*
+ * xfs_attr.h
+ *
+ * Large attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.
+ * The internal links in the Btree are logical block offsets into the file.
+ *
+ * Small attribute lists use a different format and are packed as tightly
+ * as possible so as to fit into the literal area of the inode.
+ */
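+
+/*
+ * A hypothetical illustration of the duplicate-key case above (names
+ * and hashval are made up):
+ *
+ *	"name_a" -> hashval 0x1234abcd
+ *	"name_b" -> hashval 0x1234abcd
+ *
+ * Both land at the same hashval in the Btree, so a lookup walks all
+ * entries with that hashval and compares full names, and the listing
+ * code keeps an offset within the run of equal hashvals (see the
+ * attrlist cursor below).
+ */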
+
+#ifdef XFS_ALL_TRACE
+#define XFS_ATTR_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_ATTR_TRACE
+#endif
+
+
+/*========================================================================
+ * External interfaces
+ *========================================================================*/
+
+#define ATTR_ROOT	0x0002	/* use attrs in root namespace, not user */
+#define ATTR_CREATE	0x0010	/* pure create: fail if attr already exists */
+#define ATTR_REPLACE	0x0020	/* pure set: fail if attr does not exist */
+#define ATTR_KERNOTIME	0x1000	/* [kernel] don't update inode timestamps */
+#define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
+#define ATTR_KERNAMELS	0x4000	/* [kernel] list attr names (simple list) */
+#define ATTR_KERNFULLS	0x8000	/* [kernel] full attr list, ie. root+user */
+
+/*
+ * The maximum size (into the kernel or returned from the kernel) of an
+ * attribute value or the buffer used for an attr_list() call.	Larger
+ * sizes will result in an E2BIG return code.
+ */
+#define ATTR_MAX_VALUELEN	(64*1024)	/* max length of a value */
+
+/*
+ * Define how lists of attribute names are returned to the user from
+ * the attr_list() call.  A large, 32bit aligned, buffer is passed in
+ * along with its size.	 We put an array of offsets at the top that each
+ * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
+ */
+typedef struct attrlist {
+	__s32	al_count;	/* number of entries in attrlist */
+	__s32	al_more;	/* T/F: more attrs (do call again) */
+	__s32	al_offset[1];	/* byte offsets of attrs [var-sized] */
+} attrlist_t;
+
+/*
+ * Show the interesting info about one attribute.  This is what the
+ * al_offset[i] entry points to.
+ */
+typedef struct attrlist_ent {	/* data from attr_list() */
+	__u32	a_valuelen;	/* number bytes in value of attr */
+	char	a_name[1];	/* attr name (NULL terminated) */
+} attrlist_ent_t;
+
+/*
+ * Given a pointer to the (char*) buffer containing the attr_list() result,
+ * and an index, return a pointer to the indicated attribute in the buffer.
+ */
+#define ATTR_ENTRY(buffer, index)		\
+	((attrlist_ent_t *)			\
+	 &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
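+
+/*
+ * An illustrative sketch (not part of this interface) of walking a
+ * buffer that attr_list() has filled in; "buf" is a hypothetical,
+ * 32bit aligned buffer and do_something() is a placeholder:
+ *
+ *	attrlist_t *alist = (attrlist_t *)buf;
+ *	int i;
+ *
+ *	for (i = 0; i < alist->al_count; i++) {
+ *		attrlist_ent_t *aep = ATTR_ENTRY(buf, i);
+ *		do_something(aep->a_name, aep->a_valuelen);
+ *	}
+ *
+ * If alist->al_more is set, there are more attrs and the caller
+ * should come back with the same cursor (see below).
+ */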
+
+/*
+ * Multi-attribute operation vector.
+ */
+typedef struct attr_multiop {
+	int	am_opcode;	/* operation to perform (ATTR_OP_GET, etc.) */
+	int	am_error;	/* [out arg] result of this sub-op (an errno) */
+	char	*am_attrname;	/* attribute name to work with */
+	char	*am_attrvalue;	/* [in/out arg] attribute value (raw bytes) */
+	int	am_length;	/* [in/out arg] length of value */
+	int	am_flags;	/* bitwise OR of attr API flags defined above */
+} attr_multiop_t;
+
+#define ATTR_OP_GET	1	/* return the indicated attr's value */
+#define ATTR_OP_SET	2	/* set/create the indicated attr/value pair */
+#define ATTR_OP_REMOVE	3	/* remove the indicated attr */
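+
+/*
+ * An illustrative sketch (values are hypothetical) of filling in a
+ * multi-op vector; each sub-op reports its own errno in am_error:
+ *
+ *	attr_multiop_t ops[2];
+ *
+ *	ops[0].am_opcode = ATTR_OP_GET;
+ *	ops[0].am_attrname = "name1";
+ *	ops[0].am_attrvalue = valuebuf;
+ *	ops[0].am_length = sizeof(valuebuf);
+ *	ops[0].am_flags = 0;
+ *
+ *	ops[1].am_opcode = ATTR_OP_REMOVE;
+ *	ops[1].am_attrname = "name2";
+ *	ops[1].am_attrvalue = NULL;
+ *	ops[1].am_length = 0;
+ *	ops[1].am_flags = ATTR_ROOT;
+ */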
+
+/*
+ * Kernel-internal version of the attrlist cursor.
+ */
+typedef struct attrlist_cursor_kern {
+	__u32	hashval;	/* hash value of next entry to add */
+	__u32	blkno;		/* block containing entry (suggestion) */
+	__u32	offset;		/* offset in list of equal-hashvals */
+	__u16	pad1;		/* padding to match user-level */
+	__u8	pad2;		/* padding to match user-level */
+	__u8	initted;	/* T/F: cursor has been initialized */
+} attrlist_cursor_kern_t;
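+
+/*
+ * An illustrative sketch (not part of this interface) of resuming a
+ * listing with the cursor: start it zeroed and keep calling while
+ * al_more is set.  On each call the attr code validates the cursor
+ * and, if it looks stale, restarts the walk from the btree root:
+ *
+ *	attrlist_cursor_kern_t cursor;
+ *
+ *	bzero((char *)&cursor, sizeof(cursor));
+ *	do {
+ *		error = xfs_attr_list(bdp, buf, bufsize, flags,
+ *					&cursor, cred);
+ *	} while (!error && ((attrlist_t *)buf)->al_more);
+ */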
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+struct cred;
+struct vnode;
+struct xfs_inode;
+struct attrlist_cursor_kern;
+struct xfs_ext_attr;
+struct xfs_da_args;
+
+/*
+ * Overall external interface routines.
+ */
+int xfs_attr_get(bhv_desc_t *, char *, char *, int *, int, struct cred *);
+int xfs_attr_set(bhv_desc_t *, char *, char *, int, int, struct cred *);
+int xfs_attr_remove(bhv_desc_t *, char *, int, struct cred *);
+int xfs_attr_list(bhv_desc_t *, char *, int, int,
+			 struct attrlist_cursor_kern *, struct cred *);
+int xfs_attr_inactive(struct xfs_inode *dp);
+
+int xfs_attr_node_get(struct xfs_da_args *);
+int xfs_attr_leaf_get(struct xfs_da_args *);
+int xfs_attr_shortform_getvalue(struct xfs_da_args *);
+int xfs_attr_fetch(struct xfs_inode *, char *, char *, int);
+
+#endif	/* __XFS_ATTR_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr_fetch.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr_fetch.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr_fetch.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr_fetch.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+int
+xfs_attr_fetch(xfs_inode_t *ip, char *name, char *value, int valuelen)
+{
+	xfs_da_args_t args;
+	int error;
+
+	if (XFS_IFORK_Q(ip) == 0)
+		return ENOATTR;
+	/*
+	 * Do the argument setup for the xfs_attr routines.
+	 */
+	bzero((char *)&args, sizeof(args));
+	args.dp = ip;
+	args.flags = ATTR_ROOT;
+	args.whichfork = XFS_ATTR_FORK;
+	args.name = name;
+	args.namelen = strlen(name);
+	args.value = value;
+	args.valuelen = valuelen;
+	args.hashval = xfs_da_hashname(args.name, args.namelen);
+	args.oknoent = 1;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (args.dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+		error = xfs_attr_shortform_getvalue(&args);
+	else if (xfs_bmap_one_block(args.dp, XFS_ATTR_FORK))
+		error = xfs_attr_leaf_get(&args);
+	else
+		error = xfs_attr_node_get(&args);
+
+	if (error == EEXIST)
+		error = 0;
+
+	return(error);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr_leaf.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr_leaf.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr_leaf.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr_leaf.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,2971 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+/*
+ * xfs_attr_leaf.c
+ *
+ * GROT: figure out how to recover gracefully when bmap returns ENOSPC.
+ */
+
+#include <xfs.h>
+
+/*
+ * Routines to implement leaf blocks of attributes as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
+					      int freemap_index);
+STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer);
+STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
+						   xfs_da_state_blk_t *blk1,
+						   xfs_da_state_blk_t *blk2);
+STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
+					   xfs_da_state_blk_t *leaf_blk_1,
+					   xfs_da_state_blk_t *leaf_blk_2,
+					   int *number_entries_in_blk1,
+					   int *number_usedbytes_in_blk1);
+
+/*
+ * Utility routines.
+ */
+STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
+					 int src_start,
+					 xfs_attr_leafblock_t *dst_leaf,
+					 int dst_start, int move_count,
+					 xfs_mount_t *mp);
+
+
+/*========================================================================
+ * External routines when dirsize < XFS_LITINO(mp).
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a shortform attribute list.
+ */
+int
+xfs_attr_shortform_create(xfs_da_args_t *args)
+{
+	xfs_attr_sf_hdr_t *hdr;
+	xfs_inode_t *dp;
+	xfs_ifork_t *ifp;
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	ifp = dp->i_afp;
+	ASSERT(ifp != NULL);
+	ASSERT(ifp->if_bytes == 0);
+	if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
+		ifp->if_flags &= ~XFS_IFEXTENTS;	/* just in case */
+		dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
+		ifp->if_flags |= XFS_IFINLINE;
+	} else {
+		ASSERT(ifp->if_flags & XFS_IFINLINE);
+	}
+	xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
+	hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
+	INT_ZERO(hdr->count, ARCH_CONVERT);
+	INT_SET(hdr->totsize, ARCH_CONVERT, sizeof(*hdr));
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+	return(0);
+}
+
+/*
+ * Add a name/value pair to the shortform attribute list.
+ * Overflow from the inode has already been checked for.
+ */
+int
+xfs_attr_shortform_add(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i, offset, size;
+	xfs_inode_t *dp;
+	xfs_ifork_t *ifp;
+
+	dp = args->dp;
+	ifp = dp->i_afp;
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (bcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (((args->flags & ATTR_ROOT) != 0) !=
+		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+			continue;
+		return(XFS_ERROR(EEXIST));
+	}
+
+	offset = (char *)sfe - (char *)sf;
+	size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+	xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
+
+	sfe->namelen = args->namelen;
+	INT_SET(sfe->valuelen, ARCH_CONVERT, args->valuelen);
+	sfe->flags = (args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0;
+	bcopy(args->name, sfe->nameval, args->namelen);
+	bcopy(args->value, &sfe->nameval[args->namelen], args->valuelen);
+	INT_MOD(sf->hdr.count, ARCH_CONVERT, 1);
+	INT_MOD(sf->hdr.totsize, ARCH_CONVERT, size);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+	return(0);
+}
+
+/*
+ * Remove a name from the shortform attribute list structure.
+ */
+int
+xfs_attr_shortform_remove(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int base, size=0, end, totsize, i;
+	xfs_inode_t *dp;
+
+	/*
+	 * Remove the attribute.
+	 */
+	dp = args->dp;
+	base = sizeof(xfs_attr_sf_hdr_t);
+	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+					base += size, i++) {
+		size = XFS_ATTR_SF_ENTSIZE(sfe);
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (bcmp(sfe->nameval, args->name, args->namelen) != 0)
+			continue;
+		if (((args->flags & ATTR_ROOT) != 0) !=
+		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+			continue;
+		break;
+	}
+	if (i == INT_GET(sf->hdr.count, ARCH_CONVERT))
+		return(XFS_ERROR(ENOATTR));
+
+	end = base + size;
+	totsize = INT_GET(sf->hdr.totsize, ARCH_CONVERT);
+	if (end != totsize) {
+		ovbcopy(&((char *)sf)[end], &((char *)sf)[base],
+							totsize - end);
+	}
+	INT_MOD(sf->hdr.count, ARCH_CONVERT, -1);
+	INT_MOD(sf->hdr.totsize, ARCH_CONVERT, -size);
+	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+	return(0);
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_lookup(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i;
+	xfs_ifork_t *ifp;
+
+	ifp = args->dp->i_afp;
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (bcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (((args->flags & ATTR_ROOT) != 0) !=
+		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+			continue;
+		return(XFS_ERROR(EEXIST));
+	}
+	return(XFS_ERROR(ENOATTR));
+}
+
+/*
+ * Look up a name in a shortform attribute list structure and return
+ * the value associated with it.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i;
+
+	ASSERT(args->dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL);
+	sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (bcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (((args->flags & ATTR_ROOT) != 0) !=
+		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+			continue;
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
+			return(XFS_ERROR(EEXIST));
+		}
+		if (args->valuelen < INT_GET(sfe->valuelen, ARCH_CONVERT)) {
+			args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
+			return(XFS_ERROR(ERANGE));
+		}
+		args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
+		bcopy(&sfe->nameval[args->namelen], args->value,
+						    args->valuelen);
+		return(XFS_ERROR(EEXIST));
+	}
+	return(XFS_ERROR(ENOATTR));
+}
+
+/*
+ * Convert from using the shortform to the leaf.
+ */
+int
+xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	xfs_da_args_t nargs;
+	char *tmpbuffer;
+	int error, i, size;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+	xfs_ifork_t *ifp;
+
+	dp = args->dp;
+	ifp = dp->i_afp;
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	size = INT_GET(sf->hdr.totsize, ARCH_CONVERT);
+	tmpbuffer = kmem_alloc(size, KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+	bcopy(ifp->if_u1.if_data, tmpbuffer, size);
+	sf = (xfs_attr_shortform_t *)tmpbuffer;
+
+	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+	bp = NULL;
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error) {
+		/*
+		 * If we hit an IO error in the middle of the transaction
+		 * inside grow_inode(), we may have inconsistent data. Bail out.
+		 */
+		if (error == EIO)
+			goto out;
+		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
+		bcopy(tmpbuffer, ifp->if_u1.if_data, size);	/* it back */
+		goto out;
+	}
+
+	ASSERT(blkno == 0);
+	error = xfs_attr_leaf_create(args, blkno, &bp);
+	if (error) {
+		error = xfs_da_shrink_inode(args, 0, bp);
+		bp = NULL;
+		if (error)
+			goto out;
+		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
+		bcopy(tmpbuffer, ifp->if_u1.if_data, size);	/* it back */
+		goto out;
+	}
+
+	bzero((char *)&nargs, sizeof(nargs));
+	nargs.dp = dp;
+	nargs.firstblock = args->firstblock;
+	nargs.flist = args->flist;
+	nargs.total = args->total;
+	nargs.whichfork = XFS_ATTR_FORK;
+	nargs.trans = args->trans;
+	nargs.oknoent = 1;
+
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
+		nargs.name = (char *)sfe->nameval;
+		nargs.namelen = sfe->namelen;
+		nargs.value = (char *)&sfe->nameval[nargs.namelen];
+		nargs.valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
+		nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
+						sfe->namelen);
+		nargs.flags = (sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0;
+		error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
+		ASSERT(error == ENOATTR);
+		error = xfs_attr_leaf_add(bp, &nargs);
+		ASSERT(error != ENOSPC);
+		if (error)
+			goto out;
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+	}
+	error = 0;
+
+out:
+	if (bp)
+		xfs_da_buf_done(bp);
+	kmem_free(tmpbuffer, size);
+	return(error);
+}
+
+STATIC int
+xfs_attr_shortform_compare(const void *a, const void *b)
+{
+	xfs_attr_sf_sort_t *sa, *sb;
+
+	sa = (xfs_attr_sf_sort_t *)a;
+	sb = (xfs_attr_sf_sort_t *)b;
+	if (INT_GET(sa->hash, ARCH_CONVERT)
+				< INT_GET(sb->hash, ARCH_CONVERT)) {
+		return(-1);
+	} else if (INT_GET(sa->hash, ARCH_CONVERT)
+				> INT_GET(sb->hash, ARCH_CONVERT)) {
+		return(1);
+	} else {
+		return(sa->entno - sb->entno);
+	}
+}
+
+/*
+ * Copy out entries of shortform attribute lists for attr_list().
+ * Shortform attribute lists are not stored in hashval sorted order.
+ * If the output buffer is not large enough to hold them all, then
+ * we have to calculate each entry's hashval and sort them before
+ * we can begin returning them to the user.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_list(xfs_attr_list_context_t *context)
+{
+	attrlist_cursor_kern_t *cursor;
+	xfs_attr_sf_sort_t *sbuf, *sbp;
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	xfs_inode_t *dp;
+	int sbsize, nsbuf, count, i;
+
+	ASSERT(context != NULL);
+	dp = context->dp;
+	ASSERT(dp != NULL);
+	ASSERT(dp->i_afp != NULL);
+	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+	ASSERT(sf != NULL);
+	if (INT_ISZERO(sf->hdr.count, ARCH_CONVERT))
+		return(0);
+	cursor = context->cursor;
+	ASSERT(cursor != NULL);
+
+	xfs_attr_trace_l_c("sf start", context);
+
+	/*
+	 * If the buffer is large enough, do not bother with sorting.
+	 * Note the generous fudge factor of 16 overhead bytes per entry.
+	 */
+	if ((dp->i_afp->if_bytes + INT_GET(sf->hdr.count, ARCH_CONVERT) * 16)
+							< context->bufsize) {
+		for (i = 0, sfe = &sf->list[0];
+				i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
+			int ns = (sfe->flags & XFS_ATTR_ROOT)?
+						ROOT_NAMES : USER_NAMES;
+			if (((context->flags & ATTR_ROOT) != 0) !=
+			    ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
+			    !(context->flags & ATTR_KERNFULLS)) {
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+				continue;
+			}
+			if (context->flags & ATTR_KERNOVAL) {
+				ASSERT(context->flags & ATTR_KERNAMELS);
+				context->count += xfs_namespaces[ns].namelen +
+					INT_GET(sfe->namelen, ARCH_CONVERT) + 1;
+			} else {
+				if (xfs_attr_put_listent(context, ns,
+						   (char *)sfe->nameval,
+						   (int)sfe->namelen,
+						   (int)INT_GET(sfe->valuelen,
+								ARCH_CONVERT)))
+					break;
+			}
+			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+		}
+		xfs_attr_trace_l_c("sf big-gulp", context);
+		return(0);
+	}
+
+	/*
+	 * It didn't all fit, so we have to sort everything on hashval.
+	 */
+	sbsize = INT_GET(sf->hdr.count, ARCH_CONVERT) * sizeof(*sbuf);
+	sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
+
+	/*
+	 * Scan the attribute list for the rest of the entries, storing
+	 * the relevant info from only those that match into a buffer.
+	 */
+	nsbuf = 0;
+	for (i = 0, sfe = &sf->list[0];
+			i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
+		if (((char *)sfe < (char *)sf) ||
+		    ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)) ||
+		    (sfe->namelen >= MAXNAMELEN)) {
+			xfs_attr_trace_l_c("sf corrupted", context);
+			kmem_free(sbuf, sbsize);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		if (((context->flags & ATTR_ROOT) != 0) !=
+		    ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
+		    !(context->flags & ATTR_KERNFULLS)) {
+			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+			continue;
+		}
+		sbp->entno = i;
+		INT_SET(sbp->hash, ARCH_CONVERT,
+			xfs_da_hashname((char *)sfe->nameval, sfe->namelen));
+		sbp->name = (char *)sfe->nameval;
+		sbp->namelen = sfe->namelen;
+		/* These are bytes, and both on-disk, don't endian-flip */
+		sbp->valuelen = sfe->valuelen;
+		sbp->flags = sfe->flags;
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+		sbp++;
+		nsbuf++;
+	}
+
+	/*
+	 * Sort the entries on hash then entno.
+	 */
+	qsort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
+
+	/*
+	 * Re-find our place IN THE SORTED LIST.
+	 */
+	count = 0;
+	cursor->initted = 1;
+	cursor->blkno = 0;
+	for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
+		if (INT_GET(sbp->hash, ARCH_CONVERT) == cursor->hashval) {
+			if (cursor->offset == count) {
+				break;
+			}
+			count++;
+		} else if (INT_GET(sbp->hash, ARCH_CONVERT) > cursor->hashval) {
+			break;
+		}
+	}
+	if (i == nsbuf) {
+		kmem_free(sbuf, sbsize);
+		xfs_attr_trace_l_c("blk end", context);
+		return(0);
+	}
+
+	/*
+	 * Loop putting entries into the user buffer.
+	 */
+	for ( ; i < nsbuf; i++, sbp++) {
+		int ns = (sbp->flags & XFS_ATTR_ROOT)? ROOT_NAMES:USER_NAMES;
+		if (cursor->hashval != INT_GET(sbp->hash, ARCH_CONVERT)) {
+			cursor->hashval = INT_GET(sbp->hash, ARCH_CONVERT);
+			cursor->offset = 0;
+		}
+		if (context->flags & ATTR_KERNOVAL) {
+			ASSERT(context->flags & ATTR_KERNAMELS);
+			context->count += xfs_namespaces[ns].namelen
+					+ sbp->namelen + 1;
+		} else {
+			if (xfs_attr_put_listent(context, ns,
+					sbp->name, sbp->namelen,
+					INT_GET(sbp->valuelen, ARCH_CONVERT)))
+				break;
+		}
+		cursor->offset++;
+	}
+
+	kmem_free(sbuf, sbsize);
+	xfs_attr_trace_l_c("sf E-O-F", context);
+	return(0);
+}
+
+/*
+ * Check a leaf attribute block to see if all the entries would fit into
+ * a shortform attribute list.
+ */
+int
+xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	int bytes, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+
+	entry = &leaf->entries[0];
+	bytes = sizeof(struct xfs_attr_sf_hdr);
+	for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;		/* don't copy partial entries */
+		if (!(entry->flags & XFS_ATTR_LOCAL))
+			return(0);
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+		if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+			return(0);
+		if (INT_GET(name_loc->valuelen, ARCH_CONVERT) >= XFS_ATTR_SF_ENTSIZE_MAX)
+			return(0);
+		bytes += sizeof(struct xfs_attr_sf_entry)-1
+				+ name_loc->namelen
+				+ INT_GET(name_loc->valuelen, ARCH_CONVERT);
+	}
+	return( bytes < XFS_IFORK_ASIZE(dp) );
+}
+
+/*
+ * Convert a leaf attribute list to a shortform attribute list.
+ */
+int
+xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_da_args_t nargs;
+	xfs_inode_t *dp;
+	char *tmpbuffer;
+	int error, i;
+
+	dp = args->dp;
+	tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+
+	ASSERT(bp != NULL);
+	bcopy(bp->data, tmpbuffer, XFS_LBSIZE(dp->i_mount));
+	leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	bzero(bp->data, XFS_LBSIZE(dp->i_mount));
+
+	/*
+	 * Clean out the prior contents of the attribute list.
+	 */
+	error = xfs_da_shrink_inode(args, 0, bp);
+	if (error)
+		goto out;
+	error = xfs_attr_shortform_create(args);
+	if (error)
+		goto out;
+
+	/*
+	 * Copy the attributes
+	 */
+	bzero((char *)&nargs, sizeof(nargs));
+	nargs.dp = dp;
+	nargs.firstblock = args->firstblock;
+	nargs.flist = args->flist;
+	nargs.total = args->total;
+	nargs.whichfork = XFS_ATTR_FORK;
+	nargs.trans = args->trans;
+	nargs.oknoent = 1;
+	entry = &leaf->entries[0];
+	for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;	/* don't copy partial entries */
+		if (INT_ISZERO(entry->nameidx, ARCH_CONVERT))
+			continue;
+		ASSERT(entry->flags & XFS_ATTR_LOCAL);
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+		nargs.name = (char *)name_loc->nameval;
+		nargs.namelen = name_loc->namelen;
+		nargs.value = (char *)&name_loc->nameval[nargs.namelen];
+		nargs.valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT);
+		nargs.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
+		nargs.flags = (entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0;
+		xfs_attr_shortform_add(&nargs);
+	}
+	error = 0;
+
+out:
+	kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
+	return(error);
+}
+
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ */
+int
+xfs_attr_leaf_to_node(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_da_intnode_t *node;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp1, *bp2;
+	xfs_dablk_t blkno;
+	int error;
+
+	dp = args->dp;
+	bp1 = bp2 = NULL;
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error)
+		goto out;
+	error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
+					     XFS_ATTR_FORK);
+	if (error)
+		goto out;
+	ASSERT(bp1 != NULL);
+	bp2 = NULL;
+	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
+					    XFS_ATTR_FORK);
+	if (error)
+		goto out;
+	ASSERT(bp2 != NULL);
+	bcopy(bp1->data, bp2->data, XFS_LBSIZE(dp->i_mount));
+	xfs_da_buf_done(bp1);
+	bp1 = NULL;
+	xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+	/*
+	 * Set up the new root node.
+	 */
+	error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+	if (error)
+		goto out;
+	node = bp1->data;
+	leaf = bp2->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	/* both on-disk, don't endian-flip twice */
+	node->btree[0].hashval =
+		leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval;
+	INT_SET(node->btree[0].before, ARCH_CONVERT, blkno);
+	INT_SET(node->hdr.count, ARCH_CONVERT, 1);
+	xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
+	error = 0;
+out:
+	if (bp1)
+		xfs_da_buf_done(bp1);
+	if (bp2)
+		xfs_da_buf_done(bp2);
+	return(error);
+}
+
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf attribute list
+ * or a leaf in a node attribute list.
+ */
+int
+xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int error;
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
+					    XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	bzero((char *)leaf, XFS_LBSIZE(dp->i_mount));
+	hdr = &leaf->hdr;
+	INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_ATTR_LEAF_MAGIC);
+	INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
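+	/*
+	 * The INT_SET above wraps to zero when the blocksize fills the
+	 * (16 bit) firstused field, e.g. 64KB blocks; back it off below
+	 * so firstused stays nonzero.
+	 */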
+	if (INT_ISZERO(hdr->firstused, ARCH_CONVERT)) {
+		INT_SET(hdr->firstused, ARCH_CONVERT,
+			XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN);
+	}
+
+	INT_SET(hdr->freemap[0].base, ARCH_CONVERT,
+						sizeof(xfs_attr_leaf_hdr_t));
+	INT_SET(hdr->freemap[0].size, ARCH_CONVERT,
+					  INT_GET(hdr->firstused, ARCH_CONVERT)
+					- INT_GET(hdr->freemap[0].base,
+								ARCH_CONVERT));
+
+	xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+	*bpp = bp;
+	return(0);
+}
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ */
+int
+xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+				   xfs_da_state_blk_t *newblk)
+{
+	xfs_dablk_t blkno;
+	int error;
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
+	error = xfs_da_grow_inode(state->args, &blkno);
+	if (error)
+		return(error);
+	error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp);
+	if (error)
+		return(error);
+	newblk->blkno = blkno;
+	newblk->magic = XFS_ATTR_LEAF_MAGIC;
+
+	/*
+	 * Rebalance the entries across the two leaves.
+	 * NOTE: rebalance() currently depends on the 2nd block being empty.
+	 */
+	xfs_attr_leaf_rebalance(state, oldblk, newblk);
+	error = xfs_da_blk_link(state, oldblk, newblk);
+	if (error)
+		return(error);
+
+	/*
+	 * Save info on "old" attribute for "atomic rename" ops; leaf_add()
+	 * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
+	 * "new" attr's info.  We will need the "old" info to remove it later.
+	 *
+	 * Insert the "new" entry in the correct block.
+	 */
+	if (state->inleaf)
+		error = xfs_attr_leaf_add(oldblk->bp, state->args);
+	else
+		error = xfs_attr_leaf_add(newblk->bp, state->args);
+
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
+	newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
+	return(error);
+}
+
+/*
+ * Add a name to the leaf attribute list structure.
+ */
+int
+xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_attr_leaf_map_t *map;
+	int tablesize, entsize, sum, tmp, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT((args->index >= 0)
+		&& (args->index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
+	hdr = &leaf->hdr;
+	entsize = xfs_attr_leaf_newentsize(args,
+			   args->trans->t_mountp->m_sb.sb_blocksize, NULL);
+
+	/*
+	 * Search through freemap for first-fit on new name length.
+	 * (may need to figure in size of entry struct too)
+	 */
+	tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1)
+					* sizeof(xfs_attr_leaf_entry_t)
+					+ sizeof(xfs_attr_leaf_hdr_t);
+	map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1];
+	for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
+		if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
+			sum += INT_GET(map->size, ARCH_CONVERT);
+			continue;
+		}
+		if (INT_ISZERO(map->size, ARCH_CONVERT))
+			continue;	/* no space in this map */
+		tmp = entsize;
+		if (INT_GET(map->base, ARCH_CONVERT)
+				< INT_GET(hdr->firstused, ARCH_CONVERT))
+			tmp += sizeof(xfs_attr_leaf_entry_t);
+		if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
+			tmp = xfs_attr_leaf_add_work(bp, args, i);
+			return(tmp);
+		}
+		sum += INT_GET(map->size, ARCH_CONVERT);
+	}
+
+	/*
+	 * If there are no holes in the address space of the block,
+	 * and we don't have enough freespace, then compaction will do us
+	 * no good and we should just give up.
+	 */
+	if (!hdr->holes && (sum < entsize))
+		return(XFS_ERROR(ENOSPC));
+
+	/*
+	 * Compact the entries to coalesce free space.
+	 * This may change the hdr->count via dropping INCOMPLETE entries.
+	 */
+	xfs_attr_leaf_compact(args->trans, bp);
+
+	/*
+	 * After compaction, the block is guaranteed to have only one
+	 * free region, in freemap[0].	If it is not big enough, give up.
+	 */
+	if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT)
+				< (entsize + sizeof(xfs_attr_leaf_entry_t)))
+		return(XFS_ERROR(ENOSPC));
+
+	return(xfs_attr_leaf_add_work(bp, args, 0));
+}
+
+/*
+ * Add a name to a leaf attribute list structure.
+ */
+STATIC int
+xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_attr_leaf_map_t *map;
+	xfs_mount_t *mp;
+	int tmp, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	hdr = &leaf->hdr;
+	ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE));
+	ASSERT((args->index >= 0)
+		&& (args->index <= INT_GET(hdr->count, ARCH_CONVERT)));
+
+	/*
+	 * Force open some space in the entry array and fill it in.
+	 */
+	entry = &leaf->entries[args->index];
+	if (args->index < INT_GET(hdr->count, ARCH_CONVERT)) {
+		tmp  = INT_GET(hdr->count, ARCH_CONVERT) - args->index;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		ovbcopy((char *)entry, (char *)(entry+1), tmp);
+		xfs_da_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+	}
+	INT_MOD(hdr->count, ARCH_CONVERT, 1);
+
+	/*
+	 * Allocate space for the new string (at the end of the run).
+	 */
+	map = &hdr->freemap[mapindex];
+	mp = args->trans->t_mountp;
+	ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
+	ASSERT((INT_GET(map->base, ARCH_CONVERT) & 0x3) == 0);
+	ASSERT(INT_GET(map->size, ARCH_CONVERT)
+				>= xfs_attr_leaf_newentsize(args,
+					     mp->m_sb.sb_blocksize, NULL));
+	ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
+	ASSERT((INT_GET(map->size, ARCH_CONVERT) & 0x3) == 0);
+	INT_MOD(map->size, ARCH_CONVERT,
+		-xfs_attr_leaf_newentsize(args, mp->m_sb.sb_blocksize, &tmp));
+	INT_SET(entry->nameidx, ARCH_CONVERT,
+					INT_GET(map->base, ARCH_CONVERT)
+				      + INT_GET(map->size, ARCH_CONVERT));
+	INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
+	entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
+	entry->flags |= (args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0;
+	if (args->rename) {
+		entry->flags |= XFS_ATTR_INCOMPLETE;
+		if ((args->blkno2 == args->blkno) &&
+		    (args->index2 <= args->index)) {
+			args->index2++;
+		}
+	}
+	xfs_da_log_buf(args->trans, bp,
+			  XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+	ASSERT((args->index == 0) || (INT_GET(entry->hashval, ARCH_CONVERT)
+						>= INT_GET((entry-1)->hashval,
+							    ARCH_CONVERT)));
+	ASSERT((args->index == INT_GET(hdr->count, ARCH_CONVERT)-1) ||
+	       (INT_GET(entry->hashval, ARCH_CONVERT)
+			    <= (INT_GET((entry+1)->hashval, ARCH_CONVERT))));
+
+	/*
+	 * Copy the attribute name and value into the new space.
+	 *
+	 * For "remote" attribute values, simply note that we need to
+	 * allocate space for the "remote" value.  We can't actually
+	 * allocate the extents in this transaction, and we can't decide
+	 * which blocks they should be as we might allocate more blocks
+	 * as part of this transaction (a split operation for example).
+	 */
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		name_loc->namelen = args->namelen;
+		INT_SET(name_loc->valuelen, ARCH_CONVERT, args->valuelen);
+		bcopy(args->name, (char *)name_loc->nameval, args->namelen);
+		bcopy(args->value, (char *)&name_loc->nameval[args->namelen],
+				   INT_GET(name_loc->valuelen, ARCH_CONVERT));
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt->namelen = args->namelen;
+		bcopy(args->name, (char *)name_rmt->name, args->namelen);
+		entry->flags |= XFS_ATTR_INCOMPLETE;
+		/* just in case */
+		INT_ZERO(name_rmt->valuelen, ARCH_CONVERT);
+		INT_ZERO(name_rmt->valueblk, ARCH_CONVERT);
+		args->rmtblkno = 1;
+		args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+	}
+	xfs_da_log_buf(args->trans, bp,
+	     XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+				   xfs_attr_leaf_entsize(leaf, args->index)));
+
+	/*
+	 * Update the control info for this leaf node
+	 */
+	if (INT_GET(entry->nameidx, ARCH_CONVERT)
+				< INT_GET(hdr->firstused, ARCH_CONVERT)) {
+		/* both on-disk, don't endian-flip twice */
+		hdr->firstused = entry->nameidx;
+	}
+	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
+				>= ((INT_GET(hdr->count, ARCH_CONVERT)
+					* sizeof(*entry))+sizeof(*hdr)));
+	tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1)
+					* sizeof(xfs_attr_leaf_entry_t)
+					+ sizeof(xfs_attr_leaf_hdr_t);
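+	/*
+	 * A freemap region that began exactly at the old end of the entry
+	 * table must give up one slot's worth of space, since the table
+	 * has just grown by one entry.
+	 */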
+	map = &hdr->freemap[0];
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
+		if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
+			INT_MOD(map->base, ARCH_CONVERT,
+					sizeof(xfs_attr_leaf_entry_t));
+			INT_MOD(map->size, ARCH_CONVERT,
+					-sizeof(xfs_attr_leaf_entry_t));
+		}
+	}
+	INT_MOD(hdr->usedbytes, ARCH_CONVERT,
+				xfs_attr_leaf_entsize(leaf, args->index));
+	xfs_da_log_buf(args->trans, bp,
+		XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+	return(0);
+}
+
+/*
+ * Garbage collect a leaf attribute list block by copying it to a new buffer.
+ */
+STATIC void
+xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
+{
+	xfs_attr_leafblock_t *leaf_s, *leaf_d;
+	xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+	xfs_mount_t *mp;
+	char *tmpbuffer;
+
+	mp = trans->t_mountp;
+	tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+	bcopy(bp->data, tmpbuffer, XFS_LBSIZE(mp));
+	bzero(bp->data, XFS_LBSIZE(mp));
+
+	/*
+	 * Copy basic information
+	 */
+	leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
+	leaf_d = bp->data;
+	hdr_s = &leaf_s->hdr;
+	hdr_d = &leaf_d->hdr;
+	hdr_d->info = hdr_s->info;	/* struct copy */
+	INT_SET(hdr_d->firstused, ARCH_CONVERT, XFS_LBSIZE(mp));
+	/* handle truncation: a 64KB block wraps the 16-bit firstused to 0 */
+	if (INT_ISZERO(hdr_d->firstused, ARCH_CONVERT)) {
+		INT_SET(hdr_d->firstused, ARCH_CONVERT,
+				XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN);
+	}
+	INT_ZERO(hdr_d->usedbytes, ARCH_CONVERT);
+	INT_ZERO(hdr_d->count, ARCH_CONVERT);
+	hdr_d->holes = 0;
+	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
+					sizeof(xfs_attr_leaf_hdr_t));
+	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
+				INT_GET(hdr_d->firstused, ARCH_CONVERT)
+			      - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+
+	/*
+	 * Copy all entries in the same (sorted) order,
+	 * but allocate name/value pairs packed and in sequence.
+	 */
+	xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0,
+				(int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
+
+	xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
+
+	kmem_free(tmpbuffer, XFS_LBSIZE(mp));
+}
+
+/*
+ * Redistribute the attribute list entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if new block is empty, then it will get the upper half of the
+ * old block.  At present, all (one) callers pass in an empty second block.
+ *
+ * This code adjusts the args->index/blkno and args->index2/blkno2 fields
+ * to match what it is doing in splitting the attribute leaf block.  Those
+ * values are used in "atomic rename" operations on attributes.	 Note that
+ * the "new" and "old" values can end up in different blocks.
+ */
+STATIC void
+xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+				       xfs_da_state_blk_t *blk2)
+{
+	xfs_da_args_t *args;
+	xfs_da_state_blk_t *tmp_blk;
+	xfs_attr_leafblock_t *leaf1, *leaf2;
+	xfs_attr_leaf_hdr_t *hdr1, *hdr2;
+	int count, totallen, max, space, swap;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	args = state->args;
+
+	/*
+	 * Check ordering of blocks, reverse if it makes things simpler.
+	 *
+	 * NOTE: Given that all (current) callers pass in an empty
+	 * second block, this code should never set "swap".
+	 */
+	swap = 0;
+	if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) {
+		tmp_blk = blk1;
+		blk1 = blk2;
+		blk2 = tmp_blk;
+		leaf1 = blk1->bp->data;
+		leaf2 = blk2->bp->data;
+		swap = 1;
+	}
+	hdr1 = &leaf1->hdr;
+	hdr2 = &leaf2->hdr;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.  Then get
+	 * the direction to copy and the number of elements to move.
+	 *
+	 * "inleaf" is true if the new entry should be inserted into blk1.
+	 * If "swap" is also true, then reverse the sense of "inleaf".
+	 */
+	state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2,
+							    &count, &totallen);
+	if (swap)
+		state->inleaf = !state->inleaf;
+
+	/*
+	 * Move any entries required from leaf to leaf:
+	 */
+	if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		/* number entries being moved */
+		count = INT_GET(hdr1->count, ARCH_CONVERT) - count;
+		space  = INT_GET(hdr1->usedbytes, ARCH_CONVERT) - totallen;
+		space += count * sizeof(xfs_attr_leaf_entry_t);
+
+		/*
+		 * leaf2 is the destination, compact it if it looks tight.
+		 */
+		max  = INT_GET(hdr2->firstused, ARCH_CONVERT)
+						- sizeof(xfs_attr_leaf_hdr_t);
+		max -= INT_GET(hdr2->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t);
+		if (space > max) {
+			xfs_attr_leaf_compact(args->trans, blk2->bp);
+		}
+
+		/*
+		 * Move high entries from leaf1 to low end of leaf2.
+		 */
+		xfs_attr_leaf_moveents(leaf1,
+				INT_GET(hdr1->count, ARCH_CONVERT)-count,
+				leaf2, 0, count, state->mp);
+
+		xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+		xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+	} else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
+		/*
+		 * I assert that since all callers pass in an empty
+		 * second buffer, this code should never execute.
+		 */
+
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		/* number entries being moved */
+		count -= INT_GET(hdr1->count, ARCH_CONVERT);
+		space  = totallen - INT_GET(hdr1->usedbytes, ARCH_CONVERT);
+		space += count * sizeof(xfs_attr_leaf_entry_t);
+
+		/*
+		 * leaf1 is the destination, compact it if it looks tight.
+		 */
+		max  = INT_GET(hdr1->firstused, ARCH_CONVERT)
+						- sizeof(xfs_attr_leaf_hdr_t);
+		max -= INT_GET(hdr1->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t);
+		if (space > max) {
+			xfs_attr_leaf_compact(args->trans, blk1->bp);
+		}
+
+		/*
+		 * Move low entries from leaf2 to high end of leaf1.
+		 */
+		xfs_attr_leaf_moveents(leaf2, 0, leaf1,
+				(int)INT_GET(hdr1->count, ARCH_CONVERT), count,
+				state->mp);
+
+		xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+		xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+	}
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	blk1->hashval =
+	    INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count,
+				    ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
+	blk2->hashval =
+	    INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count,
+				    ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
+
+	/*
+	 * Adjust the expected index for insertion.
+	 * NOTE: this code depends on the (current) situation that the
+	 * second block was originally empty.
+	 *
+	 * If the insertion point moved to the 2nd block, we must adjust
+	 * the index.  We must also track the entry just following the
+	 * new entry for use in an "atomic rename" operation; that entry
+	 * is always the "old" entry and the "new" entry is what we are
+	 * inserting.  The index/blkno fields refer to the "old" entry,
+	 * while the index2/blkno2 fields refer to the "new" entry.
+	 */
+	if (blk1->index > INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
+		ASSERT(state->inleaf == 0);
+		blk2->index = blk1->index
+				- INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+		args->index = args->index2 = blk2->index;
+		args->blkno = args->blkno2 = blk2->blkno;
+	} else if (blk1->index == INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
+		if (state->inleaf) {
+			args->index = blk1->index;
+			args->blkno = blk1->blkno;
+			args->index2 = 0;
+			args->blkno2 = blk2->blkno;
+		} else {
+			blk2->index = blk1->index
+				    - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+			args->index = args->index2 = blk2->index;
+			args->blkno = args->blkno2 = blk2->blkno;
+		}
+	} else {
+		ASSERT(state->inleaf == 1);
+		args->index = args->index2 = blk1->index;
+		args->blkno = args->blkno2 = blk1->blkno;
+	}
+}
+
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ * GROT: Is this really necessary?  With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ */
+STATIC int
+xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
+				    xfs_da_state_blk_t *blk1,
+				    xfs_da_state_blk_t *blk2,
+				    int *countarg, int *usedbytesarg)
+{
+	xfs_attr_leafblock_t *leaf1, *leaf2;
+	xfs_attr_leaf_hdr_t *hdr1, *hdr2;
+	xfs_attr_leaf_entry_t *entry;
+	int count, max, index, totallen, half;
+	int lastdelta, foundit, tmp;
+
+	/*
+	 * Set up environment.
+	 */
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	hdr1 = &leaf1->hdr;
+	hdr2 = &leaf2->hdr;
+	foundit = 0;
+	totallen = 0;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.
+	 */
+	max = INT_GET(hdr1->count, ARCH_CONVERT)
+			+ INT_GET(hdr2->count, ARCH_CONVERT);
+	half  = (max+1) * sizeof(*entry);
+	half += INT_GET(hdr1->usedbytes, ARCH_CONVERT)
+				+ INT_GET(hdr2->usedbytes, ARCH_CONVERT)
+				+ xfs_attr_leaf_newentsize(state->args,
+						     state->blocksize, NULL);
+	half /= 2;
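+	/*
+	 * "half" is half the space that all entries of both blocks, plus
+	 * the entry being inserted, would occupy (table slots as well as
+	 * name/value bytes).  The loop below walks entries in hash order
+	 * and stops once taking another entry into the lower block would
+	 * move "totallen" further away from that midpoint.
+	 */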
+	lastdelta = state->blocksize;
+	entry = &leaf1->entries[0];
+	for (count = index = 0; count < max; entry++, index++, count++) {
+
+#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
+		/*
+		 * The new entry is in the first block, account for it.
+		 */
+		if (count == blk1->index) {
+			tmp = totallen + sizeof(*entry) +
+				xfs_attr_leaf_newentsize(state->args,
+							 state->blocksize,
+							 NULL);
+			if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+				break;
+			lastdelta = XFS_ATTR_ABS(half - tmp);
+			totallen = tmp;
+			foundit = 1;
+		}
+
+		/*
+		 * Wrap around into the second block if necessary.
+		 */
+		if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
+			leaf1 = leaf2;
+			entry = &leaf1->entries[0];
+			index = 0;
+		}
+
+		/*
+		 * Figure out if next leaf entry would be too much.
+		 */
+		tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
+									index);
+		if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+			break;
+		lastdelta = XFS_ATTR_ABS(half - tmp);
+		totallen = tmp;
+#undef XFS_ATTR_ABS
+	}
+
+	/*
+	 * Calculate the number of usedbytes that will end up in the lower
+	 * block.  If the new entry landed in the lower block, remove its
+	 * size from the byte count.
+	 */
+	totallen -= count * sizeof(*entry);
+	if (foundit) {
+		totallen -= sizeof(*entry) +
+				xfs_attr_leaf_newentsize(state->args,
+							 state->blocksize,
+							 NULL);
+	}
+
+	*countarg = count;
+	*usedbytesarg = totallen;
+	return(foundit);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it; return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ *
+ * GROT: allow for INCOMPLETE entries in calculation.
+ */
+int
+xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *info;
+	int count, bytes, forward, error, retval, i;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	info = blk->bp->data;
+	ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
+	leaf = (xfs_attr_leafblock_t *)info;
+	count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	bytes = sizeof(xfs_attr_leaf_hdr_t) +
+		count * sizeof(xfs_attr_leaf_entry_t) +
+		INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
+	if (bytes > (state->blocksize >> 1)) {
+		*action = 0;	/* blk over 50%, don't try to join */
+		return(0);
+	}
+
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it; there is no need
+	 * to coalesce it with a sibling block.  We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = (!INT_ISZERO(info->forw, ARCH_CONVERT));
+		bcopy(&state->path, &state->altpath, sizeof(state->path));
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error)
+			return(error);
+		if (retval) {
+			*action = 0;
+		} else {
+			*action = 2;
+		}
+		return(0);
+	}
+
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink an attribute list over time.
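+	 * (Illustrative: with a 4096 byte block, the two blocks' combined
+	 * contents must fit within 3072 bytes, leaving 1024 bytes free.)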
+	 */
+	/* start with smaller blk num */
+	forward = (INT_GET(info->forw, ARCH_CONVERT)
+					< INT_GET(info->back, ARCH_CONVERT));
+	for (i = 0; i < 2; forward = !forward, i++) {
+		if (forward)
+			blkno = INT_GET(info->forw, ARCH_CONVERT);
+		else
+			blkno = INT_GET(info->back, ARCH_CONVERT);
+		if (blkno == 0)
+			continue;
+		error = xfs_da_read_buf(state->args->trans, state->args->dp,
+					blkno, -1, &bp, XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		ASSERT(bp != NULL);
+
+		leaf = (xfs_attr_leafblock_t *)info;
+		count  = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		bytes  = state->blocksize - (state->blocksize>>2);
+		bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
+		leaf = bp->data;
+		ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+		count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
+		bytes -= count * sizeof(xfs_attr_leaf_entry_t);
+		bytes -= sizeof(xfs_attr_leaf_hdr_t);
+		xfs_da_brelse(state->args->trans, bp);
+		if (bytes >= 0)
+			break;	/* fits with at least 25% to spare */
+	}
+	if (i >= 2) {
+		*action = 0;
+		return(0);
+	}
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	bcopy(&state->path, &state->altpath, sizeof(state->path));
+	if (blkno < blk->blkno) {
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+	} else {
+		error = xfs_da_path_shift(state, &state->path, forward,
+						 0, &retval);
+	}
+	if (error)
+		return(error);
+	if (retval) {
+		*action = 0;
+	} else {
+		*action = 1;
+	}
+	return(0);
+}
+
+/*
+ * Remove a name from the leaf attribute list structure.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are 37% full, when combined they will leave 25% free.
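+ * (37% + 37% = 74% of a block, so the merged block is just over 25%
+ * free, matching the threshold used by xfs_attr_leaf_toosmall.)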
+ */
+int
+xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_attr_leaf_map_t *map;
+	xfs_attr_leaf_entry_t *entry;
+	int before, after, smallest, entsize;
+	int tablesize, tmp, i;
+	xfs_mount_t *mp;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	hdr = &leaf->hdr;
+	mp = args->trans->t_mountp;
+	ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0)
+		&& (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
+	ASSERT((args->index >= 0)
+		&& (args->index < INT_GET(hdr->count, ARCH_CONVERT)));
+	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
+				>= ((INT_GET(hdr->count, ARCH_CONVERT)
+					* sizeof(*entry))+sizeof(*hdr)));
+	entry = &leaf->entries[args->index];
+	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
+				>= INT_GET(hdr->firstused, ARCH_CONVERT));
+	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
+
+	/*
+	 * Scan through free region table:
+	 *    check for adjacency of free'd entry with an existing one,
+	 *    find smallest free region in case we need to replace it,
+	 *    adjust any map that borders the entry table.
+	 */
+	tablesize = INT_GET(hdr->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t)
+					+ sizeof(xfs_attr_leaf_hdr_t);
+	map = &hdr->freemap[0];
+	tmp = INT_GET(map->size, ARCH_CONVERT);
+	before = after = -1;
+	smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
+	entsize = xfs_attr_leaf_entsize(leaf, args->index);
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
+		ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
+		ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
+		if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
+			INT_MOD(map->base, ARCH_CONVERT,
+					-sizeof(xfs_attr_leaf_entry_t));
+			INT_MOD(map->size, ARCH_CONVERT,
+					sizeof(xfs_attr_leaf_entry_t));
+		}
+
+		if ((INT_GET(map->base, ARCH_CONVERT)
+					+ INT_GET(map->size, ARCH_CONVERT))
+				== INT_GET(entry->nameidx, ARCH_CONVERT)) {
+			before = i;
+		} else if (INT_GET(map->base, ARCH_CONVERT)
+			== (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
+			after = i;
+		} else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
+			tmp = INT_GET(map->size, ARCH_CONVERT);
+			smallest = i;
+		}
+	}
+
+	/*
+	 * Coalesce adjacent freemap regions,
+	 * or replace the smallest region.
+	 */
+	if ((before >= 0) || (after >= 0)) {
+		if ((before >= 0) && (after >= 0)) {
+			map = &hdr->freemap[before];
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+			INT_MOD(map->size, ARCH_CONVERT,
+				INT_GET(hdr->freemap[after].size,
+							ARCH_CONVERT));
+			INT_ZERO(hdr->freemap[after].base, ARCH_CONVERT);
+			INT_ZERO(hdr->freemap[after].size, ARCH_CONVERT);
+		} else if (before >= 0) {
+			map = &hdr->freemap[before];
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+		} else {
+			map = &hdr->freemap[after];
+			/* both on-disk, don't endian flip twice */
+			map->base = entry->nameidx;
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+		}
+	} else {
+		/*
+		 * Replace smallest region (if it is smaller than free'd entry)
+		 */
+		map = &hdr->freemap[smallest];
+		if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
+			INT_SET(map->base, ARCH_CONVERT,
+					INT_GET(entry->nameidx, ARCH_CONVERT));
+			INT_SET(map->size, ARCH_CONVERT, entsize);
+		}
+	}
+
+	/*
+	 * Did we remove the first entry?
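+	 * (Note that "smallest" is reused as a boolean flag from here on.)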
+	 */
+	if (INT_GET(entry->nameidx, ARCH_CONVERT)
+				== INT_GET(hdr->firstused, ARCH_CONVERT))
+		smallest = 1;
+	else
+		smallest = 0;
+
+	/*
+	 * Compress the remaining entries and zero out the removed stuff.
+	 */
+	bzero(XFS_ATTR_LEAF_NAME(leaf, args->index), entsize);
+	INT_MOD(hdr->usedbytes, ARCH_CONVERT, -entsize);
+	xfs_da_log_buf(args->trans, bp,
+	     XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+				   entsize));
+
+	tmp = (INT_GET(hdr->count, ARCH_CONVERT) - args->index)
+					* sizeof(xfs_attr_leaf_entry_t);
+	ovbcopy((char *)(entry+1), (char *)entry, tmp);
+	INT_MOD(hdr->count, ARCH_CONVERT, -1);
+	xfs_da_log_buf(args->trans, bp,
+	    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+	entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
+	bzero((char *)entry, sizeof(xfs_attr_leaf_entry_t));
+
+	/*
+	 * If we removed the first entry, re-find the first used byte
+	 * in the name area.  Note that if the entry was the "firstused",
+	 * then we don't have a "hole" in our block resulting from
+	 * removing the name.
+	 */
+	if (smallest) {
+		tmp = XFS_LBSIZE(mp);
+		entry = &leaf->entries[0];
+		for (i = INT_GET(hdr->count, ARCH_CONVERT)-1;
+						i >= 0; entry++, i--) {
+			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
+				>= INT_GET(hdr->firstused, ARCH_CONVERT));
+			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
+							< XFS_LBSIZE(mp));
+			if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
+				tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
+		}
+		INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
+		if (INT_ISZERO(hdr->firstused, ARCH_CONVERT)) {
+			INT_SET(hdr->firstused, ARCH_CONVERT,
+					tmp - XFS_ATTR_LEAF_NAME_ALIGN);
+		}
+	} else {
+		hdr->holes = 1;		/* mark as needing compaction */
+	}
+	xfs_da_log_buf(args->trans, bp,
+			  XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+
+	/*
+	 * Check if leaf is less than 50% full, caller may want to
+	 * "join" the leaf with a sibling if so.
+	 */
+	tmp  = sizeof(xfs_attr_leaf_hdr_t);
+	tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t);
+	tmp += INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
+	return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */
+}
+
+/*
+ * Move all the attribute list entries from drop_leaf into save_leaf.
+ */
+void
+xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+				       xfs_da_state_blk_t *save_blk)
+{
+	xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
+	xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
+	xfs_mount_t *mp;
+	char *tmpbuffer;
+
+	/*
+	 * Set up environment.
+	 */
+	mp = state->mp;
+	ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+	drop_leaf = drop_blk->bp->data;
+	save_leaf = save_blk->bp->data;
+	ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	drop_hdr = &drop_leaf->hdr;
+	save_hdr = &save_leaf->hdr;
+
+	/*
+	 * Save last hashval from dying block for later Btree fixup.
+	 */
+	drop_blk->hashval =
+		INT_GET(drop_leaf->entries[INT_GET(drop_leaf->hdr.count,
+						ARCH_CONVERT)-1].hashval,
+								ARCH_CONVERT);
+
+	/*
+	 * Check if we need a temp buffer, or can we do it in place.
+	 * Note that we don't check "leaf" for holes because we will
+	 * always be dropping it; toosmall() decided that for us already.
+	 */
+	if (save_hdr->holes == 0) {
+		/*
+		 * dest leaf has no holes, so we add there.  May need
+		 * to make some room in the entry array.
+		 */
+		if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
+			xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0,
+			     (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+		} else {
+			xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf,
+				  INT_GET(save_hdr->count, ARCH_CONVERT),
+				  (int)INT_GET(drop_hdr->count, ARCH_CONVERT),
+				  mp);
+		}
+	} else {
+		/*
+		 * Destination has holes, so we make a temporary copy
+		 * of the leaf and add them both to that.
+		 */
+		tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
+		ASSERT(tmpbuffer != NULL);
+		bzero(tmpbuffer, state->blocksize);
+		tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+		tmp_hdr = &tmp_leaf->hdr;
+		tmp_hdr->info = save_hdr->info; /* struct copy */
+		INT_ZERO(tmp_hdr->count, ARCH_CONVERT);
+		INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
+		if (INT_ISZERO(tmp_hdr->firstused, ARCH_CONVERT)) {
+			INT_SET(tmp_hdr->firstused, ARCH_CONVERT,
+				state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN);
+		}
+		INT_ZERO(tmp_hdr->usedbytes, ARCH_CONVERT);
+		if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
+			xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
+				(int)INT_GET(drop_hdr->count, ARCH_CONVERT),
+				mp);
+			xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf,
+				  INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
+				 (int)INT_GET(save_hdr->count, ARCH_CONVERT),
+				 mp);
+		} else {
+			xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
+				(int)INT_GET(save_hdr->count, ARCH_CONVERT),
+				mp);
+			xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf,
+				INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
+				(int)INT_GET(drop_hdr->count, ARCH_CONVERT),
+				mp);
+		}
+		bcopy((char *)tmp_leaf, (char *)save_leaf, state->blocksize);
+		kmem_free(tmpbuffer, state->blocksize);
+	}
+
+	xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
+					   state->blocksize - 1);
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	save_blk->hashval =
+		INT_GET(save_leaf->entries[INT_GET(save_leaf->hdr.count,
+						ARCH_CONVERT)-1].hashval,
+								ARCH_CONVERT);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but we only check within the
+ * current leaf node.  The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in args->index the index into the entry[] array of either
+ * the found entry, or where the entry should have been (insert before
+ * that entry).
+ *
+ * Don't change the args->value unless we find the attribute.
+ */
+int
+xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	int probe, span;
+	xfs_dahash_t hashval;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT)
+					< (XFS_LBSIZE(args->dp->i_mount)/8));
+
+	/*
+	 * Binary search.  (note: small blocks will skip this loop)
+	 */
+	hashval = args->hashval;
+	probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
+	for (entry = &leaf->entries[probe]; span > 4;
+		   entry = &leaf->entries[probe]) {
+		span /= 2;
+		if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
+			probe += span;
+		else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
+			probe -= span;
+		else
+			break;
+	}
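+	/*
+	 * The binary search above only narrows the range; it stops once
+	 * the span is down to a few entries (span <= 4) and leaves the
+	 * linear scans below to do the rest.
+	 */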
+	ASSERT((probe >= 0) &&
+	       ((INT_ISZERO(leaf->hdr.count, ARCH_CONVERT))
+	       || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
+	ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT)
+							== hashval));
+
+	/*
+	 * Since we may have duplicate hashval's, find the first matching
+	 * hashval in the leaf.
+	 */
+	while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT)
+							>= hashval)) {
+		entry--;
+		probe--;
+	}
+	while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+		&& (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
+		entry++;
+		probe++;
+	}
+	if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT))
+		    || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
+		args->index = probe;
+		return(XFS_ERROR(ENOATTR));
+	}
+
+	/*
+	 * Duplicate keys may be present, so search all of them for a match.
+	 */
+	for (  ; (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+			&& (INT_GET(entry->hashval, ARCH_CONVERT) == hashval);
+			entry++, probe++) {
+/*
+ * GROT: Add code to remove incomplete entries.
+ */
+		/*
+		 * If we are looking for INCOMPLETE entries, show only those.
+		 * If we are looking for complete entries, show only those.
+		 */
+		if ((args->flags & XFS_ATTR_INCOMPLETE) !=
+		    (entry->flags & XFS_ATTR_INCOMPLETE)) {
+			continue;
+		}
+		if (entry->flags & XFS_ATTR_LOCAL) {
+			name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe);
+			if (name_loc->namelen != args->namelen)
+				continue;
+			if (bcmp(args->name, (char *)name_loc->nameval,
+					     args->namelen) != 0)
+				continue;
+			if (((args->flags & ATTR_ROOT) != 0) !=
+			    ((entry->flags & XFS_ATTR_ROOT) != 0))
+				continue;
+			args->index = probe;
+			return(XFS_ERROR(EEXIST));
+		} else {
+			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe);
+			if (name_rmt->namelen != args->namelen)
+				continue;
+			if (bcmp(args->name, (char *)name_rmt->name,
+					     args->namelen) != 0)
+				continue;
+			if (((args->flags & ATTR_ROOT) != 0) !=
+			    ((entry->flags & XFS_ATTR_ROOT) != 0))
+				continue;
+			args->index = probe;
+			args->rmtblkno
+				  = INT_GET(name_rmt->valueblk, ARCH_CONVERT);
+			args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount,
+						   INT_GET(name_rmt->valuelen,
+								ARCH_CONVERT));
+			return(XFS_ERROR(EEXIST));
+		}
+	}
+	args->index = probe;
+	return(XFS_ERROR(ENOATTR));
+}
+
+/*
+ * Get the value associated with an attribute name from a leaf attribute
+ * list structure.
+ */
+int
+xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	int valuelen;
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT)
+					< (XFS_LBSIZE(args->dp->i_mount)/8));
+	ASSERT(args->index < ((int)INT_GET(leaf->hdr.count, ARCH_CONVERT)));
+
+	entry = &leaf->entries[args->index];
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		ASSERT(name_loc->namelen == args->namelen);
+		ASSERT(bcmp(args->name, name_loc->nameval, args->namelen) == 0);
+		valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT);
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = valuelen;
+			return(0);
+		}
+		if (args->valuelen < valuelen) {
+			args->valuelen = valuelen;
+			return(XFS_ERROR(ERANGE));
+		}
+		args->valuelen = valuelen;
+		bcopy(&name_loc->nameval[args->namelen], args->value, valuelen);
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		ASSERT(name_rmt->namelen == args->namelen);
+		ASSERT(bcmp(args->name, name_rmt->name, args->namelen) == 0);
+		valuelen = INT_GET(name_rmt->valuelen, ARCH_CONVERT);
+		args->rmtblkno = INT_GET(name_rmt->valueblk, ARCH_CONVERT);
+		args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen);
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = valuelen;
+			return(0);
+		}
+		if (args->valuelen < valuelen) {
+			args->valuelen = valuelen;
+			return(XFS_ERROR(ERANGE));
+		}
+		args->valuelen = valuelen;
+	}
+	return(0);
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Move the indicated entries from one leaf to another.
+ * NOTE: this routine modifies both source and destination leaves.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
+			xfs_attr_leafblock_t *leaf_d, int start_d,
+			int count, xfs_mount_t *mp)
+{
+	xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+	xfs_attr_leaf_entry_t *entry_s, *entry_d;
+	int desti, tmp, i;
+
+	/*
+	 * Check for nothing to do.
+	 */
+	if (count == 0)
+		return;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	hdr_s = &leaf_s->hdr;
+	hdr_d = &leaf_d->hdr;
+	ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0)
+				&& (INT_GET(hdr_s->count, ARCH_CONVERT)
+						< (XFS_LBSIZE(mp)/8)));
+	ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
+		((INT_GET(hdr_s->count, ARCH_CONVERT)
+					* sizeof(*entry_s))+sizeof(*hdr_s)));
+	ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
+	ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
+		((INT_GET(hdr_d->count, ARCH_CONVERT)
+					* sizeof(*entry_d))+sizeof(*hdr_d)));
+
+	ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
+	ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
+	ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
+
+	/*
+	 * Move the entries in the destination leaf up to make a hole?
+	 */
+	if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
+		tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_d->entries[start_d];
+		entry_d = &leaf_d->entries[start_d + count];
+		ovbcopy((char *)entry_s, (char *)entry_d, tmp);
+	}
+
+	/*
+	 * Copy all entries in the same (sorted) order,
+	 * but allocate attribute info packed and in sequence.
+	 */
+	entry_s = &leaf_s->entries[start_s];
+	entry_d = &leaf_d->entries[start_d];
+	desti = start_d;
+	for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
+		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT)
+				>= INT_GET(hdr_s->firstused, ARCH_CONVERT));
+		tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
+#ifdef GROT
+		/*
+		 * Code to drop INCOMPLETE entries.  Difficult to use as we
+		 * may also need to change the insertion index.	 Code turned
+		 * off for 6.2, should be revisited later.
+		 */
+		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
+			bzero(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
+			INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
+			INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
+			entry_d--;	/* to compensate for ++ in loop hdr */
+			desti--;
+			if ((start_s + i) < offset)
+				result++;	/* insertion index adjustment */
+		} else {
+#endif /* GROT */
+			INT_MOD(hdr_d->firstused, ARCH_CONVERT, -tmp);
+			/* both on-disk, don't endian flip twice */
+			entry_d->hashval = entry_s->hashval;
+			/* both on-disk, don't endian flip twice */
+			entry_d->nameidx = hdr_d->firstused;
+			entry_d->flags = entry_s->flags;
+			ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp
+							<= XFS_LBSIZE(mp));
+			ovbcopy(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i),
+			      XFS_ATTR_LEAF_NAME(leaf_d, desti), tmp);
+			ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp
+							<= XFS_LBSIZE(mp));
+			bzero(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
+			INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
+			INT_MOD(hdr_d->usedbytes, ARCH_CONVERT, tmp);
+			INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
+			INT_MOD(hdr_d->count, ARCH_CONVERT, 1);
+			tmp = INT_GET(hdr_d->count, ARCH_CONVERT)
+						* sizeof(xfs_attr_leaf_entry_t)
+						+ sizeof(xfs_attr_leaf_hdr_t);
+			ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
+#ifdef GROT
+		}
+#endif /* GROT */
+	}
+
+	/*
+	 * Zero out the entries we just copied.
+	 */
+	if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_s->entries[start_s];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + XFS_LBSIZE(mp)));
+		bzero((char *)entry_s, tmp);
+	} else {
+		/*
+		 * Move the remaining entries down to fill the hole,
+		 * then zero the entries at the top.
+		 */
+		tmp  = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_s->entries[start_s + count];
+		entry_d = &leaf_s->entries[start_s];
+		ovbcopy((char *)entry_s, (char *)entry_d, tmp);
+
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_s->entries[INT_GET(hdr_s->count,
+							ARCH_CONVERT)];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + XFS_LBSIZE(mp)));
+		bzero((char *)entry_s, tmp);
+	}
+
+	/*
+	 * Fill in the freemap information
+	 */
+	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
+					sizeof(xfs_attr_leaf_hdr_t));
+	INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT,
+				INT_GET(hdr_d->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t));
+	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
+				INT_GET(hdr_d->firstused, ARCH_CONVERT)
+			      - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+	INT_ZERO(hdr_d->freemap[1].base, ARCH_CONVERT);
+	INT_ZERO(hdr_d->freemap[2].base, ARCH_CONVERT);
+	INT_ZERO(hdr_d->freemap[1].size, ARCH_CONVERT);
+	INT_ZERO(hdr_d->freemap[2].size, ARCH_CONVERT);
+	hdr_s->holes = 1;	/* leaf may not be compact */
+}
+
+/*
+ * Compare two leaf blocks' "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+int
+xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
+{
+	xfs_attr_leafblock_t *leaf1, *leaf2;
+
+	leaf1 = leaf1_bp->data;
+	leaf2 = leaf2_bp->data;
+	ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC) &&
+	       (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC));
+	if (   (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0)
+	    && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0)
+	    && (   (INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
+		      INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT))
+		|| (INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count,
+				ARCH_CONVERT)-1].hashval, ARCH_CONVERT) <
+		      INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count,
+				ARCH_CONVERT)-1].hashval, ARCH_CONVERT))) ) {
+		return(1);
+	}
+	return(0);
+}
+
+/*
+ * Pick up the last hashvalue from a leaf block.
+ */
+xfs_dahash_t
+xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count)
+{
+	xfs_attr_leafblock_t *leaf;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	if (count)
+		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	if (INT_ISZERO(leaf->hdr.count, ARCH_CONVERT))
+		return(0);
+	return(INT_GET(leaf->entries[INT_GET(leaf->hdr.count,
+				ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
+}
+
+/*
+ * Calculate the number of bytes used to store the indicated attribute
+ * (whether local or remote; only bytes in this block are counted).
+ */
+int
+xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
+{
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	int size;
+
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index);
+		size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen,
+						   INT_GET(name_loc->valuelen,
+								ARCH_CONVERT));
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index);
+		size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen);
+	}
+	return(size);
+}
+
+/*
+ * Calculate the number of bytes that would be required to store the new
+ * attribute (whether local or remote; only bytes in this block are counted).
+ * This routine decides as a side effect whether the attribute will be
+ * a "local" or a "remote" attribute.
+ */
+int
+xfs_attr_leaf_newentsize(xfs_da_args_t *args, int blocksize, int *local)
+{
+	int size;
+
+	size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(args->namelen, args->valuelen);
+	if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) {
+		if (local) {
+			*local = 1;
+		}
+	} else {
+		size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(args->namelen);
+		if (local) {
+			*local = 0;
+		}
+	}
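+	/* "size" counts only bytes resident in this leaf block: name and
+	 * value for a local attribute, just the name for a remote one. */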
+	return(size);
+}
+
+/*
+ * Copy out attribute list entries for attr_list(), for leaf attribute lists.
+ */
+int
+xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
+{
+	attrlist_cursor_kern_t *cursor;
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	int retval, i;
+
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	cursor = context->cursor;
+	cursor->initted = 1;
+
+	xfs_attr_trace_l_cl("blk start", context, leaf);
+
+	/*
+	 * Re-find our place in the leaf block if this is a new syscall.
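+	 * The cursor's hashval/offset pair records where the previous
+	 * call stopped; dupcnt steps over entries that share the same
+	 * hashval so a run of duplicates resumes at the right one.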
+	 */
+	if (context->resynch) {
+		entry = &leaf->entries[0];
+		for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
+							entry++, i++) {
+			if (INT_GET(entry->hashval, ARCH_CONVERT)
+							== cursor->hashval) {
+				if (cursor->offset == context->dupcnt) {
+					context->dupcnt = 0;
+					break;
+				}
+				context->dupcnt++;
+			} else if (INT_GET(entry->hashval, ARCH_CONVERT)
+							> cursor->hashval) {
+				context->dupcnt = 0;
+				break;
+			}
+		}
+		if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
+			xfs_attr_trace_l_c("not found", context);
+			return(0);
+		}
+	} else {
+		entry = &leaf->entries[0];
+		i = 0;
+	}
+	context->resynch = 0;
+
+	/*
+	 * We have found our place; start copying out the new attributes.
+	 */
+	retval = 0;
+	for (  ; (i < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+	     && (retval == 0); entry++, i++) {
+		int ns = (entry->flags & XFS_ATTR_ROOT)? ROOT_NAMES:USER_NAMES;
+
+		if (INT_GET(entry->hashval, ARCH_CONVERT) != cursor->hashval) {
+			cursor->hashval = INT_GET(entry->hashval, ARCH_CONVERT);
+			cursor->offset = 0;
+		}
+
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;		/* skip incomplete entries */
+		if (((context->flags & ATTR_ROOT) != 0) !=
+		    ((entry->flags & XFS_ATTR_ROOT) != 0) &&
+		    !(context->flags & ATTR_KERNFULLS))
+			continue;		/* skip non-matching entries */
+
+		if (entry->flags & XFS_ATTR_LOCAL) {
+			name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+			if (context->flags & ATTR_KERNOVAL) {
+				ASSERT(context->flags & ATTR_KERNAMELS);
+				context->count += xfs_namespaces[ns].namelen
+						+ (int)name_loc->namelen + 1;
+			} else {
+				retval = xfs_attr_put_listent(context, ns,
+					(char *)name_loc->nameval,
+					(int)name_loc->namelen,
+					(int)INT_GET(name_loc->valuelen,
+								ARCH_CONVERT));
+			}
+		} else {
+			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			if (context->flags & ATTR_KERNOVAL) {
+				ASSERT(context->flags & ATTR_KERNAMELS);
+				context->count += xfs_namespaces[ns].namelen
+						+ (int)name_rmt->namelen + 1;
+			} else {
+				retval = xfs_attr_put_listent(context, ns,
+					(char *)name_rmt->name,
+					(int)name_rmt->namelen,
+					(int)INT_GET(name_rmt->valuelen,
+								ARCH_CONVERT));
+			}
+		}
+		if (retval == 0) {
+			cursor->offset++;
+		}
+	}
+	xfs_attr_trace_l_cl("blk end", context, leaf);
+	return(retval);
+}
+
+#define ATTR_ENTBASESIZE		/* minimum bytes used by an attr */ \
+	(((struct attrlist_ent *) 0)->a_name - (char *) 0)
+#define ATTR_ENTSIZE(namelen)		/* actual bytes used by an attr */ \
+	((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
+	 & ~(sizeof(u_int32_t)-1))
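+
+/*
+ * ATTR_ENTBASESIZE is offsetof(struct attrlist_ent, a_name) spelled out
+ * by hand, and ATTR_ENTSIZE rounds the name plus its nul terminator up
+ * to the next u_int32_t boundary so that every entry placed in the
+ * user's buffer stays 4-byte aligned.
+ */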
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later;
+ * we may be reading them directly out of a user buffer.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_put_listent(xfs_attr_list_context_t *context,
+		     int ns, char *name, int namelen, int valuelen)
+{
+	attrlist_ent_t *aep;
+	int arraytop;
+
+	ASSERT(!(context->flags & ATTR_KERNOVAL));
+	if (context->flags & ATTR_KERNAMELS) {
+		char *offset;
+		xattr_namespace_t *nsp;
+
+		ASSERT(context->count >= 0);
+
+		nsp = &xfs_namespaces[ns];
+		arraytop = context->count + nsp->namelen + namelen+1;
+		if (arraytop > context->firstu) {
+			context->count = -1;	/* insufficient space */
+			return(1);
+		}
+		offset = (char *)context->alist + context->count;
+		strncpy(offset, nsp->name, nsp->namelen);	/* namespace */
+		offset += nsp->namelen;
+		strncpy(offset, name, namelen);			/* real name */
+		offset += namelen;
+		*offset = '\0';
+		context->count += nsp->namelen + namelen + 1;
+		return(0);
+	}
+
+	ASSERT(context->count >= 0);
+	ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+	ASSERT(context->firstu >= sizeof(*context->alist));
+	ASSERT(context->firstu <= context->bufsize);
+
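+	/*
+	 * The user's buffer is filled from both ends: the al_offset[]
+	 * array grows up from the attrlist header while the entries
+	 * themselves are packed down from the top (firstu).  The buffer
+	 * is full when the two regions would meet.
+	 */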
+	arraytop = sizeof(*context->alist) +
+			context->count * sizeof(context->alist->al_offset[0]);
+	context->firstu -= ATTR_ENTSIZE(namelen);
+	if (context->firstu < arraytop) {
+		xfs_attr_trace_l_c("buffer full", context);
+		context->alist->al_more = 1;
+		return(1);
+	}
+
+	aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
+	aep->a_valuelen = valuelen;
+	bcopy(name, aep->a_name, namelen);
+	aep->a_name[ namelen ] = 0;
+	context->alist->al_offset[ context->count++ ] = context->firstu;
+	context->alist->al_count = context->count;
+	xfs_attr_trace_l_c("add", context);
+	return(0);
+}
+
+/*========================================================================
+ * Manage the INCOMPLETE flag in a leaf entry
+ *========================================================================*/
+
+/*
+ * Clear the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr_leaf_clearflag(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_dabuf_t *bp;
+	int error;
+#ifdef DEBUG
+	xfs_attr_leaf_name_local_t *name_loc;
+	int namelen;
+	char *name;
+#endif /* DEBUG */
+
+	/*
+	 * Set up the operation.
+	 */
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+	ASSERT(bp != NULL);
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index >= 0);
+	entry = &leaf->entries[ args->index ];
+	ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
+
+#ifdef DEBUG
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		namelen = name_loc->namelen;
+		name = (char *)name_loc->nameval;
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		namelen = name_rmt->namelen;
+		name = (char *)name_rmt->name;
+	}
+	ASSERT(INT_GET(entry->hashval, ARCH_CONVERT) == args->hashval);
+	ASSERT(namelen == args->namelen);
+	ASSERT(bcmp(name, args->name, namelen) == 0);
+#endif /* DEBUG */
+
+	entry->flags &= ~XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+
+	if (args->rmtblkno) {
+		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
+		INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
+		xfs_da_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+	}
+	xfs_da_buf_done(bp);
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_attr_rolltrans(&args->trans, args->dp);
+
+	return(error);
+}
+
+/*
+ * Set the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr_leaf_setflag(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_dabuf_t *bp;
+	int error;
+
+	/*
+	 * Set up the operation.
+	 */
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+	ASSERT(bp != NULL);
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index >= 0);
+	entry = &leaf->entries[ args->index ];
+
+	ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
+	entry->flags |= XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp,
+			XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+	if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		INT_ZERO(name_rmt->valueblk, ARCH_CONVERT);
+		INT_ZERO(name_rmt->valuelen, ARCH_CONVERT);
+		xfs_da_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+	}
+	xfs_da_buf_done(bp);
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_attr_rolltrans(&args->trans, args->dp);
+
+	return(error);
+}
+
+/*
+ * In a single transaction, clear the INCOMPLETE flag on the leaf entry
+ * given by args->blkno/index and set the INCOMPLETE flag on the leaf
+ * entry given by args->blkno2/index2.
+ *
+ * Note that they could be in different blocks, or in the same block.
+ */
+int
+xfs_attr_leaf_flipflags(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf1, *leaf2;
+	xfs_attr_leaf_entry_t *entry1, *entry2;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_dabuf_t *bp1, *bp2;
+	int error;
+#ifdef DEBUG
+	xfs_attr_leaf_name_local_t *name_loc;
+	int namelen1, namelen2;
+	char *name1, *name2;
+#endif /* DEBUG */
+
+	/*
+	 * Read the block containing the "old" attr
+	 */
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+	ASSERT(bp1 != NULL);
+
+	/*
+	 * Read the block containing the "new" attr, if it is different
+	 */
+	if (args->blkno2 != args->blkno) {
+		error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
+					-1, &bp2, XFS_ATTR_FORK);
+		if (error) {
+			return(error);
+		}
+		ASSERT(bp2 != NULL);
+	} else {
+		bp2 = bp1;
+	}
+
+	leaf1 = bp1->data;
+	ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index < INT_GET(leaf1->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index >= 0);
+	entry1 = &leaf1->entries[ args->index ];
+
+	leaf2 = bp2->data;
+	ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index2 < INT_GET(leaf2->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index2 >= 0);
+	entry2 = &leaf2->entries[ args->index2 ];
+
+#ifdef DEBUG
+	if (entry1->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index);
+		namelen1 = name_loc->namelen;
+		name1 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+		namelen1 = name_rmt->namelen;
+		name1 = (char *)name_rmt->name;
+	}
+	if (entry2->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2);
+		namelen2 = name_loc->namelen;
+		name2 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+		namelen2 = name_rmt->namelen;
+		name2 = (char *)name_rmt->name;
+	}
+	ASSERT(INT_GET(entry1->hashval, ARCH_CONVERT) == INT_GET(entry2->hashval, ARCH_CONVERT));
+	ASSERT(namelen1 == namelen2);
+	ASSERT(bcmp(name1, name2, namelen1) == 0);
+#endif /* DEBUG */
+
+	ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
+	ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
+
+	entry1->flags &= ~XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp1,
+			  XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
+	if (args->rmtblkno) {
+		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+		INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
+		INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
+		xfs_da_log_buf(args->trans, bp1,
+			 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
+	}
+
+	entry2->flags |= XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp2,
+			  XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
+	if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+		INT_ZERO(name_rmt->valueblk, ARCH_CONVERT);
+		INT_ZERO(name_rmt->valuelen, ARCH_CONVERT);
+		xfs_da_log_buf(args->trans, bp2,
+			 XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
+	}
+	xfs_da_buf_done(bp1);
+	if (bp1 != bp2)
+		xfs_da_buf_done(bp2);
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_attr_rolltrans(&args->trans, args->dp);
+
+	return(error);
+}
+
+/*========================================================================
+ * Indiscriminately delete the entire attribute fork
+ *========================================================================*/
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+int
+xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
+{
+	xfs_da_blkinfo_t *info;
+	xfs_daddr_t blkno;
+	xfs_dabuf_t *bp;
+	int error;
+
+	/*
+	 * Read block 0 to see what we have to work with.
+	 * We only get here if we have extents; since we remove the
+	 * extents in reverse order, the extent containing block 0
+	 * must still be there.
+	 */
+	error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	blkno = xfs_da_blkno(bp);
+
+	/*
+	 * Invalidate the tree, even if the "tree" is only a single leaf block.
+	 * This is a depth-first traversal!
+	 */
+	info = bp->data;
+	if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+		error = xfs_attr_node_inactive(trans, dp, bp, 1);
+	} else if (INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
+		error = xfs_attr_leaf_inactive(trans, dp, bp);
+	} else {
+		error = XFS_ERROR(EIO);
+		xfs_da_brelse(*trans, bp);
+	}
+	if (error)
+		return(error);
+
+	/*
+	 * Invalidate the incore copy of the root block.
+	 */
+	error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	xfs_da_binval(*trans, bp);	/* remove from cache */
+	/*
+	 * Commit the invalidate and start the next transaction.
+	 */
+	error = xfs_attr_rolltrans(trans, dp);
+
+	return (error);
+}
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+int
+xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
+				   int level)
+{
+	xfs_da_blkinfo_t *info;
+	xfs_da_intnode_t *node;
+	xfs_dablk_t child_fsb;
+	xfs_daddr_t parent_blkno, child_blkno;
+	int error, count, i;
+	xfs_dabuf_t *child_bp;
+
+	/*
+	 * Since this code is recursive (gasp!) we must protect ourselves.
+	 */
+	if (level > XFS_DA_NODE_MAXDEPTH) {
+		xfs_da_brelse(*trans, bp);	/* no locks for later trans */
+		return(XFS_ERROR(EIO));
+	}
+
+	node = bp->data;
+	ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT)
+						== XFS_DA_NODE_MAGIC);
+	parent_blkno = xfs_da_blkno(bp);	/* save for re-read later */
+	count = INT_GET(node->hdr.count, ARCH_CONVERT);
+	if (!count) {
+		xfs_da_brelse(*trans, bp);
+		return(0);
+	}
+	child_fsb = INT_GET(node->btree[0].before, ARCH_CONVERT);
+	xfs_da_brelse(*trans, bp);	/* no locks for later trans */
+
+	/*
+	 * If this is the node level just above the leaves, simply loop
+	 * over the leaves removing all of them.  If this is higher up
+	 * in the tree, recurse downward.
+	 */
+	for (i = 0; i < count; i++) {
+		/*
+		 * Read the subsidiary block to see what we have to work with.
+		 * Don't do this in a transaction.  This is a depth-first
+		 * traversal of the tree so we may deal with many blocks
+		 * before we come back to this one.
+		 */
+		error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
+						XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		if (child_bp) {
+			/* save for re-read later */
+			child_blkno = xfs_da_blkno(child_bp);
+
+			/*
+			 * Invalidate the subtree, whatever kind of block
+			 * it turns out to be.
+			 */
+			info = child_bp->data;
+			if (INT_GET(info->magic, ARCH_CONVERT)
+							== XFS_DA_NODE_MAGIC) {
+				error = xfs_attr_node_inactive(trans, dp,
+						child_bp, level+1);
+			} else if (INT_GET(info->magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC) {
+				error = xfs_attr_leaf_inactive(trans, dp,
+						child_bp);
+			} else {
+				error = XFS_ERROR(EIO);
+				xfs_da_brelse(*trans, child_bp);
+			}
+			if (error)
+				return(error);
+
+			/*
+			 * Remove the subsidiary block from the cache
+			 * and from the log.
+			 */
+			error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
+				&child_bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+			xfs_da_binval(*trans, child_bp);
+		}
+
+		/*
+		 * If we're not done, re-read the parent to get the next
+		 * child block number.
+		 */
+		if ((i+1) < count) {
+			error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
+				&bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+			node = bp->data;	/* old node buffer was released above */
+			child_fsb = INT_GET(node->btree[i+1].before, ARCH_CONVERT);
+			xfs_da_brelse(*trans, bp);
+		}
+		/*
+		 * Atomically commit the whole invalidation.
+		 */
+		if ((error = xfs_attr_rolltrans(trans, dp)))
+			return (error);
+	}
+
+	return(0);
+}
+
+/*
+ * Invalidate all of the "remote" value regions pointed to by a particular
+ * leaf block.
+ * Note that we must release the lock on the buffer so that we are not
+ * caught holding something that the logging code wants to flush to disk.
+ */
+int
+xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_attr_inactive_list_t *list, *lp;
+	int error, count, size, tmp, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+
+	/*
+	 * Count the number of "remote" value extents.
+	 */
+	count = 0;
+	entry = &leaf->entries[0];
+	for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+		if (   INT_GET(entry->nameidx, ARCH_CONVERT)
+		    && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			if (!INT_ISZERO(name_rmt->valueblk, ARCH_CONVERT))
+				count++;
+		}
+	}
+
+	/*
+	 * If there are no "remote" values, we're done.
+	 */
+	if (count == 0) {
+		xfs_da_brelse(*trans, bp);
+		return(0);
+	}
+
+	/*
+	 * Allocate storage for a list of all the "remote" value extents.
+	 */
+	size = count * sizeof(xfs_attr_inactive_list_t);
+	list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP);
+
+	/*
+	 * Identify each of the "remote" value extents.
+	 */
+	lp = list;
+	entry = &leaf->entries[0];
+	for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+		if (   INT_GET(entry->nameidx, ARCH_CONVERT)
+		    && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			if (!INT_ISZERO(name_rmt->valueblk, ARCH_CONVERT)) {
+				/* both on-disk, don't endian flip twice */
+				lp->valueblk = name_rmt->valueblk;
+				INT_SET(lp->valuelen, ARCH_CONVERT,
+						XFS_B_TO_FSB(dp->i_mount,
+						    INT_GET(name_rmt->valuelen,
+							      ARCH_CONVERT)));
+				lp++;
+			}
+		}
+	}
+	xfs_da_brelse(*trans, bp);	/* unlock for trans. in freextent() */
+
+	/*
+	 * Invalidate each of the "remote" value extents.
+	 */
+	error = 0;
+	for (lp = list, i = 0; i < count; i++, lp++) {
+		tmp = xfs_attr_leaf_freextent(trans, dp,
+						     INT_GET(lp->valueblk,
+								ARCH_CONVERT),
+						     INT_GET(lp->valuelen,
+								ARCH_CONVERT));
+		if (error == 0)
+			error = tmp;	/* save only the 1st errno */
+	}
+
+	kmem_free((xfs_caddr_t)list, size);
+	return(error);
+}
+
+/*
+ * Look at all the extents for this logical region,
+ * invalidate any buffers that are incore/in transactions.
+ */
+int
+xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
+				    xfs_dablk_t blkno, int blkcnt)
+{
+	xfs_bmbt_irec_t map;
+	xfs_dablk_t tblkno;
+	int tblkcnt, dblkcnt, nmap, error;
+	xfs_daddr_t dblkno;
+	xfs_buf_t *bp;
+
+	/*
+	 * Roll through the "value", invalidating the attribute value's
+	 * blocks.
+	 */
+	tblkno = blkno;
+	tblkcnt = blkcnt;
+	while (tblkcnt > 0) {
+		/*
+		 * Look up where this part of the value was allocated.
+		 */
+		nmap = 1;
+		error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
+					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+					NULL, 0, &map, &nmap, NULL);
+		if (error) {
+			return(error);
+		}
+		ASSERT(nmap == 1);
+		ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+
+		/*
+		 * If it's a hole, these are already unmapped
+		 * so there's nothing to invalidate.
+		 */
+		if (map.br_startblock != HOLESTARTBLOCK) {
+
+			dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
+						  map.br_startblock);
+			dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
+						map.br_blockcount);
+			bp = xfs_trans_get_buf(*trans,
+					dp->i_mount->m_ddev_targp,
+					dblkno, dblkcnt, 0);
+			xfs_trans_binval(*trans, bp);
+			/*
+			 * Roll to next transaction.
+			 */
+			if ((error = xfs_attr_rolltrans(trans, dp)))
+				return (error);
+		}
+
+		tblkno += map.br_blockcount;
+		tblkcnt -= map.br_blockcount;
+	}
+
+	return(0);
+}
+
+
+/*
+ * Roll from one trans in the sequence of PERMANENT transactions to the next.
+ */
+int
+xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
+{
+	xfs_trans_t *trans;
+	unsigned int logres, count;
+	int	error;
+
+	/*
+	 * Ensure that the inode is always logged.
+	 */
+	trans = *transp;
+	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+
+	/*
+	 * Copy the critical parameters from one trans to the next.
+	 */
+	logres = trans->t_log_res;
+	count = trans->t_log_count;
+	*transp = xfs_trans_dup(trans);
+
+	/*
+	 * Commit the current transaction.
+	 * If the commit fails, it will just unlock those items that
+	 * are not marked ihold; that also means a filesystem shutdown
+	 * is in progress.  The caller is responsible for cancelling
+	 * the duplicate transaction that gets returned.
+	 */
+	if ((error = xfs_trans_commit(trans, 0, NULL)))
+		return (error);
+
+	trans = *transp;
+
+	/*
+	 * Reserve space in the log for the next transaction.
+	 * This also pushes items in the "AIL", the list of logged items,
+	 * out to disk if they are taking up space at the tail of the log
+	 * that we want to use.	 This requires that either nothing be locked
+	 * across this call, or that anything that is locked be logged in
+	 * the prior and the next transactions.
+	 */
+	error = xfs_trans_reserve(trans, 0, logres, 0,
+				  XFS_TRANS_PERM_LOG_RES, count);
+	/*
+	 *  Ensure that the inode is in the new transaction and locked.
+	 */
+	if (!error) {
+		xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
+		xfs_trans_ihold(trans, dp);
+	}
+	return (error);
+}
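+
+/*
+ * A rough sketch of how the roll is meant to be used by the inactive
+ * routines above (each pass invalidates some buffers, then commits and
+ * re-reserves before touching the next batch):
+ *
+ *	while (more blocks to invalidate) {
+ *		bp = xfs_trans_get_buf(*trans, ...);
+ *		xfs_trans_binval(*trans, bp);
+ *		if ((error = xfs_attr_rolltrans(trans, dp)))
+ *			return error;
+ *	}
+ *
+ * The inode stays locked across each roll because xfs_attr_rolltrans()
+ * re-joins it to the duplicated transaction via xfs_trans_ijoin() and
+ * xfs_trans_ihold() before returning.
+ */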
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr_leaf.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr_leaf.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr_leaf.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr_leaf.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ATTR_LEAF_H__
+#define __XFS_ATTR_LEAF_H__
+
+/*
+ * Attribute storage layout, internal structure, access macros, etc.
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.	The
+ * internal links in the Btree are logical block offsets into the file.
+ */
+
+struct attrlist;
+struct attrlist_cursor_kern;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*========================================================================
+ * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
+ *========================================================================*/
+
+/*
+ * This is the structure of the leaf nodes in the Btree.
+ *
+ * Struct leaf_entry's are packed from the top.	 Name/values grow from the
+ * bottom but are not packed.  The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such
+ * runs; smaller runs are dropped.  When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again.  If we
+ * still don't have enough space, then we have to split the block.  The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string.  The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard.  If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT versus USER attribute in
+ * the leaf_entry.  The namespaces are independent only because we also look
+ * at the root/user bit when we are looking for a matching attribute name.
+ *
+ * We also store a "incomplete" bit in the leaf_entry.	It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set.  We clear the
+ * bit when we have finished setting up the attribute.	We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE	3	/* how many freespace slots */
+
+typedef struct xfs_attr_leafblock {
+	struct xfs_attr_leaf_hdr {	/* constant-structure header block */
+		xfs_da_blkinfo_t info;	/* block type, links, etc. */
+		__uint16_t count;	/* count of active leaf_entry's */
+		__uint16_t usedbytes;	/* num bytes of names/values stored */
+		__uint16_t firstused;	/* first used byte in name area */
+		__uint8_t  holes;	/* != 0 if blk needs compaction */
+		__uint8_t  pad1;
+		struct xfs_attr_leaf_map {	  /* RLE map of free bytes */
+			__uint16_t base;	  /* base of free region */
+			__uint16_t size;	  /* length of free region */
+		} freemap[XFS_ATTR_LEAF_MAPSIZE]; /* N largest free regions */
+	} hdr;
+	struct xfs_attr_leaf_entry {	/* sorted on key, not name */
+		xfs_dahash_t hashval;	/* hash value of name */
+		__uint16_t nameidx;	/* index into buffer of name/value */
+		__uint8_t flags;	/* LOCAL, ROOT and INCOMPLETE flags */
+		__uint8_t pad2;		/* unused pad byte */
+	} entries[1];			/* variable sized array */
+	struct xfs_attr_leaf_name_local {
+		__uint16_t valuelen;	/* number of bytes in value */
+		__uint8_t namelen;	/* length of name bytes */
+		__uint8_t nameval[1];	/* name/value bytes */
+	} namelist;			/* grows from bottom of buf */
+	struct xfs_attr_leaf_name_remote {
+		xfs_dablk_t valueblk;	/* block number of value bytes */
+		__uint32_t valuelen;	/* number of bytes in value */
+		__uint8_t namelen;	/* length of name bytes */
+		__uint8_t name[1];	/* name bytes */
+	} valuelist;			/* grows from bottom of buf */
+} xfs_attr_leafblock_t;
+typedef struct xfs_attr_leaf_hdr xfs_attr_leaf_hdr_t;
+typedef struct xfs_attr_leaf_map xfs_attr_leaf_map_t;
+typedef struct xfs_attr_leaf_entry xfs_attr_leaf_entry_t;
+typedef struct xfs_attr_leaf_name_local xfs_attr_leaf_name_local_t;
+typedef struct xfs_attr_leaf_name_remote xfs_attr_leaf_name_remote_t;
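+
+/*
+ * In, say, a 4096-byte leaf block, the hdr sits at offset 0 with the
+ * entries[] array growing upward right behind it, while the name/value
+ * bytes are packed downward from the end of the block; hdr.firstused
+ * tracks the lowest name/value byte in use.
+ */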
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+ * on the system call; they are "or"ed together for various operations.
+ */
+#define XFS_ATTR_LOCAL_BIT	0	/* attr is stored locally */
+#define XFS_ATTR_ROOT_BIT	1	/* limit access to attr to userid 0 */
+#define XFS_ATTR_INCOMPLETE_BIT 7	/* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL		(1 << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT		(1 << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_INCOMPLETE	(1 << XFS_ATTR_INCOMPLETE_BIT)
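+
+/*
+ * For example, a local attribute that is still mid-creation carries
+ * flags == (XFS_ATTR_LOCAL | XFS_ATTR_INCOMPLETE) == 0x81 until the
+ * final transaction clears the INCOMPLETE bit.
+ */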
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define XFS_ATTR_LEAF_NAME_ALIGN	((uint)sizeof(xfs_dablk_t))
+
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_REMOTE)
+xfs_attr_leaf_name_remote_t *
+xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx);
+#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx)	\
+	xfs_attr_leaf_name_remote(leafp,idx)
+#else
+#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx)	/* remote name struct ptr */ \
+	((xfs_attr_leaf_name_remote_t *)		\
+	 &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_LOCAL)
+xfs_attr_leaf_name_local_t *
+xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx);
+#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx)	\
+	xfs_attr_leaf_name_local(leafp,idx)
+#else
+#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx)	/* local name struct ptr */ \
+	((xfs_attr_leaf_name_local_t *)		\
+	 &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME)
+char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx);
+#define XFS_ATTR_LEAF_NAME(leafp,idx)		xfs_attr_leaf_name(leafp,idx)
+#else
+#define XFS_ATTR_LEAF_NAME(leafp,idx)		/* generic name struct ptr */ \
+	(&((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
+#endif
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_REMOTE)
+int xfs_attr_leaf_entsize_remote(int nlen);
+#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen)	\
+	xfs_attr_leaf_entsize_remote(nlen)
+#else
+#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen)	/* space for remote struct */ \
+	(((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
+	  XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL)
+int xfs_attr_leaf_entsize_local(int nlen, int vlen);
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen)	\
+	xfs_attr_leaf_entsize_local(nlen,vlen)
+#else
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen)	/* space for local struct */ \
+	(((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + \
+	  XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX)
+int xfs_attr_leaf_entsize_local_max(int bsize);
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize)	\
+	xfs_attr_leaf_entsize_local_max(bsize)
+#else
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize)	/* max local struct size */ \
+	(((bsize) >> 1) + ((bsize) >> 2))
+#endif
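+
+/*
+ * Worked example (assuming a 4-byte xfs_dablk_t, i.e. 4-byte alignment):
+ * a "local" attribute with namelen 4 and valuelen 9 needs
+ * sizeof(xfs_attr_leaf_name_local_t) - 1 + 4 + 9 = 16 bytes, which is
+ * already aligned; with valuelen 10 the raw size is 17, which rounds
+ * up to 20.
+ */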
+
+
+/*========================================================================
+ * Structure used to pass context around among the routines.
+ *========================================================================*/
+
+typedef struct xfs_attr_list_context {
+	struct xfs_inode		*dp;	/* inode */
+	struct attrlist_cursor_kern	*cursor;/* position in list */
+	struct attrlist			*alist; /* output buffer */
+	int				count;	/* num used entries */
+	int				dupcnt; /* count dup hashvals seen */
+	int				bufsize;/* total buffer size */
+	int				firstu; /* first used byte in buffer */
+	int				flags;	/* from VOP call */
+	int				resynch;/* T/F: resynch with cursor */
+} xfs_attr_list_context_t;
+
+/*
+ * Used to keep a list of "remote value" extents when unlinking an inode.
+ */
+typedef struct xfs_attr_inactive_list {
+	xfs_dablk_t	valueblk;	/* block number of value bytes */
+	int		valuelen;	/* number of bytes in value */
+} xfs_attr_inactive_list_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when dirsize < XFS_LITINO(mp).
+ */
+int	xfs_attr_shortform_create(struct xfs_da_args *args);
+int	xfs_attr_shortform_add(struct xfs_da_args *add);
+int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
+int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
+int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int	xfs_attr_shortform_remove(struct xfs_da_args *remove);
+int	xfs_attr_shortform_list(struct xfs_attr_list_context *context);
+int	xfs_attr_shortform_replace(struct xfs_da_args *args);
+int	xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp);
+
+/*
+ * Internal routines when attribute fork size == XFS_LBSIZE(mp).
+ */
+int	xfs_attr_leaf_to_node(struct xfs_da_args *args);
+int	xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp,
+					  struct xfs_da_args *args);
+int	xfs_attr_leaf_clearflag(struct xfs_da_args *args);
+int	xfs_attr_leaf_setflag(struct xfs_da_args *args);
+int	xfs_attr_leaf_flipflags(xfs_da_args_t *args);
+
+/*
+ * Routines used for growing the Btree.
+ */
+int	xfs_attr_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block,
+				    struct xfs_dabuf **bpp);
+int	xfs_attr_leaf_split(struct xfs_da_state *state,
+				   struct xfs_da_state_blk *oldblk,
+				   struct xfs_da_state_blk *newblk);
+int	xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf,
+					struct xfs_da_args *args);
+int	xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args);
+int	xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer,
+				 struct xfs_da_args *args);
+int	xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer,
+				    struct xfs_da_args *args);
+int	xfs_attr_leaf_list_int(struct xfs_dabuf *bp,
+				      struct xfs_attr_list_context *context);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int	xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void	xfs_attr_leaf_unbalance(struct xfs_da_state *state,
+				       struct xfs_da_state_blk *drop_blk,
+				       struct xfs_da_state_blk *save_blk);
+int	xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
+int	xfs_attr_node_inactive(struct xfs_trans **trans, struct xfs_inode *dp,
+				      struct xfs_dabuf *bp, int level);
+int	xfs_attr_leaf_inactive(struct xfs_trans **trans, struct xfs_inode *dp,
+				      struct xfs_dabuf *bp);
+int	xfs_attr_leaf_freextent(struct xfs_trans **trans, struct xfs_inode *dp,
+				       xfs_dablk_t blkno, int blkcnt);
+
+/*
+ * Utility routines.
+ */
+xfs_dahash_t	xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count);
+int	xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
+				   struct xfs_dabuf *leaf2_bp);
+int	xfs_attr_leaf_newentsize(struct xfs_da_args *args, int blocksize,
+					int *local);
+int	xfs_attr_leaf_entsize(struct xfs_attr_leafblock *leaf, int index);
+int	xfs_attr_put_listent(struct xfs_attr_list_context *context,
+			     int ns, char *name, int namelen, int valuelen);
+int	xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp);
+
+#endif	/* __XFS_ATTR_LEAF_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr_sf.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr_sf.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_attr_sf.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_attr_sf.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ATTR_SF_H__
+#define __XFS_ATTR_SF_H__
+
+/*
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as
+ * to fit into the literal area of the inode.
+ */
+
+struct xfs_inode;
+
+/*
+ * Entries are packed toward the top as tightly as possible.
+ */
+typedef struct xfs_attr_shortform {
+	struct xfs_attr_sf_hdr {	/* constant-structure header block */
+		__uint16_t totsize;	/* total bytes in shortform list */
+		__uint8_t count;	/* count of active entries */
+	} hdr;
+	struct xfs_attr_sf_entry {
+		__uint8_t namelen;	/* actual length of name (no NULL) */
+		__uint8_t valuelen;	/* actual length of value (no NULL) */
+		__uint8_t flags;	/* flags bits (see xfs_attr_leaf.h) */
+		__uint8_t nameval[1];	/* name & value bytes concatenated */
+	} list[1];			/* variable sized array */
+} xfs_attr_shortform_t;
+typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
+typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
+
+/*
+ * We generate this, then sort it; attr_list() must return things in hash order.
+ */
+typedef struct xfs_attr_sf_sort {
+	__uint8_t	entno;		/* entry number in original list */
+	__uint8_t	namelen;	/* length of name value (no null) */
+	__uint8_t	valuelen;	/* length of value */
+	__uint8_t	flags;		/* flags bits (see xfs_attr_leaf.h) */
+	xfs_dahash_t	hash;		/* this entry's hash value */
+	char		*name;		/* name value, pointer into buffer */
+} xfs_attr_sf_sort_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE_BYNAME)
+int xfs_attr_sf_entsize_byname(int nlen, int vlen);
+#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	\
+	xfs_attr_sf_entsize_byname(nlen,vlen)
+#else
+#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	/* space name/value uses */ \
+	((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))
+#endif
+#define XFS_ATTR_SF_ENTSIZE_MAX			/* max space for name&value */ \
+	((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE)
+int xfs_attr_sf_entsize(xfs_attr_sf_entry_t *sfep);
+#define XFS_ATTR_SF_ENTSIZE(sfep)	xfs_attr_sf_entsize(sfep)
+#else
+#define XFS_ATTR_SF_ENTSIZE(sfep)		/* space an entry uses */ \
+	((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
+#endif
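+
+/*
+ * Shortform entries carry no padding, so, for example, a 4-byte name
+ * with a 6-byte value occupies sizeof(xfs_attr_sf_entry_t) - 1 + 4 + 6
+ * = 13 bytes (the -1 accounts for nameval[1] in the struct).
+ */
+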
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_NEXTENTRY)
+xfs_attr_sf_entry_t *xfs_attr_sf_nextentry(xfs_attr_sf_entry_t *sfep);
+#define XFS_ATTR_SF_NEXTENTRY(sfep)	xfs_attr_sf_nextentry(sfep)
+#else
+#define XFS_ATTR_SF_NEXTENTRY(sfep)		/* next entry in struct */ \
+	((xfs_attr_sf_entry_t *) \
+		((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_TOTSIZE)
+int xfs_attr_sf_totsize(struct xfs_inode *dp);
+#define XFS_ATTR_SF_TOTSIZE(dp)		xfs_attr_sf_totsize(dp)
+#else
+#define XFS_ATTR_SF_TOTSIZE(dp)			/* total space in use */ \
+	(INT_GET(((xfs_attr_shortform_t *)((dp)->i_afp->if_u1.if_data))->hdr.totsize, ARCH_CONVERT))
+#endif
+
+#ifdef XFS_ALL_TRACE
+#define XFS_ATTR_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef	XFS_ATTR_TRACE
+#endif
+
+/*
+ * Kernel tracing support for attribute lists
+ */
+struct xfs_attr_list_context;
+struct xfs_da_intnode;
+struct xfs_da_node_entry;
+struct xfs_attr_leafblock;
+
+#define XFS_ATTR_TRACE_SIZE	4096	/* size of global trace buffer */
+
+/*
+ * Trace record types.
+ */
+#define XFS_ATTR_KTRACE_L_C	1	/* context */
+#define XFS_ATTR_KTRACE_L_CN	2	/* context, node */
+#define XFS_ATTR_KTRACE_L_CB	3	/* context, btree */
+#define XFS_ATTR_KTRACE_L_CL	4	/* context, leaf */
+
+#if defined(XFS_ATTR_TRACE)
+
+void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context);
+void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
+			      struct xfs_da_intnode *node);
+void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
+			      struct xfs_da_node_entry *btree);
+void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
+			      struct xfs_attr_leafblock *leaf);
+void xfs_attr_trace_enter(int type, char *where,
+			     __psunsigned_t a2, __psunsigned_t a3,
+			     __psunsigned_t a4, __psunsigned_t a5,
+			     __psunsigned_t a6, __psunsigned_t a7,
+			     __psunsigned_t a8, __psunsigned_t a9,
+			     __psunsigned_t a10, __psunsigned_t a11,
+			     __psunsigned_t a12, __psunsigned_t a13,
+			     __psunsigned_t a14, __psunsigned_t a15);
+#else
+#define xfs_attr_trace_l_c(w,c)
+#define xfs_attr_trace_l_cn(w,c,n)
+#define xfs_attr_trace_l_cb(w,c,b)
+#define xfs_attr_trace_l_cl(w,c,l)
+#endif /* XFS_ATTR_TRACE */
+
+#endif	/* __XFS_ATTR_SF_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bit.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bit.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bit.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bit.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * XFS bit manipulation routines, used in non-realtime code.
+ */
+
+#include <xfs.h>
+
+#ifndef HAVE_ARCH_HIGHBIT
+/*
+ * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
+ */
+const char xfs_highbit[256] = {
+       -1, 0, 1, 1, 2, 2, 2, 2,			/* 00 .. 07 */
+	3, 3, 3, 3, 3, 3, 3, 3,			/* 08 .. 0f */
+	4, 4, 4, 4, 4, 4, 4, 4,			/* 10 .. 17 */
+	4, 4, 4, 4, 4, 4, 4, 4,			/* 18 .. 1f */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 20 .. 27 */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 28 .. 2f */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 30 .. 37 */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 38 .. 3f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 40 .. 47 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 48 .. 4f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 50 .. 57 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 58 .. 5f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 60 .. 67 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 68 .. 6f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 70 .. 77 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 78 .. 7f */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 80 .. 87 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 88 .. 8f */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 90 .. 97 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 98 .. 9f */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* a0 .. a7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* a8 .. af */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* b0 .. b7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* b8 .. bf */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* c0 .. c7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* c8 .. cf */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* d0 .. d7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* d8 .. df */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* e0 .. e7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* e8 .. ef */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* f0 .. f7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* f8 .. ff */
+};
+#endif
+
+/*
+ * Count of bits set in byte, 0..8.
+ */
+static const char xfs_countbit[256] = {
+	0, 1, 1, 2, 1, 2, 2, 3,			/* 00 .. 07 */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 08 .. 0f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 10 .. 17 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 18 .. 1f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 20 .. 27 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 28 .. 2f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 30 .. 37 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 38 .. 3f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 40 .. 47 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 48 .. 4f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 50 .. 57 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 58 .. 5f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 60 .. 67 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 68 .. 6f */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 70 .. 77 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* 78 .. 7f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 80 .. 87 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 88 .. 8f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 90 .. 97 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 98 .. 9f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* a0 .. a7 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* a8 .. af */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* b0 .. b7 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* b8 .. bf */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* c0 .. c7 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* c8 .. cf */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* d0 .. d7 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* d8 .. df */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* e0 .. e7 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* e8 .. ef */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* f0 .. f7 */
+	5, 6, 6, 7, 6, 7, 7, 8,			/* f8 .. ff */
+};
+
+/*
+ * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
+ */
+inline int
+xfs_highbit32(
+	__uint32_t	v)
+{
+#ifdef HAVE_ARCH_HIGHBIT
+	return highbit32(v);
+#else
+	int		i;
+
+	if (v & 0xffff0000)
+		if (v & 0xff000000)
+			i = 24;
+		else
+			i = 16;
+	else if (v & 0x0000ffff)
+		if (v & 0x0000ff00)
+			i = 8;
+		else
+			i = 0;
+	else
+		return -1;
+	return i + xfs_highbit[(v >> i) & 0xff];
+#endif
+}
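+
+/*
+ * For example, xfs_highbit32(0x00012000): bits are set in the upper
+ * halfword but not the top byte, so i = 16, and xfs_highbit[0x01] = 0
+ * gives 16 + 0 = 16.
+ */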
+
+/*
+ * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
+ */
+int
+xfs_lowbit64(
+	__uint64_t	v)
+{
+	int n;
+	n = ffs((unsigned)v);
+	if (n == 0) {
+		n = ffs(v >> 32);
+		if (n >= 0)
+			n+=32;
+	}
+	return n-1;
+}
+
+/*
+ * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
+ */
+int
+xfs_highbit64(
+	__uint64_t	v)
+{
+	__uint32_t h = v >> 32;
+	if (h)
+		return xfs_highbit32(h) + 32;
+	return xfs_highbit32((__u32)v);
+}
+
+
+/*
+ * Count the number of bits set in the bitmap starting with bit
+ * start_bit.  Size is the size of the bitmap in words.
+ *
+ * Do the counting by mapping a byte value to the number of set
+ * bits for that value using the xfs_countbit array, i.e.
+ * xfs_countbit[0] == 0, xfs_countbit[1] == 1, xfs_countbit[2] == 1,
+ * xfs_countbit[3] == 2, etc.
+ */
+int
+xfs_count_bits(uint *map, uint size, uint start_bit)
+{
+	register int	bits;
+	register unsigned char	*bytep;
+	register unsigned char	*end_map;
+	int		byte_bit;
+
+	bits = 0;
+	end_map = (unsigned char *)(map + size);
+	bytep = (unsigned char *)map + (start_bit >> 3);  /* byte holding start_bit */
+	byte_bit = start_bit & 0x7;
+
+	/*
+	 * If the caller fell off the end of the map, return 0.
+	 */
+	if (bytep >= end_map) {
+		return (0);
+	}
+
+	/*
+	 * If start_bit is not byte aligned, then process the
+	 * first byte separately.
+	 */
+	if (byte_bit != 0) {
+		/*
+		 * Shift off the bits we don't want to look at,
+		 * before indexing into xfs_countbit.
+		 */
+		bits += xfs_countbit[(*bytep >> byte_bit)];
+		bytep++;
+	}
+
+	/*
+	 * Count the bits in each byte until the end of the bitmap.
+	 */
+	while (bytep < end_map) {
+		bits += xfs_countbit[*bytep];
+		bytep++;
+	}
+
+	return (bits);
+}
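+
+/*
+ * For instance, counting from start_bit = 4 when the first byte of the
+ * map is 0xb4: the partial first byte contributes
+ * xfs_countbit[0xb4 >> 4] = xfs_countbit[0x0b] = 3 bits.
+ */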
+
+/*
+ * Count the number of contiguous bits set in the bitmap starting with bit
+ * start_bit.  Size is the size of the bitmap in words.
+ */
+int
+xfs_contig_bits(uint *map, uint size, uint start_bit)
+{
+#if BITS_PER_LONG == 32
+	return find_next_zero_bit(map,size*sizeof(uint)*8,start_bit) - start_bit;
+#else
+	/*
+	 * The first argument to find_next_zero_bit needs to be aligned,
+	 * but this is coming from the xfs_buf_log_format_t on-disk
+	 * struct, which can't be padded or otherwise modified w/o breaking
+	 * on-disk compatibility... so create a temporary, aligned
+	 * variable, copy over the bitmap, and send that to find_next_zero_bit.
+	 * This only happens in recovery, so it's ugly but not too bad.
+	 */
+	void * addr;
+	int bit;
+	size_t bitmap_size = size * sizeof(uint);
+
+	addr = (void *)kmem_alloc(bitmap_size, KM_SLEEP);
+	memcpy(addr, map, size * sizeof(uint));
+
+	bit = find_next_zero_bit(addr,size*sizeof(uint)*8,start_bit) - start_bit;
+
+	kmem_free(addr, bitmap_size);
+
+	return bit;
+#endif
+}
+
+/*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there.	 It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ *
+ * Size is the number of words, not bytes, in the bitmap.
+ */
+int xfs_next_bit(uint *map, uint size, uint start_bit)
+{
+	uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+	uint result = start_bit & ~(NBWORD - 1);
+	uint tmp;
+
+	size <<= BIT_TO_WORD_SHIFT;
+
+	if (start_bit >= size)
+		return -1;
+	size -= result;
+	start_bit &= (NBWORD - 1);
+	if (start_bit) {
+		tmp = *p++;
+		/* mask off the bits below start_bit */
+		tmp &= (~0U << start_bit);
+		if (size < NBWORD)
+			goto found_first;
+		if (tmp != 0U)
+			goto found_middle;
+		size -= NBWORD;
+		result += NBWORD;
+	}
+	while (size >= NBWORD) {
+		if ((tmp = *p++) != 0U)
+			goto found_middle;
+		result += NBWORD;
+		size -= NBWORD;
+	}
+	if (!size)
+		return -1;
+	tmp = *p;
+found_first:
+found_middle:
+	return result + ffs(tmp) - 1;
+}
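+
+/*
+ * Example (assuming 32-bit words, NBWORD == 32): with a two-word map
+ * { 0x0, 0x10 } and start_bit = 3, the first word contributes nothing,
+ * and the set bit in the second word (ffs(0x10) == 5) yields
+ * 32 + 5 - 1 = bit 36.
+ */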
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bit.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bit.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bit.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bit.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BIT_H__
+#define __XFS_BIT_H__
+
+/*
+ * XFS bit manipulation routines.
+ */
+
+/*
+ * masks with n high/low bits set, 32-bit values & 64-bit values
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32HI)
+__uint32_t xfs_mask32hi(int n);
+#define XFS_MASK32HI(n)		xfs_mask32hi(n)
+#else
+#define XFS_MASK32HI(n)		((__uint32_t)-1 << (32 - (n)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64HI)
+__uint64_t xfs_mask64hi(int n);
+#define XFS_MASK64HI(n)		xfs_mask64hi(n)
+#else
+#define XFS_MASK64HI(n)		((__uint64_t)-1 << (64 - (n)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32LO)
+__uint32_t xfs_mask32lo(int n);
+#define XFS_MASK32LO(n)		xfs_mask32lo(n)
+#else
+#define XFS_MASK32LO(n)		(((__uint32_t)1 << (n)) - 1)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64LO)
+__uint64_t xfs_mask64lo(int n);
+#define XFS_MASK64LO(n)		xfs_mask64lo(n)
+#else
+#define XFS_MASK64LO(n)		(((__uint64_t)1 << (n)) - 1)
+#endif
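+
+/*
+ * For example, XFS_MASK32LO(4) == 0x0000000f and
+ * XFS_MASK32HI(4) == 0xf0000000.
+ */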
+
+/* Get high bit set out of 32-bit argument, -1 if none set */
+extern int xfs_highbit32(__uint32_t v);
+
+/* Get low bit set out of 64-bit argument, -1 if none set */
+extern int xfs_lowbit64(__uint64_t v);
+
+/* Get high bit set out of 64-bit argument, -1 if none set */
+extern int xfs_highbit64(__uint64_t);
+
+/* Count set bits in map starting with start_bit */
+extern int xfs_count_bits(uint *map, uint size, uint start_bit);
+
+/* Count continuous one bits in map starting with start_bit */
+extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
+
+/* Find next set bit in map */
+extern int xfs_next_bit(uint *map, uint size, uint start_bit);
+
+#endif	/* __XFS_BIT_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bmap.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bmap.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bmap.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bmap.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,6323 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+#ifdef DEBUG
+ktrace_t	*xfs_bmap_trace_buf;
+#endif
+
+#ifdef XFSDEBUG
+STATIC void
+xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork);
+#endif
+
+kmem_zone_t		*xfs_bmap_free_item_zone;
+
+/*
+ * Prototypes for internal bmap routines.
+ */
+
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_extents(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags);	/* inode logging flags */
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_local(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags);	/* inode logging flags */
+
+/*
+ * Called by xfs_bmapi to update extent list structure and the btree
+ * after allocating space (or doing a delayed allocation).
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	xfs_fsblock_t		*first, /* pointer to firstblock variable */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork, /* data or attr fork */
+	int			rsvd);	/* OK to allocate reserved blocks */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a delayed
+ * allocation to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_delay_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	xfs_filblks_t		*dnew,	/* new delayed-alloc indirect blocks */
+	xfs_fsblock_t		*first, /* pointer to firstblock variable */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	int			*logflagsp, /* inode logging flags */
+	int			rsvd);	/* OK to allocate reserved blocks */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a delayed allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_hole_delay(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp,/* inode logging flags */
+	int			rsvd);	/* OK to allocate reserved blocks */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_hole_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork); /* data or attr fork */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting an unwritten
+ * allocation to a real allocation or vice versa.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_unwritten_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp); /* inode logging flags */
+
+/*
+ * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
+ * It figures out where to ask the underlying allocator to put the new extent.
+ */
+STATIC int				/* error */
+xfs_bmap_alloc(
+	xfs_bmalloca_t		*ap);	/* bmap alloc argument struct */
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the extent list is already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int				/* error */
+xfs_bmap_btree_to_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork,  /* data or attr fork */
+	int			async);	    /* xaction can be async */
+
+#ifdef XFSDEBUG
+/*
+ * Check that the extents list for the inode ip is in the right order.
+ */
+STATIC void
+xfs_bmap_check_extents(
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			whichfork);	/* data or attr fork */
+#else
+#define xfs_bmap_check_extents(ip,w)
+#endif
+
+/*
+ * Called by xfs_bmapi to update extent list structure and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int				/* error */
+xfs_bmap_del_extent(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_trans_t		*tp,	/* current trans pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			iflags, /* input flags (meta-data or not) */
+	int			*logflagsp,/* inode logging flags */
+	int			whichfork, /* data or attr fork */
+	int			rsvd);	 /* OK to allocate reserved blocks */
+
+/*
+ * Remove the entry "free" from the free item list.  Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+STATIC void
+xfs_bmap_del_free(
+	xfs_bmap_free_t		*flist, /* free item list header */
+	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
+	xfs_bmap_free_item_t	*free); /* list item to be freed */
+
+/*
+ * Remove count entries from the extents array for inode "ip", starting
+ * at index "idx".  Copies the remaining items down over the deleted ones,
+ * and gives back the excess memory.
+ */
+STATIC void
+xfs_bmap_delete_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* starting delete index */
+	xfs_extnum_t	count,		/* count of items to delete */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int					/* error */
+xfs_bmap_extents_to_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
+	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
+	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
+	int			wasdel,		/* converting a delayed alloc */
+	int			*logflagsp,	/* inode logging flags */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Insert new item(s) in the extent list for inode "ip".
+ * Count new items are inserted at offset idx.
+ */
+STATIC void
+xfs_bmap_insert_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* starting index of new items */
+	xfs_extnum_t	count,		/* number of inserted items */
+	xfs_bmbt_irec_t *new,		/* items to insert */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Convert a local file to an extents file.
+ * This code is sort of bogus, since the file data needs to get
+ * logged so it won't be lost.	The bmap-level manipulations are ok, though.
+ */
+STATIC int				/* error */
+xfs_bmap_local_to_extents(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
+	xfs_extlen_t	total,		/* total blocks needed by transaction */
+	int		*logflagsp,	/* inode logging flags */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_t *			/* pointer to found extent entry */
+xfs_bmap_search_extents(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fileoff_t	bno,		/* block number searched for */
+	int		whichfork,	/* data or attr fork */
+	int		*eofp,		/* out: end of file found */
+	xfs_extnum_t	*lastxp,	/* out: last extent index */
+	xfs_bmbt_irec_t *gotp,		/* out: extent entry found */
+	xfs_bmbt_irec_t *prevp);	/* out: previous extent entry found */
+
+#ifdef XFS_BMAP_TRACE
+/*
+ * Add a bmap trace buffer entry.  Base routine for the others.
+ */
+STATIC void
+xfs_bmap_trace_addentry(
+	int		opcode,		/* operation */
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(ies) */
+	xfs_extnum_t	cnt,		/* count of entries, 1 or 2 */
+	xfs_bmbt_rec_t	*r1,		/* first record */
+	xfs_bmbt_rec_t	*r2,		/* second record or null */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist.
+ */
+STATIC void
+xfs_bmap_trace_delete(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(entries) deleted */
+	xfs_extnum_t	cnt,		/* count of entries deleted, 1 or 2 */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or
+ * reading in the extents list from the disk (in the btree).
+ */
+STATIC void
+xfs_bmap_trace_insert(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(entries) inserted */
+	xfs_extnum_t	cnt,		/* count of entries inserted, 1 or 2 */
+	xfs_bmbt_irec_t *r1,		/* inserted record 1 */
+	xfs_bmbt_irec_t *r2,		/* inserted record 2 or null */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Add bmap trace entry after updating an extent list entry in place.
+ */
+STATIC void
+xfs_bmap_trace_post_update(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry updated */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Add bmap trace entry prior to updating an extent list entry in place.
+ */
+STATIC void
+xfs_bmap_trace_pre_update(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry to be updated */
+	int		whichfork);	/* data or attr fork */
+
+#else
+#define xfs_bmap_trace_delete(f,d,ip,i,c,w)
+#define xfs_bmap_trace_insert(f,d,ip,i,c,r1,r2,w)
+#define xfs_bmap_trace_post_update(f,d,ip,i,w)
+#define xfs_bmap_trace_pre_update(f,d,ip,i,w)
+#endif	/* XFS_BMAP_TRACE */
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_filblks_t		len);	/* delayed extent length */
+
+#ifdef DEBUG
+/*
+ * Perform various validation checks on the values being returned
+ * from xfs_bmapi().
+ */
+STATIC void
+xfs_bmap_validate_ret(
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	xfs_bmbt_irec_t		*mval,
+	int			nmap,
+	int			ret_nmap);
+#else
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#endif /* DEBUG */
+
+#if defined(DEBUG) && defined(XFS_RW_TRACE)
+STATIC void
+xfs_bunmap_trace(
+	xfs_inode_t		*ip,
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	inst_t			*ra);
+#else
+#define xfs_bunmap_trace(ip, bno, len, flags, ra)
+#endif	/* DEBUG && XFS_RW_TRACE */
+
+STATIC int
+xfs_bmap_count_tree(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_fsblock_t	blockno,
+	int		levelin,
+	int		*count);
+
+STATIC int
+xfs_bmap_count_leaves(
+	xfs_bmbt_rec_t		*frp,
+	int			numrecs,
+	int			*count);
+
+/*
+ * Bmap internal routines.
+ */
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle btree format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_btree_cur_t		*cur;		/* btree cursor */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* file system mount struct */
+	int			stat;		/* newroot status */
+
+	mp = ip->i_mount;
+	if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
+		*flags |= XFS_ILOG_DBROOT;
+	else {
+		cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
+			XFS_DATA_FORK);
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.firstblock = *firstblock;
+		if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
+			goto error0;
+		ASSERT(stat == 1);	/* must be at least one entry */
+		if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
+			goto error0;
+		if (stat == 0) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+			return XFS_ERROR(ENOSPC);
+		}
+		*firstblock = cur->bc_private.b.firstblock;
+		cur->bc_private.b.allocated = 0;
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	}
+	return 0;
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_extents(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	int			error;		/* error return value */
+
+	if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
+		return 0;
+	cur = NULL;
+	error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
+		flags, XFS_DATA_FORK);
+	if (cur) {
+		cur->bc_private.b.allocated = 0;
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_local(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_da_args_t		dargs;		/* args for dir/attr code */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* mount structure pointer */
+
+	if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
+		return 0;
+	if ((ip->i_d.di_mode & IFMT) == IFDIR) {
+		mp = ip->i_mount;
+		bzero(&dargs, sizeof(dargs));
+		dargs.dp = ip;
+		dargs.firstblock = firstblock;
+		dargs.flist = flist;
+		dargs.total = mp->m_dirblkfsbs;
+		dargs.whichfork = XFS_DATA_FORK;
+		dargs.trans = tp;
+		error = XFS_DIR_SHORTFORM_TO_SINGLE(mp, &dargs);
+	} else
+		error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
+			XFS_DATA_FORK);
+	return error;
+}
+
+/*
+ * Called by xfs_bmapi to update extent list structure and the btree
+ * after allocating space (or doing a delayed allocation).
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	xfs_fsblock_t		*first, /* pointer to firstblock variable */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork, /* data or attr fork */
+	int			rsvd)	/* OK to use reserved data blocks */
+{
+	xfs_btree_cur_t		*cur;	/* btree cursor or null */
+	xfs_filblks_t		da_new; /* new count del alloc blocks used */
+	xfs_filblks_t		da_old; /* old count del alloc blocks used */
+	int			error;	/* error return value */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_add_extent";
+#endif
+	xfs_ifork_t		*ifp;	/* inode fork ptr */
+	int			logflags; /* returned value */
+	xfs_extnum_t		nextents; /* number of extents in file now */
+
+	XFS_STATS_INC(xfsstats.xs_add_exlist);
+	cur = *curp;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(idx <= nextents);
+	da_old = da_new = 0;
+	error = 0;
+	/*
+	 * This is the first extent added to a new/empty file.
+	 * Special case this one, so other routines get to assume there are
+	 * already extents in the list.
+	 */
+	if (nextents == 0) {
+		xfs_bmap_trace_insert(fname, "insert empty", ip, 0, 1, new,
+			NULL, whichfork);
+		xfs_bmap_insert_exlist(ip, 0, 1, new, whichfork);
+		ASSERT(cur == NULL);
+		ifp->if_lastex = 0;
+		if (!ISNULLSTARTBLOCK(new->br_startblock)) {
+			XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+			logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+		} else
+			logflags = 0;
+	}
+	/*
+	 * Any kind of new delayed allocation goes here.
+	 */
+	else if (ISNULLSTARTBLOCK(new->br_startblock)) {
+		if (cur)
+			ASSERT((cur->bc_private.b.flags &
+				XFS_BTCUR_BPRV_WASDEL) == 0);
+		if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new,
+				&logflags, rsvd)))
+			goto done;
+	}
+	/*
+	 * Real allocation off the end of the file.
+	 */
+	else if (idx == nextents) {
+		if (cur)
+			ASSERT((cur->bc_private.b.flags &
+				XFS_BTCUR_BPRV_WASDEL) == 0);
+		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
+				&logflags, whichfork)))
+			goto done;
+	} else {
+		xfs_bmbt_irec_t prev;	/* old extent at offset idx */
+
+		/*
+		 * Get the record referred to by idx.
+		 */
+		xfs_bmbt_get_all(&ifp->if_u1.if_extents[idx], &prev);
+		/*
+		 * If it's a real allocation record, and the new allocation ends
+		 * after the start of the referred-to record, then we're filling
+		 * in a delayed or unwritten allocation with a real one, or
+		 * converting real back to unwritten.
+		 */
+		if (!ISNULLSTARTBLOCK(new->br_startblock) &&
+		    new->br_startoff + new->br_blockcount > prev.br_startoff) {
+			if (prev.br_state != XFS_EXT_UNWRITTEN &&
+			    ISNULLSTARTBLOCK(prev.br_startblock)) {
+				da_old = STARTBLOCKVAL(prev.br_startblock);
+				if (cur)
+					ASSERT(cur->bc_private.b.flags &
+						XFS_BTCUR_BPRV_WASDEL);
+				if ((error = xfs_bmap_add_extent_delay_real(ip,
+					idx, &cur, new, &da_new, first, flist,
+					&logflags, rsvd)))
+					goto done;
+			} else if (new->br_state == XFS_EXT_NORM) {
+				ASSERT(new->br_state == XFS_EXT_NORM);
+				if ((error = xfs_bmap_add_extent_unwritten_real(
+					ip, idx, &cur, new, &logflags)))
+					goto done;
+			} else {
+				ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
+				if ((error = xfs_bmap_add_extent_unwritten_real(
+					ip, idx, &cur, new, &logflags)))
+					goto done;
+			}
+			ASSERT(*curp == cur || *curp == NULL);
+		}
+		/*
+		 * Otherwise we're filling in a hole with an allocation.
+		 */
+		else {
+			if (cur)
+				ASSERT((cur->bc_private.b.flags &
+					XFS_BTCUR_BPRV_WASDEL) == 0);
+			if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
+					new, &logflags, whichfork)))
+				goto done;
+		}
+	}
+
+	ASSERT(*curp == cur || *curp == NULL);
+	/*
+	 * Convert to a btree if necessary.
+	 */
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+		int	tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first,
+			flist, &cur, da_old > 0, &tmp_logflags, whichfork);
+		logflags |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+	/*
+	 * Adjust for changes in reserved delayed indirect blocks.
+	 * Nothing to do for disk quotas here: da_old was reserved against
+	 * the old delayed extent's worst-case btree growth, so whatever
+	 * the new extents and any btree conversion didn't consume goes
+	 * back to the free-block count.
+	 */
+	if (da_old || da_new) {
+		xfs_filblks_t	nblks;
+
+		nblks = da_new;
+		if (cur)
+			nblks += cur->bc_private.b.allocated;
+		ASSERT(nblks <= da_old);
+		if (nblks < da_old)
+			xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
+				(int)(da_old - nblks), rsvd);
+	}
+	/*
+	 * Clear out the allocated field, done with it now in any case.
+	 */
+	if (cur) {
+		cur->bc_private.b.allocated = 0;
+		*curp = cur;
+	}
+done:
+#ifdef XFSDEBUG
+	if (!error)
+		xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
+#endif
+	*logflagsp = logflags;
+	return error;
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a delayed
+ * allocation to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_delay_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	xfs_filblks_t		*dnew,	/* new delayed-alloc indirect blocks */
+	xfs_fsblock_t		*first, /* pointer to firstblock variable */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	int			*logflagsp, /* inode logging flags */
+	int			rsvd)	/* OK to use reserved data block allocation */
+{
+	xfs_bmbt_rec_t		*base;	/* base of extent entry list */
+	xfs_btree_cur_t		*cur;	/* btree cursor */
+	int			diff;	/* temp value */
+	xfs_bmbt_rec_t		*ep;	/* extent entry for idx */
+	int			error;	/* error return value */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_add_extent_delay_real";
+#endif
+	int			i;	/* temp state */
+	xfs_fileoff_t		new_endoff;	/* end offset of new entry */
+	xfs_bmbt_irec_t		r[3];	/* neighbor extent entries */
+					/* left is 0, right is 1, prev is 2 */
+	int			rval=0; /* return value (logging flags) */
+	int			state = 0;/* state bits, accessed thru macros */
+	xfs_filblks_t		temp;	/* value for dnew calculations */
+	xfs_filblks_t		temp2;	/* value for dnew calculations */
+	int			tmp_rval;	/* partial logging flags */
+	enum {				/* bit number definitions for state */
+		LEFT_CONTIG,	RIGHT_CONTIG,
+		LEFT_FILLING,	RIGHT_FILLING,
+		LEFT_DELAY,	RIGHT_DELAY,
+		LEFT_VALID,	RIGHT_VALID
+	};
+
+#define LEFT		r[0]
+#define RIGHT		r[1]
+#define PREV		r[2]
+#define MASK(b)		(1 << (b))
+#define MASK2(a,b)	(MASK(a) | MASK(b))
+#define MASK3(a,b,c)	(MASK2(a,b) | MASK(c))
+#define MASK4(a,b,c,d)	(MASK3(a,b,c) | MASK(d))
+#define STATE_SET(b,v)	((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
+#define STATE_TEST(b)	(state & MASK(b))
+#define STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
+				       ((state &= ~MASK(b)), 0))
+#define SWITCH_STATE		\
+	(state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
+
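+	/*
+	 * A sketch of the encoding (illustrative, not from a trace):
+	 * the FILLING bits say whether the new real extent reaches the
+	 * left and/or right edge of the delayed extent PREV it replaces;
+	 * the CONTIG bits say whether the result can merge with the
+	 * neighboring record on that side.  All four bits set means PREV
+	 * is consumed entirely and LEFT, PREV and RIGHT collapse into a
+	 * single record -- the first case of the switch below.
+	 */
+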
+	/*
+	 * Set up a bunch of variables to make the tests simpler.
+	 */
+	cur = *curp;
+	base = ip->i_df.if_u1.if_extents;
+	ep = &base[idx];
+	xfs_bmbt_get_all(ep, &PREV);
+	new_endoff = new->br_startoff + new->br_blockcount;
+	ASSERT(PREV.br_startoff <= new->br_startoff);
+	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+	/*
+	 * Set flags determining what part of the previous delayed allocation
+	 * extent is being replaced by a real allocation.
+	 */
+	STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
+	STATE_SET(RIGHT_FILLING,
+		PREV.br_startoff + PREV.br_blockcount == new_endoff);
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 */
+	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+		xfs_bmbt_get_all(ep - 1, &LEFT);
+		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+	}
+	STATE_SET(LEFT_CONTIG,
+		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
+		LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+		LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+		LEFT.br_state == new->br_state &&
+		LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+	/*
+	 * Check and set flags if this segment has a right neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 * Also check for all-three-contiguous being too large.
+	 */
+	if (STATE_SET_TEST(RIGHT_VALID,
+			idx <
+			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
+		xfs_bmbt_get_all(ep + 1, &RIGHT);
+		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+	}
+	STATE_SET(RIGHT_CONTIG,
+		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
+		new_endoff == RIGHT.br_startoff &&
+		new->br_startblock + new->br_blockcount ==
+		    RIGHT.br_startblock &&
+		new->br_state == RIGHT.br_state &&
+		new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+		((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
+		  MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
+		 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+		     <= MAXEXTLEN));
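+	/*
+	 * The last clause above guards the triple merge: when the new
+	 * extent is left- and right-filling and left-contiguous as well,
+	 * the first switch case combines LEFT, PREV and RIGHT, so that
+	 * combined length must also fit in MAXEXTLEN.
+	 */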
+	error = 0;
+	/*
+	 * Switch out based on the FILLING and CONTIG state bits.
+	 */
+	switch (SWITCH_STATE) {
+
+	case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The left and right neighbors are both contiguous with new.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + PREV.br_blockcount +
+			RIGHT.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		ip->i_d.di_nextents--;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					PREV.br_blockcount +
+					RIGHT.br_blockcount, LEFT.br_state)))
+				goto done;
+		}
+		*dnew = 0;
+		break;
+
+	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The left neighbor is contiguous, the right is not.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + PREV.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+					LEFT.br_startblock, LEFT.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					PREV.br_blockcount, LEFT.br_state)))
+				goto done;
+		}
+		*dnew = 0;
+		break;
+
+	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The right neighbor is contiguous, the left is not.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_startblock(ep, new->br_startblock);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount + RIGHT.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK);
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+					new->br_startblock,
+					PREV.br_blockcount +
+					RIGHT.br_blockcount, PREV.br_state)))
+				goto done;
+		}
+		*dnew = 0;
+		break;
+
+	case MASK2(LEFT_FILLING, RIGHT_FILLING):
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * Neither the left nor right neighbors are contiguous with
+		 * the new one.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_startblock(ep, new->br_startblock);
+		xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		*dnew = 0;
+		break;
+
+	case MASK2(LEFT_FILLING, LEFT_CONTIG):
+		/*
+		 * Filling in the first part of a previous delayed allocation.
+		 * The left neighbor is contiguous.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + new->br_blockcount);
+		xfs_bmbt_set_startoff(ep,
+			PREV.br_startoff + new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		temp = PREV.br_blockcount - new->br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep, temp);
+		ip->i_df.if_lastex = idx - 1;
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+					LEFT.br_startblock, LEFT.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					new->br_blockcount,
+					LEFT.br_state)))
+				goto done;
+		}
+		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+			STARTBLOCKVAL(PREV.br_startblock));
+		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
+			XFS_DATA_FORK);
+		*dnew = temp;
+		break;
+
+	case MASK(LEFT_FILLING):
+		/*
+		 * Filling in the first part of a previous delayed allocation.
+		 * The left neighbor is not contiguous.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK);
+		xfs_bmbt_set_startoff(ep, new_endoff);
+		temp = PREV.br_blockcount - new->br_blockcount;
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL,
+			XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
+					first, flist, &cur, 1, &tmp_rval,
+					XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+			STARTBLOCKVAL(PREV.br_startblock) -
+			(cur ? cur->bc_private.b.allocated : 0));
+		base = ip->i_df.if_u1.if_extents;
+		ep = &base[idx + 1];
+		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1,
+			XFS_DATA_FORK);
+		*dnew = temp;
+		break;
+
+	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+		/*
+		 * Filling in the last part of a previous delayed allocation.
+		 * The right neighbor is contiguous with the new allocation.
+		 */
+		temp = PREV.br_blockcount - new->br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock,
+			new->br_blockcount + RIGHT.br_blockcount,
+			RIGHT.br_state);
+		xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount +
+					RIGHT.br_blockcount,
+					RIGHT.br_state)))
+				goto done;
+		}
+		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+			STARTBLOCKVAL(PREV.br_startblock));
+		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		*dnew = temp;
+		break;
+
+	case MASK(RIGHT_FILLING):
+		/*
+		 * Filling in the last part of a previous delayed allocation.
+		 * The right neighbor is not contiguous.
+		 */
+		temp = PREV.br_blockcount - new->br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1,
+			new, NULL, XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
+				first, flist, &cur, 1, &tmp_rval,
+				XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+			STARTBLOCKVAL(PREV.br_startblock) -
+			(cur ? cur->bc_private.b.allocated : 0));
+		base = ip->i_df.if_u1.if_extents;
+		ep = &base[idx];
+		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
+		*dnew = temp;
+		break;
+
+	case 0:
+		/*
+		 * Filling in the middle part of a previous delayed allocation.
+		 * Contiguity is impossible here.
+		 * This case is avoided almost all the time.
+		 */
+		temp = new->br_startoff - PREV.br_startoff;
+		xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep, temp);
+		r[0] = *new;
+		r[1].br_startoff = new_endoff;
+		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
+		r[1].br_blockcount = temp2;
+		xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1],
+			XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
+					first, flist, &cur, 1, &tmp_rval,
+					XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		temp = xfs_bmap_worst_indlen(ip, temp);
+		temp2 = xfs_bmap_worst_indlen(ip, temp2);
+		diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) -
+			(cur ? cur->bc_private.b.allocated : 0));
+		if (diff > 0 &&
+		    xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -diff, rsvd)) {
+			/*
+			 * Ick gross gag me with a spoon.
+			 */
+			ASSERT(0);	/* want to see if this ever happens! */
+			while (diff > 0) {
+				if (temp) {
+					temp--;
+					diff--;
+					if (!diff ||
+					    !xfs_mod_incore_sb(ip->i_mount,
+						    XFS_SBS_FDBLOCKS, -diff, rsvd))
+						break;
+				}
+				if (temp2) {
+					temp2--;
+					diff--;
+					if (!diff ||
+					    !xfs_mod_incore_sb(ip->i_mount,
+						    XFS_SBS_FDBLOCKS, -diff, rsvd))
+						break;
+				}
+			}
+		}
+		base = ip->i_df.if_u1.if_extents;
+		ep = &base[idx];
+		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK);
+		xfs_bmap_trace_pre_update(fname, "0", ip, idx + 2,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_startblock(ep + 2, NULLSTARTBLOCK((int)temp2));
+		xfs_bmap_trace_post_update(fname, "0", ip, idx + 2,
+			XFS_DATA_FORK);
+		*dnew = temp + temp2;
+		break;
+
+	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK2(LEFT_FILLING, RIGHT_CONTIG):
+	case MASK2(RIGHT_FILLING, LEFT_CONTIG):
+	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK(LEFT_CONTIG):
+	case MASK(RIGHT_CONTIG):
+		/*
+		 * These cases are all impossible: a neighbor can only be
+		 * contiguous with the new extent if the new extent reaches
+		 * that edge of PREV, so each CONTIG bit requires the
+		 * matching FILLING bit to be set as well.
+		 */
+		ASSERT(0);
+	}
+	*curp = cur;
+done:
+	*logflagsp = rval;
+	return error;
+#undef	LEFT
+#undef	RIGHT
+#undef	PREV
+#undef	MASK
+#undef	MASK2
+#undef	MASK3
+#undef	MASK4
+#undef	STATE_SET
+#undef	STATE_TEST
+#undef	STATE_SET_TEST
+#undef	SWITCH_STATE
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting an unwritten
+ * allocation to a real allocation or vice versa.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_unwritten_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp) /* inode logging flags */
+{
+	xfs_bmbt_rec_t		*base;	/* base of extent entry list */
+	xfs_btree_cur_t		*cur;	/* btree cursor */
+	xfs_bmbt_rec_t		*ep;	/* extent entry for idx */
+	int			error;	/* error return value */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_add_extent_unwritten_real";
+#endif
+	int			i;	/* temp state */
+	xfs_fileoff_t		new_endoff;	/* end offset of new entry */
+	xfs_exntst_t		newext; /* new extent state */
+	xfs_exntst_t		oldext; /* old extent state */
+	xfs_bmbt_irec_t		r[3];	/* neighbor extent entries */
+					/* left is 0, right is 1, prev is 2 */
+	int			rval=0; /* return value (logging flags) */
+	int			state = 0;/* state bits, accessed thru macros */
+	enum {				/* bit number definitions for state */
+		LEFT_CONTIG,	RIGHT_CONTIG,
+		LEFT_FILLING,	RIGHT_FILLING,
+		LEFT_DELAY,	RIGHT_DELAY,
+		LEFT_VALID,	RIGHT_VALID
+	};
+
+#define LEFT		r[0]
+#define RIGHT		r[1]
+#define PREV		r[2]
+#define MASK(b)		(1 << (b))
+#define MASK2(a,b)	(MASK(a) | MASK(b))
+#define MASK3(a,b,c)	(MASK2(a,b) | MASK(c))
+#define MASK4(a,b,c,d)	(MASK3(a,b,c) | MASK(d))
+#define STATE_SET(b,v)	((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
+#define STATE_TEST(b)	(state & MASK(b))
+#define STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
+				       ((state &= ~MASK(b)), 0))
+#define SWITCH_STATE		\
+	(state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
+
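+	/*
+	 * Here the FILLING bits describe how much of the oldext extent
+	 * is being flipped to newext, and contiguity is judged against
+	 * newext: only neighbors already in the new state can absorb
+	 * the converted piece.
+	 */
+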
+	/*
+	 * Set up a bunch of variables to make the tests simpler.
+	 */
+	error = 0;
+	cur = *curp;
+	base = ip->i_df.if_u1.if_extents;
+	ep = &base[idx];
+	xfs_bmbt_get_all(ep, &PREV);
+	newext = new->br_state;
+	oldext = (newext == XFS_EXT_UNWRITTEN) ?
+		XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+	ASSERT(PREV.br_state == oldext);
+	new_endoff = new->br_startoff + new->br_blockcount;
+	ASSERT(PREV.br_startoff <= new->br_startoff);
+	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+	/*
+	 * Set flags determining what part of the previous oldext allocation
+	 * extent is being replaced by a newext allocation.
+	 */
+	STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
+	STATE_SET(RIGHT_FILLING,
+		PREV.br_startoff + PREV.br_blockcount == new_endoff);
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 */
+	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+		xfs_bmbt_get_all(ep - 1, &LEFT);
+		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+	}
+	STATE_SET(LEFT_CONTIG,
+		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
+		LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+		LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+		LEFT.br_state == newext &&
+		LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+	/*
+	 * Check and set flags if this segment has a right neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 * Also check for all-three-contiguous being too large.
+	 */
+	if (STATE_SET_TEST(RIGHT_VALID,
+			idx <
+			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
+		xfs_bmbt_get_all(ep + 1, &RIGHT);
+		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+	}
+	STATE_SET(RIGHT_CONTIG,
+		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
+		new_endoff == RIGHT.br_startoff &&
+		new->br_startblock + new->br_blockcount ==
+		    RIGHT.br_startblock &&
+		newext == RIGHT.br_state &&
+		new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+		((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
+		  MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
+		 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+		     <= MAXEXTLEN));
+	/*
+	 * Switch out based on the FILLING and CONTIG state bits.
+	 */
+	switch (SWITCH_STATE) {
+
+	case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left and right neighbors are both contiguous with new.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + PREV.br_blockcount +
+			RIGHT.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		ip->i_d.di_nextents -= 2;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + PREV.br_blockcount +
+				RIGHT.br_blockcount, LEFT.br_state)))
+				goto done;
+		}
+		break;
+
+	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left neighbor is contiguous, the right is not.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + PREV.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
+		ip->i_d.di_nextents--;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + PREV.br_blockcount,
+				LEFT.br_state)))
+				goto done;
+		}
+		break;
+
+	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The right neighbor is contiguous, the left is not.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount + RIGHT.br_blockcount);
+		xfs_bmbt_set_state(ep, newext);
+		xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK);
+		ip->i_d.di_nextents--;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock,
+				new->br_blockcount + RIGHT.br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case MASK2(LEFT_FILLING, RIGHT_FILLING):
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * Neither the left nor right neighbors are contiguous with
+		 * the new one.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_state(ep, newext);
+		xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock, new->br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case MASK2(LEFT_FILLING, LEFT_CONTIG):
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is contiguous.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + new->br_blockcount);
+		xfs_bmbt_set_startoff(ep,
+			PREV.br_startoff + new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_startblock(ep,
+			new->br_startblock + new->br_blockcount);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur,
+				PREV.br_startoff + new->br_blockcount,
+				PREV.br_startblock + new->br_blockcount,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			if (xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + new->br_blockcount,
+				LEFT.br_state))
+				goto done;
+		}
+		break;
+
+	case MASK(LEFT_FILLING):
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is not contiguous.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK);
+		ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
+		xfs_bmbt_set_startoff(ep, new_endoff);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		xfs_bmbt_set_startblock(ep,
+			new->br_startblock + new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF", ip, idx, XFS_DATA_FORK);
+		xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL,
+			XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur,
+				PREV.br_startoff + new->br_blockcount,
+				PREV.br_startblock + new->br_blockcount,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			cur->bc_rec.b = *new;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		break;
+
+	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is contiguous with the new allocation.
+		 */
+		xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock,
+			new->br_blockcount + RIGHT.br_blockcount, newext);
+		xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock,
+					PREV.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+				PREV.br_startblock,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_bmbt_increment(cur, 0, &i)))
+				goto done;
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock,
+				new->br_blockcount + RIGHT.br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case MASK(RIGHT_FILLING):
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is not contiguous.
+		 */
+		xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
+		xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1,
+			new, NULL, XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+				PREV.br_startblock,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		break;
+
+	case 0:
+		/*
+		 * Setting the middle part of a previous oldext extent to
+		 * newext.  Contiguity is impossible here.
+		 * One extent becomes three extents.
+		 */
+		xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep,
+			new->br_startoff - PREV.br_startoff);
+		xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK);
+		r[0] = *new;
+		r[1].br_startoff = new_endoff;
+		r[1].br_blockcount =
+			PREV.br_startoff + PREV.br_blockcount - new_endoff;
+		r[1].br_startblock = new->br_startblock + new->br_blockcount;
+		r[1].br_state = oldext;
+		xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1],
+			XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		ip->i_d.di_nextents += 2;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			/* new right extent - oldext */
+			if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
+				r[1].br_startblock, r[1].br_blockcount,
+				r[1].br_state)))
+				goto done;
+			/* new left extent - oldext */
+			PREV.br_blockcount =
+				new->br_startoff - PREV.br_startoff;
+			cur->bc_rec.b = PREV;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_increment(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			/* new middle extent - newext */
+			cur->bc_rec.b = *new;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		break;
+
+	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK2(LEFT_FILLING, RIGHT_CONTIG):
+	case MASK2(RIGHT_FILLING, LEFT_CONTIG):
+	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK(LEFT_CONTIG):
+	case MASK(RIGHT_CONTIG):
+		/*
+		 * These cases are all impossible: as in the delay_real
+		 * case analysis, a CONTIG bit can only be set together
+		 * with its matching FILLING bit.
+		 */
+		ASSERT(0);
+	}
+	*curp = cur;
+done:
+	*logflagsp = rval;
+	return error;
+#undef	LEFT
+#undef	RIGHT
+#undef	PREV
+#undef	MASK
+#undef	MASK2
+#undef	MASK3
+#undef	MASK4
+#undef	STATE_SET
+#undef	STATE_TEST
+#undef	STATE_SET_TEST
+#undef	SWITCH_STATE
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a delayed allocation.
+ */
+/*ARGSUSED*/
+STATIC int				/* error */
+xfs_bmap_add_extent_hole_delay(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp, /* inode logging flags */
+	int			rsvd)		/* OK to allocate reserved blocks */
+{
+	xfs_bmbt_rec_t		*base;	/* base of extent entry list */
+	xfs_bmbt_rec_t		*ep;	/* extent list entry for idx */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_add_extent_hole_delay";
+#endif
+	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
+	xfs_filblks_t		newlen=0;	/* new indirect size */
+	xfs_filblks_t		oldlen=0;	/* old indirect size */
+	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
+	int			state;	/* state bits, accessed thru macros */
+	xfs_filblks_t		temp;	/* temp for indirect calculations */
+	enum {				/* bit number definitions for state */
+		LEFT_CONTIG,	RIGHT_CONTIG,
+		LEFT_DELAY,	RIGHT_DELAY,
+		LEFT_VALID,	RIGHT_VALID
+	};
+
+#define MASK(b)			(1 << (b))
+#define MASK2(a,b)		(MASK(a) | MASK(b))
+#define STATE_SET(b,v)		((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
+#define STATE_TEST(b)		(state & MASK(b))
+#define STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
+				       ((state &= ~MASK(b)), 0))
+#define SWITCH_STATE		(state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
+
+	base = ip->i_df.if_u1.if_extents;
+	ep = &base[idx];
+	state = 0;
+	ASSERT(ISNULLSTARTBLOCK(new->br_startblock));
+	/*
+	 * Check and set flags if this segment has a left neighbor
+	 */
+	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+		xfs_bmbt_get_all(ep - 1, &left);
+		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+	}
+	/*
+	 * Check and set flags if the current (right) segment exists.
+	 * If it doesn't exist, we're converting the hole at end-of-file.
+	 */
+	if (STATE_SET_TEST(RIGHT_VALID,
+			   idx <
+			   ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+		xfs_bmbt_get_all(ep, &right);
+		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+	}
+	/*
+	 * Set contiguity flags on the left and right neighbors.
+	 * Don't let extents get too large, even if the pieces are contiguous.
+	 */
+	STATE_SET(LEFT_CONTIG,
+		STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) &&
+		left.br_startoff + left.br_blockcount == new->br_startoff &&
+		left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+	STATE_SET(RIGHT_CONTIG,
+		STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) &&
+		new->br_startoff + new->br_blockcount == right.br_startoff &&
+		new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+		(!STATE_TEST(LEFT_CONTIG) ||
+		 (left.br_blockcount + new->br_blockcount +
+		     right.br_blockcount <= MAXEXTLEN)));
+	/*
+	 * Switch out based on the contiguity flags.
+	 */
+	switch (SWITCH_STATE) {
+
+	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+		/*
+		 * New allocation is contiguous with delayed allocations
+		 * on the left and on the right.
+		 * Merge all three into a single extent list entry.
+		 */
+		temp = left.br_blockcount + new->br_blockcount +
+			right.br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1, temp);
+		oldlen = STARTBLOCKVAL(left.br_startblock) +
+			STARTBLOCKVAL(new->br_startblock) +
+			STARTBLOCKVAL(right.br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen));
+		xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_delete(fname, "LC|RC", ip, idx, 1,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		break;
+
+	case MASK(LEFT_CONTIG):
+		/*
+		 * New allocation is contiguous with a delayed allocation
+		 * on the left.
+		 * Merge the new allocation with the left neighbor.
+		 */
+		temp = left.br_blockcount + new->br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1, temp);
+		oldlen = STARTBLOCKVAL(left.br_startblock) +
+			STARTBLOCKVAL(new->br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen));
+		xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		break;
+
+	case MASK(RIGHT_CONTIG):
+		/*
+		 * New allocation is contiguous with a delayed allocation
+		 * on the right.
+		 * Merge the new allocation with the right neighbor.
+		 */
+		xfs_bmap_trace_pre_update(fname, "RC", ip, idx, XFS_DATA_FORK);
+		temp = new->br_blockcount + right.br_blockcount;
+		oldlen = STARTBLOCKVAL(new->br_startblock) +
+			STARTBLOCKVAL(right.br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_allf(ep, new->br_startoff,
+			NULLSTARTBLOCK((int)newlen), temp, right.br_state);
+		xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		break;
+
+	case 0:
+		/*
+		 * New allocation is not contiguous with another
+		 * delayed allocation.
+		 * Insert a new entry.
+		 */
+		oldlen = newlen = 0;
+		xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL,
+			XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		break;
+	}
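+	/*
+	 * Merging can only shrink the worst-case indirect reservation --
+	 * the assert below insists on it -- so any surplus reservation
+	 * is returned to the in-core free-block count.
+	 */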
+	if (oldlen != newlen) {
+		ASSERT(oldlen > newlen);
+		xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
+			(int)(oldlen - newlen), rsvd);
+		/*
+		 * Nothing to do for disk quota accounting here.
+		 */
+	}
+	*logflagsp = 0;
+	return 0;
+#undef	MASK
+#undef	MASK2
+#undef	STATE_SET
+#undef	STATE_TEST
+#undef	STATE_SET_TEST
+#undef	SWITCH_STATE
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_hole_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork) /* data or attr fork */
+{
+	xfs_bmbt_rec_t		*ep;	/* pointer to extent entry ins. point */
+	int			error;	/* error return value */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_add_extent_hole_real";
+#endif
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
+	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
+	int			state;	/* state bits, accessed thru macros */
+	enum {				/* bit number definitions for state */
+		LEFT_CONTIG,	RIGHT_CONTIG,
+		LEFT_DELAY,	RIGHT_DELAY,
+		LEFT_VALID,	RIGHT_VALID
+	};
+
+#define MASK(b)			(1 << (b))
+#define MASK2(a,b)		(MASK(a) | MASK(b))
+#define STATE_SET(b,v)		((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
+#define STATE_TEST(b)		(state & MASK(b))
+#define STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
+				       ((state &= ~MASK(b)), 0))
+#define SWITCH_STATE		(state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
+	ep = &ifp->if_u1.if_extents[idx];
+	state = 0;
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 */
+	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+		xfs_bmbt_get_all(ep - 1, &left);
+		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+	}
+	/*
+	 * Check and set flags if this segment has a current value.
+	 * Not true if we're inserting into the "hole" at eof.
+	 */
+	if (STATE_SET_TEST(RIGHT_VALID,
+			   idx <
+			   ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+		xfs_bmbt_get_all(ep, &right);
+		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+	}
+	/*
+	 * We're inserting a real allocation between "left" and "right".
+	 * Set the contiguity flags.  Don't let extents get too large.
+	 */
+	STATE_SET(LEFT_CONTIG,
+		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
+		left.br_startoff + left.br_blockcount == new->br_startoff &&
+		left.br_startblock + left.br_blockcount == new->br_startblock &&
+		left.br_state == new->br_state &&
+		left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+	STATE_SET(RIGHT_CONTIG,
+		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
+		new->br_startoff + new->br_blockcount == right.br_startoff &&
+		new->br_startblock + new->br_blockcount ==
+		    right.br_startblock &&
+		new->br_state == right.br_state &&
+		new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+		(!STATE_TEST(LEFT_CONTIG) ||
+		 left.br_blockcount + new->br_blockcount +
+		     right.br_blockcount <= MAXEXTLEN));
+
+	/*
+	 * Select which case we're in here, and implement it.
+	 */
+	switch (SWITCH_STATE) {
+
+	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+		/*
+		 * New allocation is contiguous with real allocations on the
+		 * left and on the right.
+		 * Merge all three into a single extent list entry.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1,
+			whichfork);
+		xfs_bmbt_set_blockcount(ep - 1,
+			left.br_blockcount + new->br_blockcount +
+			right.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1,
+			whichfork);
+		xfs_bmap_trace_delete(fname, "LC|RC", ip,
+			idx, 1, whichfork);
+		xfs_bmap_delete_exlist(ip, idx, 1, whichfork);
+		ifp->if_lastex = idx - 1;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+		if (cur == NULL) {
+			*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+			return 0;
+		}
+		*logflagsp = XFS_ILOG_CORE;
+		if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
+				right.br_startblock, right.br_blockcount, &i)))
+			return error;
+		ASSERT(i == 1);
+		if ((error = xfs_bmbt_delete(cur, 0, &i)))
+			return error;
+		ASSERT(i == 1);
+		if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			return error;
+		ASSERT(i == 1);
+		error = xfs_bmbt_update(cur, left.br_startoff,
+				left.br_startblock,
+				left.br_blockcount + new->br_blockcount +
+				right.br_blockcount, left.br_state);
+		return error;
+
+	case MASK(LEFT_CONTIG):
+		/*
+		 * New allocation is contiguous with a real allocation
+		 * on the left.
+		 * Merge the new allocation with the left neighbor.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, whichfork);
+		xfs_bmbt_set_blockcount(ep - 1,
+			left.br_blockcount + new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork);
+		ifp->if_lastex = idx - 1;
+		if (cur == NULL) {
+			*logflagsp = XFS_ILOG_FEXT(whichfork);
+			return 0;
+		}
+		*logflagsp = 0;
+		if ((error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
+				left.br_startblock, left.br_blockcount, &i)))
+			return error;
+		ASSERT(i == 1);
+		error = xfs_bmbt_update(cur, left.br_startoff,
+				left.br_startblock,
+				left.br_blockcount + new->br_blockcount,
+				left.br_state);
+		return error;
+
+	case MASK(RIGHT_CONTIG):
+		/*
+		 * New allocation is contiguous with a real allocation
+		 * on the right.
+		 * Merge the new allocation with the right neighbor.
+		 */
+		xfs_bmap_trace_pre_update(fname, "RC", ip, idx, whichfork);
+		xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
+			new->br_blockcount + right.br_blockcount,
+			right.br_state);
+		xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork);
+		ifp->if_lastex = idx;
+		if (cur == NULL) {
+			*logflagsp = XFS_ILOG_FEXT(whichfork);
+			return 0;
+		}
+		*logflagsp = 0;
+		if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
+				right.br_startblock, right.br_blockcount, &i)))
+			return error;
+		ASSERT(i == 1);
+		error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock,
+				new->br_blockcount + right.br_blockcount,
+				right.br_state);
+		return error;
+
+	case 0:
+		/*
+		 * New allocation is not contiguous with another
+		 * real allocation.
+		 * Insert a new entry.
+		 */
+		xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL,
+			whichfork);
+		xfs_bmap_insert_exlist(ip, idx, 1, new, whichfork);
+		ifp->if_lastex = idx;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+		if (cur == NULL) {
+			*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+			return 0;
+		}
+		*logflagsp = XFS_ILOG_CORE;
+		if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+				new->br_startblock, new->br_blockcount, &i)))
+			return error;
+		ASSERT(i == 0);
+		cur->bc_rec.b.br_state = new->br_state;
+		if ((error = xfs_bmbt_insert(cur, &i)))
+			return error;
+		ASSERT(i == 1);
+		return 0;
+	}
+#undef	MASK
+#undef	MASK2
+#undef	STATE_SET
+#undef	STATE_TEST
+#undef	STATE_SET_TEST
+#undef	SWITCH_STATE
+	/* NOTREACHED */
+	ASSERT(0);
+	return 0; /* keep gcc quiet */
+}
+
+#define XFS_ALLOC_GAP_UNITS	4
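+
+/*
+ * XFS_ALLOC_GAP_UNITS scales the gap heuristic in xfs_bmap_alloc: the end
+ * of a neighboring extent is only used as an allocation hint while the gap
+ * between it and the requested offset stays within a small multiple (four)
+ * of the requested length.
+ */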
+
+/*
+ * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
+ * It figures out where to ask the underlying allocator to put the new extent.
+ */
+STATIC int				/* error */
+xfs_bmap_alloc(
+	xfs_bmalloca_t	*ap)		/* bmap alloc argument struct */
+{
+	xfs_fsblock_t	adjust;		/* adjustment to block numbers */
+	xfs_alloctype_t atype=0;	/* type for allocation routines */
+	int		error;		/* error return value */
+	xfs_agnumber_t	fb_agno;	/* ag number of ap->firstblock */
+	xfs_mount_t	*mp;		/* mount point structure */
+	int		nullfb;		/* true if ap->firstblock isn't set */
+	int		rt;		/* true if inode is realtime */
+#ifdef __KERNEL__
+	xfs_extlen_t	prod=0;		/* product factor for allocators */
+	xfs_extlen_t	ralen=0;	/* realtime allocation length */
+#endif
+
+#define ISLEGAL(x,y)	\
+	(rt ? \
+		(x) < mp->m_sb.sb_rblocks : \
+		XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
+		XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
+		XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
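+
+	/*
+	 * ISLEGAL(x, y): for realtime files x must lie inside the
+	 * realtime device; otherwise x must be a valid block in the
+	 * same allocation group as y, within the filesystem's AG count
+	 * and AG size.
+	 */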
+
+	/*
+	 * Set up variables.
+	 */
+	mp = ap->ip->i_mount;
+	nullfb = ap->firstblock == NULLFSBLOCK;
+	rt = (ap->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && ap->userdata;
+	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
+#ifdef __KERNEL__
+	if (rt) {
+		xfs_extlen_t	extsz;		/* file extent size for rt */
+		xfs_fileoff_t	nexto;		/* next file offset */
+		xfs_extlen_t	orig_alen;	/* original ap->alen */
+		xfs_fileoff_t	orig_end;	/* original off+len */
+		xfs_fileoff_t	orig_off;	/* original ap->off */
+		xfs_extlen_t	mod_off;	/* modulus calculations */
+		xfs_fileoff_t	prevo;		/* previous file offset */
+		xfs_rtblock_t	rtx;		/* realtime extent number */
+		xfs_extlen_t	temp;		/* temp for rt calculations */
+
+		/*
+		 * Set prod to match the realtime extent size: allocations
+		 * should come in multiples of prod realtime extents, per
+		 * the inode's extent size hint (or one rt extent if no
+		 * hint is set).
+		 */
+		if (!(extsz = ap->ip->i_d.di_extsize))
+			extsz = mp->m_sb.sb_rextsize;
+		prod = extsz / mp->m_sb.sb_rextsize;
+		orig_off = ap->off;
+		orig_alen = ap->alen;
+		orig_end = orig_off + orig_alen;
+		/*
+		 * If the file offset is unaligned vs. the extent size
+		 * we need to align it.  This will be possible unless
+		 * the file was previously written with a kernel that didn't
+		 * perform this alignment.
+		 */
+		mod_off = do_mod(orig_off, extsz);
+		if (mod_off) {
+			ap->alen += mod_off;
+			ap->off -= mod_off;
+		}
+		/*
+		 * Same adjustment for the end of the requested area.
+		 */
+		if ((temp = (ap->alen % extsz)))
+			ap->alen += extsz - temp;
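+		/*
+		 * For example (illustrative numbers only): with extsz 16,
+		 * a request at off 5 for 10 blocks becomes off 0, alen 16:
+		 * mod_off 5 grows the front, then one more block pads the
+		 * end to the next extsz boundary, still covering the
+		 * original range.
+		 */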
+		/*
+		 * If the previous block overlaps with this proposed allocation
+		 * then move the start forward without adjusting the length.
+		 */
+		prevo =
+			ap->prevp->br_startoff == NULLFILEOFF ?
+				0 :
+				(ap->prevp->br_startoff +
+				 ap->prevp->br_blockcount);
+		if (ap->off != orig_off && ap->off < prevo)
+			ap->off = prevo;
+		/*
+		 * If the next block overlaps with this proposed allocation
+		 * then move the start back without adjusting the length,
+		 * but not before offset 0.
+		 * This may of course make the start overlap the previous block,
+		 * and if we hit the offset 0 limit then the next block
+		 * can still overlap too.
+		 */
+		nexto = (ap->eof || ap->gotp->br_startoff == NULLFILEOFF) ?
+			NULLFILEOFF : ap->gotp->br_startoff;
+		if (!ap->eof &&
+		    ap->off + ap->alen != orig_end &&
+		    ap->off + ap->alen > nexto)
+			ap->off = nexto > ap->alen ? nexto - ap->alen : 0;
+		/*
+		 * If we're now overlapping the next or previous extent that
+		 * means we can't fit an extsz piece in this hole.  Just move
+		 * the start forward to the first legal spot and set
+		 * the length so we hit the end.
+		 */
+		if ((ap->off != orig_off && ap->off < prevo) ||
+		    (ap->off + ap->alen != orig_end &&
+		     ap->off + ap->alen > nexto)) {
+			ap->off = prevo;
+			ap->alen = nexto - prevo;
+		}
+		/*
+		 * If the result isn't a multiple of rtextents we need to
+		 * remove blocks until it is.
+		 */
+		if ((temp = (ap->alen % mp->m_sb.sb_rextsize))) {
+			/*
+			 * We're not covering the original request, or
+			 * we won't be able to once we fix the length.
+			 */
+			if (orig_off < ap->off ||
+			    orig_end > ap->off + ap->alen ||
+			    ap->alen - temp < orig_alen)
+				return XFS_ERROR(EINVAL);
+			/*
+			 * Try to fix it by moving the start up.
+			 */
+			if (ap->off + temp <= orig_off) {
+				ap->alen -= temp;
+				ap->off += temp;
+			}
+			/*
+			 * Try to fix it by moving the end in.
+			 */
+			else if (ap->off + ap->alen - temp >= orig_end)
+				ap->alen -= temp;
+			/*
+			 * Set the start to the minimum then trim the length.
+			 */
+			else {
+				ap->alen -= orig_off - ap->off;
+				ap->off = orig_off;
+				ap->alen -= ap->alen % mp->m_sb.sb_rextsize;
+			}
+			/*
+			 * Result doesn't cover the request, fail it.
+			 */
+			if (orig_off < ap->off || orig_end > ap->off + ap->alen)
+				return XFS_ERROR(EINVAL);
+		}
+		ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0);
+		/*
+		 * If the offset & length are not perfectly aligned
+		 * then kill prod, it will just get us in trouble.
+		 */
+		if (do_mod(ap->off, extsz) || ap->alen % extsz)
+			prod = 1;
+		/*
+		 * Set ralen to be the actual requested length in rtextents.
+		 */
+		ralen = ap->alen / mp->m_sb.sb_rextsize;
+		/*
+		 * If the old value was close enough to MAXEXTLEN that
+		 * we rounded up to it, cut it back so it's legal again.
+		 * Note that if the original request was bigger than
+		 * MAXEXTLEN, we never see that over-large value here,
+		 * so we can't adjust the starting point to match it.
+		 */
+		if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
+			ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+		/*
+		 * If it's an allocation to an empty file at offset 0,
+		 * pick an extent that will space things out in the rt area.
+		 */
+		if (ap->eof && ap->off == 0) {
+			error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
+			if (error)
+				return error;
+			ap->rval = rtx * mp->m_sb.sb_rextsize;
+		} else
+			ap->rval = 0;
+	}
+#else
+	if (rt)
+		ap->rval = 0;
+#endif	/* __KERNEL__ */
+	else if (nullfb)
+		ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+	else
+		ap->rval = ap->firstblock;
+	/*
+	 * If allocating at eof, and there's a previous real block,
+	 * try to use its last block as our starting point.
+	 */
+	if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
+	    !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
+	    ISLEGAL(ap->prevp->br_startblock + ap->prevp->br_blockcount,
+		    ap->prevp->br_startblock)) {
+		ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
+		/*
+		 * Adjust for the gap between prevp and us.
+		 */
+		adjust = ap->off -
+			(ap->prevp->br_startoff + ap->prevp->br_blockcount);
+		if (adjust &&
+		    ISLEGAL(ap->rval + adjust, ap->prevp->br_startblock))
+			ap->rval += adjust;
+	}
+	/*
+	 * If not at eof, then compare the two neighbor blocks.
+	 * Figure out whether either one gives us a good starting point,
+	 * and pick the better one.
+	 */
+	else if (!ap->eof) {
+		xfs_fsblock_t	gotbno;		/* right side block number */
+		xfs_fsblock_t	gotdiff=0;	/* right side difference */
+		xfs_fsblock_t	prevbno;	/* left side block number */
+		xfs_fsblock_t	prevdiff=0;	/* left side difference */
+
+		/*
+		 * If there's a previous (left) block, select a requested
+		 * start block based on it.
+		 */
+		if (ap->prevp->br_startoff != NULLFILEOFF &&
+		    !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
+		    (prevbno = ap->prevp->br_startblock +
+			       ap->prevp->br_blockcount) &&
+		    ISLEGAL(prevbno, ap->prevp->br_startblock)) {
+			/*
+			 * Calculate gap to end of previous block.
+			 */
+			adjust = prevdiff = ap->off -
+				(ap->prevp->br_startoff +
+				 ap->prevp->br_blockcount);
+			/*
+			 * Figure the startblock based on the previous block's
+			 * end and the gap size.
+			 * Heuristic!
+			 * If the gap is large relative to the piece we're
+			 * allocating, or using it gives us an illegal block
+			 * number, then just use the end of the previous block.
+			 */
+			if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+			    ISLEGAL(prevbno + prevdiff,
+				    ap->prevp->br_startblock))
+				prevbno += adjust;
+			else
+				prevdiff += adjust;
+			/*
+			 * If the firstblock forbids it, can't use it,
+			 * must use default.
+			 */
+			if (!rt && !nullfb &&
+			    XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
+				prevbno = NULLFSBLOCK;
+		}
+		/*
+		 * No previous block or can't follow it, just default.
+		 */
+		else
+			prevbno = NULLFSBLOCK;
+		/*
+		 * If there's a following (right) block, select a requested
+		 * start block based on it.
+		 */
+		if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) {
+			/*
+			 * Calculate gap to start of next block.
+			 */
+			adjust = gotdiff = ap->gotp->br_startoff - ap->off;
+			/*
+			 * Figure the startblock based on the next block's
+			 * start and the gap size.
+			 */
+			gotbno = ap->gotp->br_startblock;
+			/*
+			 * Heuristic!
+			 * If the gap is large relative to the piece we're
+			 * allocating, or using it gives us an illegal block
+			 * number, then just use the start of the next block
+			 * offset by our length.
+			 */
+			if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+			    ISLEGAL(gotbno - gotdiff, gotbno))
+				gotbno -= adjust;
+			else if (ISLEGAL(gotbno - ap->alen, gotbno)) {
+				gotbno -= ap->alen;
+				gotdiff += adjust - ap->alen;
+			} else
+				gotdiff += adjust;
+			/*
+			 * If the firstblock forbids it, can't use it,
+			 * must use default.
+			 */
+			if (!rt && !nullfb &&
+			    XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
+				gotbno = NULLFSBLOCK;
+		}
+		/*
+		 * No next block, just default.
+		 */
+		else
+			gotbno = NULLFSBLOCK;
+		/*
+		 * If both valid, pick the better one, else the only good
+		 * one, else ap->rval is already set (to 0 or the inode block).
+		 */
+		if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+			ap->rval = prevdiff <= gotdiff ? prevbno : gotbno;
+		else if (prevbno != NULLFSBLOCK)
+			ap->rval = prevbno;
+		else if (gotbno != NULLFSBLOCK)
+			ap->rval = gotbno;
+	}
+	/*
+	 * If allowed, use ap->rval; otherwise must use firstblock since
+	 * it's in the right allocation group.
+	 */
+	if (!nullfb && !rt && XFS_FSB_TO_AGNO(mp, ap->rval) != fb_agno)
+		ap->rval = ap->firstblock;
+	/*
+	 * Realtime allocation, done through xfs_rtallocate_extent.
+	 */
+	if (rt) {
+#ifndef __KERNEL__
+		ASSERT(0);
+#else
+		xfs_rtblock_t	rtb;
+
+		atype = ap->rval == 0 ?
+			XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
+		do_div(ap->rval, mp->m_sb.sb_rextsize);
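+		/*
+		 * ap->rval is now in units of realtime extents, not
+		 * filesystem blocks; it is scaled back up by sb_rextsize
+		 * once the allocation succeeds below.
+		 */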
+		rtb = ap->rval;
+		ap->alen = ralen;
+		if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen,
+				&ralen, atype, ap->wasdel, prod, &rtb)))
+			return error;
+		if (rtb == NULLFSBLOCK && prod > 1 &&
+		    (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1,
+						   ap->alen, &ralen, atype,
+						   ap->wasdel, 1, &rtb)))
+			return error;
+		ap->rval = rtb;
+		if (ap->rval != NULLFSBLOCK) {
+			ap->rval *= mp->m_sb.sb_rextsize;
+			ralen *= mp->m_sb.sb_rextsize;
+			ap->alen = ralen;
+			ap->ip->i_d.di_nblocks += ralen;
+			xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+			if (ap->wasdel)
+				ap->ip->i_delayed_blks -= ralen;
+			/*
+			 * Adjust the disk quota also. This was reserved
+			 * earlier.
+			 */
+			if (XFS_IS_QUOTA_ON(mp) &&
+			    ap->ip->i_ino != mp->m_sb.sb_uquotino &&
+			    ap->ip->i_ino != mp->m_sb.sb_gquotino)
+				xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+					ap->wasdel ?
+						XFS_TRANS_DQ_DELRTBCOUNT :
+						XFS_TRANS_DQ_RTBCOUNT,
+					(long)ralen);
+		} else
+			ap->alen = 0;
+#endif	/* __KERNEL__ */
+	}
+	/*
+	 * Normal allocation, done through xfs_alloc_vextent.
+	 */
+	else {
+		xfs_agnumber_t	ag;
+		xfs_alloc_arg_t args;
+		xfs_extlen_t	blen;
+		xfs_extlen_t	delta;
+		int		isaligned;
+		xfs_extlen_t	longest;
+		xfs_extlen_t	need;
+		xfs_extlen_t	nextminlen=0;
+		int		notinit;
+		xfs_perag_t	*pag;
+		xfs_agnumber_t	startag;
+		int		tryagain;
+
+		tryagain = isaligned = 0;
+		args.tp = ap->tp;
+		args.mp = mp;
+		args.fsbno = ap->rval;
+		args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
+		blen = 0;
+		if (nullfb) {
+			args.type = XFS_ALLOCTYPE_START_BNO;
+			args.total = ap->total;
+			/*
+			 * Find the longest available space.
+			 * We're going to try for the whole allocation at once.
+			 */
+			startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
+			notinit = 0;
+			down_read(&mp->m_peraglock);
+			while (blen < ap->alen) {
+				pag = &mp->m_perag[ag];
+				if (!pag->pagf_init &&
+				    (error = xfs_alloc_pagf_init(mp, args.tp,
+					    ag, XFS_ALLOC_FLAG_TRYLOCK))) {
+					up_read(&mp->m_peraglock);
+					return error;
+				}
+				/*
+				 * See xfs_alloc_fix_freelist...
+				 */
+				if (pag->pagf_init) {
+					need = XFS_MIN_FREELIST_PAG(pag, mp);
+					delta = need > pag->pagf_flcount ?
+						need - pag->pagf_flcount : 0;
+					longest = (pag->pagf_longest > delta) ?
+						(pag->pagf_longest - delta) :
+						(pag->pagf_flcount > 0 ||
+						 pag->pagf_longest > 0);
+					if (blen < longest)
+						blen = longest;
+				} else
+					notinit = 1;
+				if (++ag == mp->m_sb.sb_agcount)
+					ag = 0;
+				if (ag == startag)
+					break;
+			}
+			up_read(&mp->m_peraglock);
+			/*
+			/*
+			 * Since the above loop locked the AGF buffers with
+			 * TRYLOCK, it may have skipped busy AGs, so there
+			 * may still be space for this request even though
+			 * blen came out small.
+			 */
+			if (notinit || blen < ap->minlen)
+				args.minlen = ap->minlen;
+			/*
+			 * If the best seen length is less than the request
+			 * length, use the best as the minimum.
+			 */
+			else if (blen < ap->alen)
+				args.minlen = blen;
+			/*
+			 * Otherwise we've seen an extent as big as alen,
+			 * use that as the minimum.
+			 */
+			else
+				args.minlen = ap->alen;
+		} else if (ap->low) {
+			args.type = XFS_ALLOCTYPE_FIRST_AG;
+			args.total = args.minlen = ap->minlen;
+		} else {
+			args.type = XFS_ALLOCTYPE_NEAR_BNO;
+			args.total = ap->total;
+			args.minlen = ap->minlen;
+		}
+		if (ap->ip->i_d.di_extsize) {
+			args.prod = ap->ip->i_d.di_extsize;
+			if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
+				args.mod = (xfs_extlen_t)(args.prod - args.mod);
+		} else if (mp->m_sb.sb_blocksize >= NBPP) {
+			args.prod = 1;
+			args.mod = 0;
+		} else {
+			args.prod = NBPP >> mp->m_sb.sb_blocklog;
+			if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod))))
+				args.mod = (xfs_extlen_t)(args.prod - args.mod);
+		}
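+		/*
+		 * Illustrative numbers: with 4k blocks and a 16k page,
+		 * prod = 16384 >> 12 = 4; a request at off = 9 has
+		 * do_mod(9, 4) = 1, giving mod = 4 - 1 = 3.
+		 */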
+		/*
+		 * If we are not low on available data blocks, and the
+		 * underlying logical volume manager is a stripe, and
+		 * the file offset is zero then try to allocate data
+		 * blocks on stripe unit boundary.
+		 * NOTE: ap->aeof is only set if the allocation length
+		 * is >= the stripe unit and the allocation offset is
+		 * at the end of file.
+		 */
+		if (!ap->low && ap->aeof) {
+			if (!ap->off) {
+				args.alignment = mp->m_dalign;
+				atype = args.type;
+				isaligned = 1;
+				/*
+				 * Adjust for alignment
+				 */
+				if (blen > args.alignment && blen <= ap->alen)
+					args.minlen = blen - args.alignment;
+				args.minalignslop = 0;
+			} else {
+				/*
+				 * First try an exact bno allocation.
+				 * If it fails then do a near or start bno
+				 * allocation with alignment turned on.
+				 */
+				atype = args.type;
+				tryagain = 1;
+				args.type = XFS_ALLOCTYPE_THIS_BNO;
+				args.alignment = 1;
+				/*
+				 * Compute the minlen+alignment for the
+				 * next case.  Set slop so that the value
+				 * of minlen+alignment+slop doesn't go up
+				 * between the calls.
+				 */
+				if (blen > mp->m_dalign && blen <= ap->alen)
+					nextminlen = blen - mp->m_dalign;
+				else
+					nextminlen = args.minlen;
+				if (nextminlen + mp->m_dalign > args.minlen + 1)
+					args.minalignslop =
+						nextminlen + mp->m_dalign -
+						args.minlen - 1;
+				else
+					args.minalignslop = 0;
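+				/*
+				 * E.g. with illustrative values minlen = 10
+				 * and m_dalign = 4: nextminlen = 10, so
+				 * minalignslop = 10 + 4 - 10 - 1 = 3.
+				 */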
+			}
+		} else {
+			args.alignment = 1;
+			args.minalignslop = 0;
+		}
+		args.minleft = ap->minleft;
+		args.wasdel = ap->wasdel;
+		args.isfl = 0;
+		args.userdata = ap->userdata;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+		if (tryagain && args.fsbno == NULLFSBLOCK) {
+			/*
+			 * Exact allocation failed. Now try with alignment
+			 * turned on.
+			 */
+			args.type = atype;
+			args.fsbno = ap->rval;
+			args.alignment = mp->m_dalign;
+			args.minlen = nextminlen;
+			args.minalignslop = 0;
+			isaligned = 1;
+			if ((error = xfs_alloc_vextent(&args)))
+				return error;
+		}
+		if (isaligned && args.fsbno == NULLFSBLOCK) {
+			/*
+			 * Allocation failed, so turn off alignment and
+			 * try again.
+			 */
+			args.type = atype;
+			args.fsbno = ap->rval;
+			args.alignment = 0;
+			if ((error = xfs_alloc_vextent(&args)))
+				return error;
+		}
+		if (args.fsbno == NULLFSBLOCK && nullfb &&
+		    args.minlen > ap->minlen) {
+			args.minlen = ap->minlen;
+			args.type = XFS_ALLOCTYPE_START_BNO;
+			args.fsbno = ap->rval;
+			if ((error = xfs_alloc_vextent(&args)))
+				return error;
+		}
+		if (args.fsbno == NULLFSBLOCK && nullfb) {
+			args.fsbno = 0;
+			args.type = XFS_ALLOCTYPE_FIRST_AG;
+			args.total = ap->minlen;
+			args.minleft = 0;
+			if ((error = xfs_alloc_vextent(&args)))
+				return error;
+			ap->low = 1;
+		}
+		if (args.fsbno != NULLFSBLOCK) {
+			ap->firstblock = ap->rval = args.fsbno;
+			ASSERT(nullfb || fb_agno == args.agno ||
+			       (ap->low && fb_agno < args.agno));
+			ap->alen = args.len;
+			ap->ip->i_d.di_nblocks += args.len;
+			xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+			if (ap->wasdel)
+				ap->ip->i_delayed_blks -= args.len;
+			/*
+			 * Adjust the disk quota also. This was reserved
+			 * earlier.
+			 */
+			if (XFS_IS_QUOTA_ON(mp) &&
+			    ap->ip->i_ino != mp->m_sb.sb_uquotino &&
+			    ap->ip->i_ino != mp->m_sb.sb_gquotino)
+				xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+					ap->wasdel ?
+						XFS_TRANS_DQ_DELBCOUNT :
+						XFS_TRANS_DQ_BCOUNT,
+					(long)args.len);
+		} else {
+			ap->rval = NULLFSBLOCK;
+			ap->alen = 0;
+		}
+	}
+	return 0;
+#undef	ISLEGAL
+}
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the extent list is already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int				/* error */
+xfs_bmap_btree_to_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork,  /* data or attr fork */
+	int			async)	    /* xaction can be async */
+{
+	/* REFERENCED */
+	xfs_bmbt_block_t	*cblock;/* child btree block */
+	xfs_fsblock_t		cbno;	/* child block number */
+	xfs_buf_t		*cbp;	/* child block's buffer */
+	int			error;	/* error return value */
+	xfs_ifork_t		*ifp;	/* inode fork data */
+	xfs_mount_t		*mp;	/* mount point structure */
+	xfs_bmbt_ptr_t		*pp;	/* ptr to block address */
+	xfs_bmbt_block_t	*rblock;/* root btree block */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+	rblock = ifp->if_broot;
+	ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) == 1);
+	ASSERT(INT_GET(rblock->bb_numrecs, ARCH_CONVERT) == 1);
+	ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
+	mp = ip->i_mount;
+	pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
+	*logflagsp = 0;
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1)))
+		return error;
+#endif
+	cbno = INT_GET(*pp, ARCH_CONVERT);
+	if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
+			XFS_BMAP_BTREE_REF)))
+		return error;
+	cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
+	if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp)))
+		return error;
+	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+	if (!async)
+		xfs_trans_set_sync(tp);
+	ip->i_d.di_nblocks--;
+	if (XFS_IS_QUOTA_ON(mp) &&
+	    ip->i_ino != mp->m_sb.sb_uquotino &&
+	    ip->i_ino != mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_binval(tp, cbp);
+	if (cur->bc_bufs[0] == cbp)
+		cur->bc_bufs[0] = NULL;
+	xfs_iroot_realloc(ip, -1, whichfork);
+	ASSERT(ifp->if_broot == NULL);
+	ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+	*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+	return 0;
+}
+
+/*
+ * Called by xfs_bmapi to update extent list structure and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int				/* error */
+xfs_bmap_del_extent(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_trans_t		*tp,	/* current transaction pointer */
+	xfs_extnum_t		idx,	/* extent number to update/delete */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*del,	/* data to remove from extent list */
+	int			iflags, /* input flags */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork, /* data or attr fork */
+	int			rsvd)	/* OK to allocate reserved blocks */
+{
+	xfs_filblks_t		da_new; /* new delay-alloc indirect blocks */
+	xfs_filblks_t		da_old; /* old delay-alloc indirect blocks */
+	xfs_fsblock_t		del_endblock=0; /* first block past del */
+	xfs_fileoff_t		del_endoff;	/* first offset past del */
+	int			delay;	/* current block is delayed allocated */
+	int			do_fx;	/* free extent at end of routine */
+	xfs_bmbt_rec_t		*ep;	/* current extent entry pointer */
+	int			error;	/* error return value */
+	int			flags;	/* inode logging flags */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_del_extent";
+#endif
+	xfs_bmbt_irec_t		got;	/* current extent entry */
+	xfs_fileoff_t		got_endoff;	/* first offset past got */
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_mount_t		*mp;	/* mount structure */
+	xfs_filblks_t		nblks;	/* quota/sb block count */
+	xfs_bmbt_irec_t		new;	/* new record to be inserted */
+	/* REFERENCED */
+	xfs_extnum_t		nextents;	/* number of extents in list */
+	uint			qfield; /* quota field to update */
+	xfs_filblks_t		temp;	/* for indirect length calculations */
+	xfs_filblks_t		temp2;	/* for indirect length calculations */
+
+	XFS_STATS_INC(xfsstats.xs_del_exlist);
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(idx >= 0 && idx < nextents);
+	ASSERT(del->br_blockcount > 0);
+	ep = &ifp->if_u1.if_extents[idx];
+	xfs_bmbt_get_all(ep, &got);
+	ASSERT(got.br_startoff <= del->br_startoff);
+	del_endoff = del->br_startoff + del->br_blockcount;
+	got_endoff = got.br_startoff + got.br_blockcount;
+	ASSERT(got_endoff >= del_endoff);
+	delay = ISNULLSTARTBLOCK(got.br_startblock);
+	ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay);
+	flags = 0;
+	qfield = 0;
+	error = 0;
+	/*
+	 * If deleting a real allocation, must free up the disk space.
+	 */
+	if (!delay) {
+		flags = XFS_ILOG_CORE;
+		/*
+		 * Realtime allocation.	 Free it and record di_nblocks update.
+		 */
+		if (whichfork == XFS_DATA_FORK &&
+		    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
+			xfs_fsblock_t	bno;
+			xfs_filblks_t	len;
+
+			ASSERT(do_mod(del->br_blockcount,
+				      mp->m_sb.sb_rextsize) == 0);
+			ASSERT(do_mod(del->br_startblock,
+				      mp->m_sb.sb_rextsize) == 0);
+			bno = del->br_startblock;
+			len = del->br_blockcount;
+			do_div(bno, mp->m_sb.sb_rextsize);
+			do_div(len, mp->m_sb.sb_rextsize);
+			if ((error = xfs_rtfree_extent(ip->i_transp, bno,
+					(xfs_extlen_t)len)))
+				goto done;
+			do_fx = 0;
+			nblks = len * mp->m_sb.sb_rextsize;
+			if (XFS_IS_QUOTA_ON(mp) &&
+			    ip->i_ino != mp->m_sb.sb_uquotino &&
+			    ip->i_ino != mp->m_sb.sb_gquotino)
+				qfield = XFS_TRANS_DQ_RTBCOUNT;
+		}
+		/*
+		 * Ordinary allocation.
+		 */
+		else {
+			do_fx = 1;
+			nblks = del->br_blockcount;
+			if (XFS_IS_QUOTA_ON(mp) &&
+			    ip->i_ino != mp->m_sb.sb_uquotino &&
+			    ip->i_ino != mp->m_sb.sb_gquotino)
+				qfield = XFS_TRANS_DQ_BCOUNT;
+		}
+		/*
+		 * Set up del_endblock and cur for later.
+		 */
+		del_endblock = del->br_startblock + del->br_blockcount;
+		if (cur) {
+			if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+					got.br_startblock, got.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		da_old = da_new = 0;
+	} else {
+		da_old = STARTBLOCKVAL(got.br_startblock);
+		da_new = 0;
+		nblks = 0;
+		do_fx = 0;
+	}
+	/*
+	 * Set flag value to use in switch statement.
+	 * The bit worth 2 is set if the deletion starts at the start of
+	 * the extent, the bit worth 1 if it ends at the extent's end:
+	 * case 3 deletes the whole extent, case 0 splits it in two.
+	 */
+	switch (((got.br_startoff == del->br_startoff) << 1) |
+		(got_endoff == del_endoff)) {
+	case 3:
+		/*
+		 * Matches the whole extent.  Delete the entry.
+		 */
+		xfs_bmap_trace_delete(fname, "3", ip, idx, 1, whichfork);
+		xfs_bmap_delete_exlist(ip, idx, 1, whichfork);
+		ifp->if_lastex = idx;
+		if (delay)
+			break;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+		flags |= XFS_ILOG_CORE;
+		if (!cur) {
+			flags |= XFS_ILOG_FEXT(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_delete(cur, iflags & XFS_BMAPI_ASYNC, &i)))
+			goto done;
+		ASSERT(i == 1);
+		break;
+
+	case 2:
+		/*
+		 * Deleting the first part of the extent.
+		 */
+		xfs_bmap_trace_pre_update(fname, "2", ip, idx, whichfork);
+		xfs_bmbt_set_startoff(ep, del_endoff);
+		temp = got.br_blockcount - del->br_blockcount;
+		xfs_bmbt_set_blockcount(ep, temp);
+		ifp->if_lastex = idx;
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			xfs_bmap_trace_post_update(fname, "2", ip, idx,
+				whichfork);
+			da_new = temp;
+			break;
+		}
+		xfs_bmbt_set_startblock(ep, del_endblock);
+		xfs_bmap_trace_post_update(fname, "2", ip, idx, whichfork);
+		if (!cur) {
+			flags |= XFS_ILOG_FEXT(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
+				got.br_blockcount - del->br_blockcount,
+				got.br_state)))
+			goto done;
+		break;
+
+	case 1:
+		/*
+		 * Deleting the last part of the extent.
+		 */
+		temp = got.br_blockcount - del->br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "1", ip, idx, whichfork);
+		xfs_bmbt_set_blockcount(ep, temp);
+		ifp->if_lastex = idx;
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			xfs_bmap_trace_post_update(fname, "1", ip, idx,
+				whichfork);
+			da_new = temp;
+			break;
+		}
+		xfs_bmap_trace_post_update(fname, "1", ip, idx, whichfork);
+		if (!cur) {
+			flags |= XFS_ILOG_FEXT(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_update(cur, got.br_startoff,
+				got.br_startblock,
+				got.br_blockcount - del->br_blockcount,
+				got.br_state)))
+			goto done;
+		break;
+
+	case 0:
+		/*
+		 * Deleting the middle of the extent.
+		 */
+		temp = del->br_startoff - got.br_startoff;
+		xfs_bmap_trace_pre_update(fname, "0", ip, idx, whichfork);
+		xfs_bmbt_set_blockcount(ep, temp);
+		new.br_startoff = del_endoff;
+		temp2 = got_endoff - del_endoff;
+		new.br_blockcount = temp2;
+		new.br_state = got.br_state;
+		if (!delay) {
+			new.br_startblock = del_endblock;
+			flags |= XFS_ILOG_CORE;
+			if (cur) {
+				if ((error = xfs_bmbt_update(cur,
+						got.br_startoff,
+						got.br_startblock, temp,
+						got.br_state)))
+					goto done;
+				if ((error = xfs_bmbt_increment(cur, 0, &i)))
+					goto done;
+				cur->bc_rec.b = new;
+				error = xfs_bmbt_insert(cur, &i);
+				if (error && error != ENOSPC)
+					goto done;
+				/*
+				 * If we get ENOSPC back from the btree
+				 * insert, it tried a split and we have
+				 * a zero block reservation.
+				 * Fix up our state and return the error.
+				 */
+				if (error == ENOSPC) {
+					/*
+					 * Reset the cursor, don't trust
+					 * it after any insert operation.
+					 */
+					if ((error = xfs_bmbt_lookup_eq(cur,
+							got.br_startoff,
+							got.br_startblock,
+							temp, &i)))
+						goto done;
+					ASSERT(i == 1);
+					/*
+					 * Update the btree record back
+					 * to the original value.
+					 */
+					if ((error = xfs_bmbt_update(cur,
+							got.br_startoff,
+							got.br_startblock,
+							got.br_blockcount,
+							got.br_state)))
+						goto done;
+					/*
+					 * Reset the extent record back
+					 * to the original value.
+					 */
+					xfs_bmbt_set_blockcount(ep,
+						got.br_blockcount);
+					flags = 0;
+					error = XFS_ERROR(ENOSPC);
+					goto done;
+				}
+				ASSERT(i == 1);
+			} else
+				flags |= XFS_ILOG_FEXT(whichfork);
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+		} else {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			temp = xfs_bmap_worst_indlen(ip, temp);
+			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			temp2 = xfs_bmap_worst_indlen(ip, temp2);
+			new.br_startblock = NULLSTARTBLOCK((int)temp2);
+			da_new = temp + temp2;
+			while (da_new > da_old) {
+				if (temp) {
+					temp--;
+					da_new--;
+					xfs_bmbt_set_startblock(ep,
+						NULLSTARTBLOCK((int)temp));
+				}
+				if (da_new == da_old)
+					break;
+				if (temp2) {
+					temp2--;
+					da_new--;
+					new.br_startblock =
+						NULLSTARTBLOCK((int)temp2);
+				}
+			}
+		}
+		xfs_bmap_trace_post_update(fname, "0", ip, idx, whichfork);
+		xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 1, &new, NULL,
+			whichfork);
+		xfs_bmap_insert_exlist(ip, idx + 1, 1, &new, whichfork);
+		ifp->if_lastex = idx + 1;
+		break;
+	}
+	/*
+	 * If we need to, add to list of extents to delete.
+	 */
+	if (do_fx)
+		xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
+			mp);
+	/*
+	 * Adjust inode # blocks in the file.
+	 */
+	if (nblks)
+		ip->i_d.di_nblocks -= nblks;
+	/*
+	 * Adjust quota data.
+	 */
+	if (qfield)
+		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+	/*
+	 * Account for change in delayed indirect blocks.
+	 * Nothing to do for disk quota accounting here.
+	 */
+	ASSERT(da_old >= da_new);
+	if (da_old > da_new)
+		xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new),
+			rsvd);
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+/*
+ * Remove the entry "free" from the free item list.  Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+STATIC void
+xfs_bmap_del_free(
+	xfs_bmap_free_t		*flist, /* free item list header */
+	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
+	xfs_bmap_free_item_t	*free)	/* list item to be freed */
+{
+	if (prev)
+		prev->xbfi_next = free->xbfi_next;
+	else
+		flist->xbf_first = free->xbfi_next;
+	flist->xbf_count--;
+	kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
+
+/*
+ * Remove count entries from the extents array for inode "ip", starting
+ * at index "idx".  Copies the remaining items down over the deleted ones,
+ * and gives back the excess memory.
+ */
+STATIC void
+xfs_bmap_delete_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* starting delete index */
+	xfs_extnum_t	count,		/* count of items to delete */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*base;		/* base of extent list */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_extnum_t	nextents;	/* number of extents in list after */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	base = ifp->if_u1.if_extents;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - count;
+	ovbcopy(&base[idx + count], &base[idx],
+		(nextents - idx) * sizeof(*base));
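+	/*
+	 * E.g. with extents [A B C D E], idx = 1, count = 2: D and E
+	 * are copied down over B and C, leaving [A D E] before the
+	 * realloc below shrinks the array.
+	 */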
+	xfs_iext_realloc(ip, -count, whichfork);
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int					/* error */
+xfs_bmap_extents_to_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
+	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
+	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
+	int			wasdel,		/* converting a delayed alloc */
+	int			*logflagsp,	/* inode logging flags */
+	int			whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_block_t	*ablock;	/* allocated (child) bt block */
+	xfs_buf_t		*abp;		/* buffer for ablock */
+	xfs_alloc_arg_t		args;		/* allocation arguments */
+	xfs_bmbt_rec_t		*arp;		/* child record pointer */
+	xfs_bmbt_block_t	*block;		/* btree root block */
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_rec_t		*ep;		/* extent list pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		i;		/* extent list index */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_bmbt_key_t		*kp;		/* root block key pointer */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* extent list size */
+	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	/*
+	 * Make space in the inode incore.
+	 */
+	xfs_iroot_realloc(ip, 1, whichfork);
+	ifp->if_flags |= XFS_IFBROOT;
+	/*
+	 * Fill in the root.
+	 */
+	block = ifp->if_broot;
+	INT_SET(block->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
+	INT_SET(block->bb_level, ARCH_CONVERT, 1);
+	INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
+	INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
+	INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
+	/*
+	 * Need a cursor.  Can't allocate until bb_level is filled in.
+	 */
+	mp = ip->i_mount;
+	cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
+		whichfork);
+	cur->bc_private.b.firstblock = *firstblock;
+	cur->bc_private.b.flist = flist;
+	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+	/*
+	 * Convert to a btree with two levels, one record in root.
+	 */
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+	args.tp = tp;
+	args.mp = mp;
+	if (*firstblock == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+	} else if (flist->xbf_low) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = *firstblock;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.fsbno = *firstblock;
+	}
+	args.minlen = args.maxlen = args.prod = 1;
+	args.total = args.minleft = args.alignment = args.mod = args.isfl =
+		args.minalignslop = 0;
+	args.wasdel = wasdel;
+	*logflagsp = 0;
+	if ((error = xfs_alloc_vextent(&args))) {
+		xfs_iroot_realloc(ip, -1, whichfork);
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+		return error;
+	}
+	/*
+	 * Allocation can't fail, the space was reserved.
+	 */
+	ASSERT(args.fsbno != NULLFSBLOCK);
+	ASSERT(*firstblock == NULLFSBLOCK ||
+	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+	       (flist->xbf_low &&
+		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	ip->i_d.di_nblocks++;
+	if (XFS_IS_QUOTA_ON(mp) &&
+	    ip->i_ino != mp->m_sb.sb_uquotino &&
+	    ip->i_ino != mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+	/*
+	 * Fill in the child block.
+	 */
+	ablock = XFS_BUF_TO_BMBT_BLOCK(abp);
+	INT_SET(ablock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
+	INT_ZERO(ablock->bb_level, ARCH_CONVERT);
+	INT_ZERO(ablock->bb_numrecs, ARCH_CONVERT);
+	INT_SET(ablock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
+	INT_SET(ablock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
+	arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (ep = ifp->if_u1.if_extents, i = 0; i < nextents; i++, ep++) {
+		if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) {
+			*arp++ = *ep;
+			INT_MOD(ablock->bb_numrecs, ARCH_CONVERT, +1);
+		}
+	}
+	ASSERT(INT_GET(ablock->bb_numrecs, ARCH_CONVERT) == XFS_IFORK_NEXTENTS(ip, whichfork));
+	/*
+	 * Fill in the root key and pointer.
+	 */
+	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
+	arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+	INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(arp));
+	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+	INT_SET(*pp, ARCH_CONVERT, args.fsbno);
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS);
+	xfs_bmbt_log_recs(cur, abp, 1, INT_GET(ablock->bb_numrecs, ARCH_CONVERT));
+	ASSERT(*curp == NULL);
+	*curp = cur;
+	*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
+	return 0;
+}
+
+/*
+ * Insert new item(s) in the extent list for inode "ip".
+ * "count" new items are inserted at index "idx".
+ */
+STATIC void
+xfs_bmap_insert_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* starting index of new items */
+	xfs_extnum_t	count,		/* number of inserted items */
+	xfs_bmbt_irec_t *new,		/* items to insert */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*base;		/* extent list base */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_extnum_t	nextents;	/* extent list size */
+	xfs_extnum_t	to;		/* extent list index */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	xfs_iext_realloc(ip, count, whichfork);
+	base = ifp->if_u1.if_extents;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ovbcopy(&base[idx], &base[idx + count],
+		(nextents - (idx + count)) * sizeof(*base));
+	for (to = idx; to < idx + count; to++, new++)
+		xfs_bmbt_set_all(&base[to], new);
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ */
+STATIC int				/* error */
+xfs_bmap_local_to_extents(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
+	xfs_extlen_t	total,		/* total blocks needed by transaction */
+	int		*logflagsp,	/* inode logging flags */
+	int		whichfork)	/* data or attr fork */
+{
+	int		error;		/* error return value */
+	int		flags;		/* logging flags returned */
+#ifdef XFS_BMAP_TRACE
+	static char	fname[] = "xfs_bmap_local_to_extents";
+#endif
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+
+	/*
+	 * We don't want to deal with the case of keeping inode data inline yet.
+	 * So passing in the data fork of a regular file is illegal.
+	 */
+	ASSERT(!((ip->i_d.di_mode & IFMT) == IFREG &&
+		 whichfork == XFS_DATA_FORK));
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	flags = 0;
+	error = 0;
+	if (ifp->if_bytes) {
+		xfs_alloc_arg_t args;	/* allocation arguments */
+		xfs_buf_t	*bp;	/* buffer for extent list block */
+		xfs_bmbt_rec_t	*ep;	/* extent list pointer */
+
+		args.tp = tp;
+		args.mp = ip->i_mount;
+		ASSERT(ifp->if_flags & XFS_IFINLINE);
+		/*
+		 * Allocate a block.  We know we need only one, since the
+		 * file currently fits in an inode.
+		 */
+		if (*firstblock == NULLFSBLOCK) {
+			args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+			args.type = XFS_ALLOCTYPE_START_BNO;
+		} else {
+			args.fsbno = *firstblock;
+			args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		}
+		args.total = total;
+		args.mod = args.minleft = args.alignment = args.wasdel =
+			args.isfl = args.minalignslop = 0;
+		args.minlen = args.maxlen = args.prod = 1;
+		if ((error = xfs_alloc_vextent(&args)))
+			goto done;
+		/*
+		 * Can't fail, the space was reserved.
+		 */
+		ASSERT(args.fsbno != NULLFSBLOCK);
+		ASSERT(args.len == 1);
+		*firstblock = args.fsbno;
+		bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+		bcopy(ifp->if_u1.if_data, (char *)XFS_BUF_PTR(bp),
+			ifp->if_bytes);
+		xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+		xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+		xfs_iext_realloc(ip, 1, whichfork);
+		ep = ifp->if_u1.if_extents;
+		xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+		xfs_bmap_trace_post_update(fname, "new", ip, 0, whichfork);
+		XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+		ip->i_d.di_nblocks = 1;
+		if (XFS_IS_QUOTA_ON(args.mp) &&
+		    ip->i_ino != args.mp->m_sb.sb_uquotino &&
+		    ip->i_ino != args.mp->m_sb.sb_gquotino)
+			xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+				1L);
+		flags |= XFS_ILOG_FEXT(whichfork);
+	} else
+		ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+	ifp->if_flags &= ~XFS_IFINLINE;
+	ifp->if_flags |= XFS_IFEXTENTS;
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+	flags |= XFS_ILOG_CORE;
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+xfs_bmbt_rec_t *			/* pointer to found extent entry */
+xfs_bmap_do_search_extents(
+	xfs_bmbt_rec_t	*base,		/* base of extent list */
+	xfs_extnum_t	lastx,		/* last extent index used */
+	xfs_extnum_t	nextents,	/* extent list size */
+	xfs_fileoff_t	bno,		/* block number searched for */
+	int		*eofp,		/* out: end of file found */
+	xfs_extnum_t	*lastxp,	/* out: last extent index */
+	xfs_bmbt_irec_t *gotp,		/* out: extent entry found */
+	xfs_bmbt_irec_t *prevp)		/* out: previous extent entry found */
+{
+	xfs_bmbt_rec_t	*ep;		/* extent list entry pointer */
+	xfs_bmbt_irec_t got;		/* extent list entry, decoded */
+	int		high;		/* high index of binary search */
+	int		low;		/* low index of binary search */
+
+	if (lastx != NULLEXTNUM && lastx < nextents)
+		ep = base + lastx;
+	else
+		ep = NULL;
+	prevp->br_startoff = NULLFILEOFF;
+	if (ep && bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep)) &&
+	    bno < got.br_startoff +
+		  (got.br_blockcount = xfs_bmbt_get_blockcount(ep)))
+		*eofp = 0;
+	else if (ep && lastx < nextents - 1 &&
+		 bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep + 1)) &&
+		 bno < got.br_startoff +
+		       (got.br_blockcount = xfs_bmbt_get_blockcount(ep + 1))) {
+		lastx++;
+		ep++;
+		*eofp = 0;
+	} else if (nextents == 0)
+		*eofp = 1;
+	else if (bno == 0 &&
+		 (got.br_startoff = xfs_bmbt_get_startoff(base)) == 0) {
+		ep = base;
+		lastx = 0;
+		got.br_blockcount = xfs_bmbt_get_blockcount(ep);
+		*eofp = 0;
+	} else {
+		/* binary search the extents array */
+		low = 0;
+		high = nextents - 1;
+		while (low <= high) {
+			XFS_STATS_INC(xfsstats.xs_cmp_exlist);
+			lastx = (low + high) >> 1;
+			ep = base + lastx;
+			got.br_startoff = xfs_bmbt_get_startoff(ep);
+			got.br_blockcount = xfs_bmbt_get_blockcount(ep);
+			if (bno < got.br_startoff)
+				high = lastx - 1;
+			else if (bno >= got.br_startoff + got.br_blockcount)
+				low = lastx + 1;
+			else {
+				got.br_startblock = xfs_bmbt_get_startblock(ep);
+				got.br_state = xfs_bmbt_get_state(ep);
+				*eofp = 0;
+				*lastxp = lastx;
+				*gotp = got;
+				return ep;
+			}
+		}
+		if (bno >= got.br_startoff + got.br_blockcount) {
+			lastx++;
+			if (lastx == nextents) {
+				*eofp = 1;
+				got.br_startblock = xfs_bmbt_get_startblock(ep);
+				got.br_state = xfs_bmbt_get_state(ep);
+				*prevp = got;
+				ep = NULL;
+			} else {
+				*eofp = 0;
+				xfs_bmbt_get_all(ep, prevp);
+				ep++;
+				got.br_startoff = xfs_bmbt_get_startoff(ep);
+				got.br_blockcount = xfs_bmbt_get_blockcount(ep);
+			}
+		} else {
+			*eofp = 0;
+			if (ep > base)
+				xfs_bmbt_get_all(ep - 1, prevp);
+		}
+	}
+	if (ep) {
+		got.br_startblock = xfs_bmbt_get_startblock(ep);
+		got.br_state = xfs_bmbt_get_state(ep);
+	}
+	*lastxp = lastx;
+	*gotp = got;
+	return ep;
+}
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_t *			/* pointer to found extent entry */
+xfs_bmap_search_extents(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fileoff_t	bno,		/* block number searched for */
+	int		whichfork,	/* data or attr fork */
+	int		*eofp,		/* out: end of file found */
+	xfs_extnum_t	*lastxp,	/* out: last extent index */
+	xfs_bmbt_irec_t *gotp,		/* out: extent entry found */
+	xfs_bmbt_irec_t *prevp)		/* out: previous extent entry found */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_rec_t	*base;		/* base of extent list */
+	xfs_extnum_t	lastx;		/* last extent index used */
+	xfs_extnum_t	nextents;	/* extent list size */
+
+	XFS_STATS_INC(xfsstats.xs_look_exlist);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	lastx = ifp->if_lastex;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	base = &ifp->if_u1.if_extents[0];
+
+	return xfs_bmap_do_search_extents(base, lastx, nextents, bno, eofp,
+					  lastxp, gotp, prevp);
+}
+
+
+#ifdef XFS_BMAP_TRACE
+/*
+ * Add a bmap trace buffer entry.  Base routine for the others.
+ */
+STATIC void
+xfs_bmap_trace_addentry(
+	int		opcode,		/* operation */
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(ies) */
+	xfs_extnum_t	cnt,		/* count of entries, 1 or 2 */
+	xfs_bmbt_rec_t	*r1,		/* first record */
+	xfs_bmbt_rec_t	*r2,		/* second record or null */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	tr2;
+
+	ASSERT(cnt == 1 || cnt == 2);
+	ASSERT(r1 != NULL);
+	if (cnt == 1) {
+		ASSERT(r2 == NULL);
+		r2 = &tr2;
+		bzero(&tr2, sizeof(tr2));
+	} else
+		ASSERT(r2 != NULL);
+	ktrace_enter(xfs_bmap_trace_buf,
+		(void *)(__psint_t)(opcode | (whichfork << 16)),
+		(void *)fname, (void *)desc, (void *)ip,
+		(void *)(__psint_t)idx,
+		(void *)(__psint_t)cnt,
+		(void *)(__psunsigned_t)(ip->i_ino >> 32),
+		(void *)(__psunsigned_t)(unsigned)ip->i_ino,
+		(void *)(__psunsigned_t)(INT_GET(r1->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r1->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l1, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l1, ARCH_CONVERT))
+		);
+	ASSERT(ip->i_xtrace);
+	ktrace_enter(ip->i_xtrace,
+		(void *)(__psint_t)(opcode | (whichfork << 16)),
+		(void *)fname, (void *)desc, (void *)ip,
+		(void *)(__psint_t)idx,
+		(void *)(__psint_t)cnt,
+		(void *)(__psunsigned_t)(ip->i_ino >> 32),
+		(void *)(__psunsigned_t)(unsigned)ip->i_ino,
+		(void *)(__psunsigned_t)(INT_GET(r1->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r1->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l1, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l1, ARCH_CONVERT))
+		);
+}
+
+/*
+ * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist.
+ */
+STATIC void
+xfs_bmap_trace_delete(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(entries) deleted */
+	xfs_extnum_t	cnt,		/* count of entries deleted, 1 or 2 */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx,
+		cnt, &ifp->if_u1.if_extents[idx],
+		cnt == 2 ? &ifp->if_u1.if_extents[idx + 1] : NULL,
+		whichfork);
+}
+
+/*
+ * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or
+ * reading in the extents list from the disk (in the btree).
+ */
+STATIC void
+xfs_bmap_trace_insert(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(entries) inserted */
+	xfs_extnum_t	cnt,		/* count of entries inserted, 1 or 2 */
+	xfs_bmbt_irec_t *r1,		/* inserted record 1 */
+	xfs_bmbt_irec_t *r2,		/* inserted record 2 or null */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	tr1;		/* compressed record 1 */
+	xfs_bmbt_rec_t	tr2;		/* compressed record 2 if needed */
+
+	xfs_bmbt_set_all(&tr1, r1);
+	if (cnt == 2) {
+		ASSERT(r2 != NULL);
+		xfs_bmbt_set_all(&tr2, r2);
+	} else {
+		ASSERT(cnt == 1);
+		ASSERT(r2 == NULL);
+	}
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx,
+		cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork);
+}
+
+/*
+ * Add bmap trace entry after updating an extent list entry in place.
+ */
+STATIC void
+xfs_bmap_trace_post_update(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry updated */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx,
+		1, &ifp->if_u1.if_extents[idx], NULL, whichfork);
+}
+
+/*
+ * Add bmap trace entry prior to updating an extent list entry in place.
+ */
+STATIC void
+xfs_bmap_trace_pre_update(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry to be updated */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1,
+		&ifp->if_u1.if_extents[idx], NULL, whichfork);
+}
+#endif	/* XFS_BMAP_TRACE */
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
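+/*
+ * For example, with illustrative geometry m_bmap_dmxr[0] =
+ * m_bmap_dmxr[1] = 9 and 5 max levels, len = 100 needs
+ * ceil(100/9) = 12 leaf blocks, then ceil(12/9) = 2, then
+ * ceil(2/9) = 1, plus the 2 levels above: 12+2+1+2 = 17.
+ */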
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_filblks_t	len)		/* delayed extent length */
+{
+	int		level;		/* btree level number */
+	int		maxrecs;	/* maximum record count at this level */
+	xfs_mount_t	*mp;		/* mount structure */
+	xfs_filblks_t	rval;		/* return value */
+
+	mp = ip->i_mount;
+	maxrecs = mp->m_bmap_dmxr[0];
+	for (level = 0, rval = 0;
+	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+	     level++) {
+		len += maxrecs - 1;
+		do_div(len, maxrecs);
+		rval += len;
+		if (len == 1)
+			return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+				level - 1;
+		if (level == 0)
+			maxrecs = mp->m_bmap_dmxr[1];
+	}
+	return rval;
+}
+
+#if defined(DEBUG) && defined(XFS_RW_TRACE)
+STATIC void
+xfs_bunmap_trace(
+	xfs_inode_t		*ip,
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	inst_t			*ra)
+{
+	if (ip->i_rwtrace == NULL)
+		return;
+	ktrace_enter(ip->i_rwtrace,
+		(void *)(__psint_t)XFS_BUNMAPI,
+		(void *)ip,
+		(void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff),
+		(void *)(__psint_t)(ip->i_d.di_size & 0xffffffff),
+		(void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff),
+		(void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff),
+		(void *)(__psint_t)len,
+		(void *)(__psint_t)flags,
+		(void *)(__psint_t)private.p_cpuid,
+		(void *)ra,
+		(void *)0,
+		(void *)0,
+		(void *)0,
+		(void *)0,
+		(void *)0,
+		(void *)0);
+}
+#endif
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int						/* error code */
+xfs_bmap_add_attrfork(
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			rsvd)		/* OK to allocate reserved blocks in trans */
+{
+	int			blks;		/* space reservation */
+	int			committed;	/* xaction was committed */
+	int			error;		/* error return value */
+	xfs_fsblock_t		firstblock;	/* 1st block/ag allocated */
+	xfs_bmap_free_t		flist;		/* freed extent list */
+	int			logflags;	/* logging flags */
+	xfs_mount_t		*mp;		/* mount structure */
+	unsigned long		s;		/* spinlock spl value */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+	if (XFS_IFORK_Q(ip))
+		return 0;
+	mp = ip->i_mount;
+	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+	tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+	blks = XFS_ADDAFORK_SPACE_RES(mp);
+	if (rsvd)
+		tp->t_flags |= XFS_TRANS_RESERVE;
+	if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
+		goto error0;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (rsvd) {
+			error = xfs_trans_reserve_blkquota_force(tp, ip, blks);
+		} else {
+			error = xfs_trans_reserve_blkquota(tp, ip, blks);
+		}
+
+		if (error) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+			return error;
+		}
+	}
+	if (XFS_IFORK_Q(ip))
+		goto error1;
+	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+		/*
+		 * For inodes coming from pre-6.2 filesystems.
+		 */
+		ASSERT(ip->i_d.di_aformat == 0);
+		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	}
+	ASSERT(ip->i_d.di_anextents == 0);
+	VN_HOLD(XFS_ITOV(ip));
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_DEV:
+		ip->i_d.di_forkoff = roundup(sizeof(dev_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_UUID:
+		ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		ip->i_d.di_forkoff = mp->m_attroffset >> 3;
+		break;
+	default:
+		ASSERT(0);
+		error = XFS_ERROR(EINVAL);
+		goto error1;
+	}
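+	/*
+	 * di_forkoff is counted in 8-byte units: e.g. if dev_t is 4
+	 * bytes, it rounds up to 8 and the fork offset becomes 1.
+	 */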
+	ip->i_df.if_ext_max =
+		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(ip->i_afp == NULL);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp->if_ext_max =
+		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	ip->i_afp->if_flags = XFS_IFEXTENTS;
+	logflags = 0;
+	XFS_BMAP_INIT(&flist, &firstblock);
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_LOCAL:
+		error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+			&flist, &logflags);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	default:
+		error = 0;
+		break;
+	}
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (error)
+		goto error2;
+	if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {
+		s = XFS_SB_LOCK(mp);
+		if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {
+			XFS_SB_VERSION_ADDATTR(&mp->m_sb);
+			XFS_SB_UNLOCK(mp, s);
+			xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
+		} else
+			XFS_SB_UNLOCK(mp, s);
+	}
+	if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed)))
+		goto error2;
+	error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES, NULL);
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+	return error;
+error2:
+	xfs_bmap_cancel(&flist);
+error1:
+	ASSERT(ismrlocked(&ip->i_lock,MR_UPDATE));
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+error0:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+	return error;
+}
+
+/*
+ * Add the extent to the list of extents to be freed at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+/* ARGSUSED */
+void
+xfs_bmap_add_free(
+	xfs_fsblock_t		bno,		/* fs block number of extent */
+	xfs_filblks_t		len,		/* length of extent */
+	xfs_bmap_free_t		*flist,		/* list of extents */
+	xfs_mount_t		*mp)		/* mount point structure */
+{
+	xfs_bmap_free_item_t	*cur;		/* current (next) element */
+	xfs_bmap_free_item_t	*new;		/* new element */
+	xfs_bmap_free_item_t	*prev;		/* previous element */
+#ifdef DEBUG
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(len > 0);
+	ASSERT(len <= MAXEXTLEN);
+	ASSERT(!ISNULLSTARTBLOCK(bno));
+	agno = XFS_FSB_TO_AGNO(mp, bno);
+	agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	ASSERT(agno < mp->m_sb.sb_agcount);
+	ASSERT(agbno < mp->m_sb.sb_agblocks);
+	ASSERT(len < mp->m_sb.sb_agblocks);
+	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+	ASSERT(xfs_bmap_free_item_zone != NULL);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new->xbfi_startblock = bno;
+	new->xbfi_blockcount = (xfs_extlen_t)len;
+	for (prev = NULL, cur = flist->xbf_first;
+	     cur != NULL;
+	     prev = cur, cur = cur->xbfi_next) {
+		if (cur->xbfi_startblock >= bno)
+			break;
+	}
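+	/*
+	 * E.g. inserting bno = 50 into a list with startblocks
+	 * [10, 40, 70] stops the scan at 70, so the new item is
+	 * linked in between 40 and 70, keeping the list sorted.
+	 */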
+	if (prev)
+		prev->xbfi_next = new;
+	else
+		flist->xbf_first = new;
+	new->xbfi_next = cur;
+	flist->xbf_count++;
+}
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem.	Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	int		whichfork)	/* data or attr fork */
+{
+	int		level;		/* btree level */
+	uint		maxblocks;	/* max blocks at this level */
+	uint		maxleafents;	/* max leaf entries possible */
+	int		maxrootrecs;	/* max records in root block */
+	int		minleafrecs;	/* min records in leaf block */
+	int		minnoderecs;	/* min records in node block */
+	int		sz;		/* root block size */
+
+	/*
+	 * The maximum number of extents in a file, hence the maximum
+	 * number of leaf entries, is controlled by the type of di_nextents
+	 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
+	 * (a signed 16-bit number, xfs_aextnum_t).
+	 */
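+	/*
+	 * Illustrative sizing: maxleafents = 1000, minleafrecs =
+	 * minnoderecs = 10 and maxrootrecs = 5 give 100 leaf blocks,
+	 * then 10 node blocks; 10 still exceeds maxrootrecs, so one
+	 * more level folds them under the root: maxlevels = 3.
+	 */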
+	maxleafents = (whichfork == XFS_DATA_FORK) ? MAXEXTNUM : MAXAEXTNUM;
+	minleafrecs = mp->m_bmap_dmnr[0];
+	minnoderecs = mp->m_bmap_dmnr[1];
+	sz = (whichfork == XFS_DATA_FORK) ?
+		mp->m_attroffset :
+		mp->m_sb.sb_inodesize - mp->m_attroffset;
+	maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++) {
+		if (maxblocks <= maxrootrecs)
+			maxblocks = 1;
+		else
+			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	}
+	mp->m_bm_maxlevels[whichfork] = level;
+}
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.	We never free any extents in
+ * the first transaction.  This is to allow the caller to make the first
+ * transaction a synchronous one so that the pointers to the data being
+ * broken in this transaction will be permanent before the data is actually
+ * freed.  This is necessary to prevent blocks from being reallocated
+ * and written to before the free and reallocation are actually permanent.
+ * We do not just make the first transaction synchronous here, because
+ * there are more efficient ways to gain the same protection in some cases
+ * (see the file truncation code).
+ *
+ * Through the committed parameter, return 1 if the given transaction
+ * was committed and a new one started, and 0 otherwise.
+ */
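+/*
+ * The sequence is: log an EFI (extent free intent) covering every
+ * extent on the list in the caller's transaction, commit it, then
+ * reserve a fresh transaction in which the extents are actually
+ * freed and a matching EFD (extent free done) is logged.  Log
+ * recovery replays any EFI that has no corresponding EFD.
+ */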
+/*ARGSUSED*/
+int						/* error */
+xfs_bmap_finish(
+	xfs_trans_t		**tp,		/* transaction pointer addr */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	xfs_fsblock_t		firstblock,	/* controlled ag for allocs */
+	int			*committed)	/* xact committed or not */
+{
+	xfs_efd_log_item_t	*efd;		/* extent free data */
+	xfs_efi_log_item_t	*efi;		/* extent free intention */
+	int			error;		/* error return value */
+	xfs_bmap_free_item_t	*free;		/* free extent list item */
+	unsigned int		logres;		/* new log reservation */
+	unsigned int		logcount;	/* new log count */
+	xfs_mount_t		*mp;		/* filesystem mount structure */
+	xfs_bmap_free_item_t	*next;		/* next item on free list */
+	xfs_trans_t		*ntp;		/* new transaction pointer */
+
+	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+	if (flist->xbf_count == 0) {
+		*committed = 0;
+		return 0;
+	}
+	ntp = *tp;
+	efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+	for (free = flist->xbf_first; free; free = free->xbfi_next)
+		xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+			free->xbfi_blockcount);
+	logres = ntp->t_log_res;
+	logcount = ntp->t_log_count;
+	ntp = xfs_trans_dup(*tp);
+	error = xfs_trans_commit(*tp, 0, NULL);
+	*tp = ntp;
+	*committed = 1;
+	/*
+	 * We have a new transaction, so we should return committed=1,
+	 * even though we're returning an error.
+	 */
+	if (error) {
+		return error;
+	}
+	if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
+			logcount)))
+		return error;
+	efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+	for (free = flist->xbf_first; free != NULL; free = next) {
+		next = free->xbfi_next;
+		if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+				free->xbfi_blockcount))) {
+			/*
+			 * The bmap free list will be cleaned up at a
+			 * higher level.  The EFI will be canceled when
+			 * this transaction is aborted.
+			 * Need to force shutdown here to make sure it
+			 * happens, since this transaction may not be
+			 * dirty yet.
+			 */
+			mp = ntp->t_mountp;
+			if (!XFS_FORCED_SHUTDOWN(mp))
+				xfs_force_shutdown(mp,
+						   (error == EFSCORRUPTED) ?
+						   XFS_CORRUPT_INCORE :
+						   XFS_METADATA_IO_ERROR);
+			return error;
+		}
+		xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+			free->xbfi_blockcount);
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	return 0;
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_bmap_cancel(
+	xfs_bmap_free_t		*flist) /* list of bmap_free_items */
+{
+	xfs_bmap_free_item_t	*free;	/* free list item */
+	xfs_bmap_free_item_t	*next;
+
+	if (flist->xbf_count == 0)
+		return;
+	ASSERT(flist->xbf_first != NULL);
+	for (free = flist->xbf_first; free; free = next) {
+		next = free->xbfi_next;
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	ASSERT(flist->xbf_count == 0);
+}
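+
+/*
+ * Usage sketch: the usual lifecycle of a free list across
+ * xfs_bunmapi()/xfs_bmap_finish(), assuming a caller that already holds
+ * a permanent-log-reservation transaction and the XFS_BMAP_INIT
+ * initializer from xfs_bmap.h:
+ *
+ *	xfs_bmap_free_t	flist;
+ *	xfs_fsblock_t	firstfsb;
+ *	int		committed, done;
+ *
+ *	XFS_BMAP_INIT(&flist, &firstfsb);
+ *	error = xfs_bunmapi(tp, ip, bno, len, 0, 0,
+ *			    &firstfsb, &flist, &done);
+ *	if (!error)
+ *		error = xfs_bmap_finish(&tp, &flist, firstfsb, &committed);
+ *	if (error)
+ *		xfs_bmap_cancel(&flist);	/* free any leftover items */
+ *
+ * xfs_bmap_finish() may commit *tp and hand back a new transaction, so
+ * the caller must carry on with the updated tp and honor "committed".
+ */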
+
+/*
+ * Returns EINVAL if the specified file is not swappable.
+ */
+int						/* error */
+xfs_bmap_check_swappable(
+	xfs_inode_t	*ip)			/* incore inode */
+{
+	xfs_bmbt_rec_t	*base;			/* base of extent array */
+	xfs_bmbt_rec_t	*ep;			/* pointer to an extent entry */
+	xfs_fileoff_t	end_fsb;		/* last block of file within size */
+	xfs_bmbt_irec_t ext;			/* extent list entry, decoded */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_fileoff_t	lastaddr;		/* last block number seen */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+	int		retval = 0;		/* return value */
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+
+	/*
+	 * Check for a zero length file.
+	 */
+	if (ip->i_d.di_size == 0)
+		goto check_done;
+
+	ASSERT(XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS);
+
+	ifp = &ip->i_df;
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (retval = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
+		goto check_done;
+	/*
+	 * Scan extents until the file size is reached. Look for
+	 * holes or unwritten extents, since I/O to these would cause
+	 * a transaction.
+	 */
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, ip->i_d.di_size);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	base = &ifp->if_u1.if_extents[0];
+	for (lastaddr = 0, ep = base; ep < &base[nextents]; ep++) {
+		xfs_bmbt_get_all(ep, &ext);
+		if (lastaddr < ext.br_startoff ||
+		    ext.br_state != XFS_EXT_NORM) {
+			goto error_done;
+		}
+		if (end_fsb <= (lastaddr = ext.br_startoff +
+						ext.br_blockcount))
+			goto check_done;
+	}
+error_done:
+	retval = XFS_ERROR(EINVAL);
+
+check_done:
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	return retval;
+}
+
+/*
+ * Returns the file-relative block number of the first unused block(s)
+ * in the file with at least "len" logically contiguous blocks free.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * Sets *first_unused to 0 if the file is currently local (in-inode).
+ */
+int						/* error */
+xfs_bmap_first_unused(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_extlen_t	len,			/* size of hole to find */
+	xfs_fileoff_t	*first_unused,		/* unused block */
+	int		whichfork)		/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*base;			/* base of extent array */
+	xfs_bmbt_rec_t	*ep;			/* pointer to an extent entry */
+	int		error;			/* error return value */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_fileoff_t	lastaddr;		/* last block number seen */
+	xfs_fileoff_t	lowest;			/* lowest useful block */
+	xfs_fileoff_t	max;			/* starting useful block */
+	xfs_fileoff_t	off;			/* offset for this block */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*first_unused = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	lowest = *first_unused;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	base = &ifp->if_u1.if_extents[0];
+	for (lastaddr = 0, max = lowest, ep = base;
+	     ep < &base[nextents];
+	     ep++) {
+		off = xfs_bmbt_get_startoff(ep);
+		/*
+		 * See if the hole before this extent will work.
+		 */
+		if (off >= lowest + len && off - max >= len) {
+			*first_unused = max;
+			return 0;
+		}
+		lastaddr = off + xfs_bmbt_get_blockcount(ep);
+		max = XFS_FILEOFF_MAX(lastaddr, lowest);
+	}
+	*first_unused = max;
+	return 0;
+}
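+
+/*
+ * Worked example: with hypothetical extents covering blocks [0,10) and
+ * [15,20), a caller passing len = 3 and *first_unused = 0 gets back 10:
+ * the first extent pushes max to 10, and the 5-block gap before the
+ * extent at 15 is the first hole with at least 3 contiguous free blocks.
+ */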
+
+/*
+ * Returns the file-relative block number of the last block + 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size; it is based on the extent list.
+ * Sets *last_block to 0 for local files, as they do not have an extent list.
+ */
+int						/* error */
+xfs_bmap_last_before(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_fileoff_t	*last_block,		/* last block */
+	int		whichfork)		/* data or attr fork */
+{
+	xfs_fileoff_t	bno;			/* input file offset */
+	int		eof;			/* hit end of file */
+	xfs_bmbt_rec_t	*ep;			/* pointer to last extent */
+	int		error;			/* error return value */
+	xfs_bmbt_irec_t got;			/* current extent value */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_extnum_t	lastx;			/* last extent used */
+	xfs_bmbt_irec_t prev;			/* previous extent value */
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+	       return XFS_ERROR(EIO);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*last_block = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	bno = *last_block - 1;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	if (eof || xfs_bmbt_get_startoff(ep) > bno) {
+		if (prev.br_startoff == NULLFILEOFF)
+			*last_block = 0;
+		else
+			*last_block = prev.br_startoff + prev.br_blockcount;
+	}
+	/*
+	 * Otherwise *last_block is already the right answer.
+	 */
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file.  This is not based on i_size; it is based on the extent list.
+ * Sets *last_block to 0 for local files, as they do not have an extent list.
+ */
+int						/* error */
+xfs_bmap_last_offset(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_fileoff_t	*last_block,		/* last block */
+	int		whichfork)		/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*base;			/* base of extent array */
+	xfs_bmbt_rec_t	*ep;			/* pointer to last extent */
+	int		error;			/* error return value */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+	       return XFS_ERROR(EIO);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*last_block = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (!nextents) {
+		*last_block = 0;
+		return 0;
+	}
+	base = &ifp->if_u1.if_extents[0];
+	ASSERT(base != NULL);
+	ep = &base[nextents - 1];
+	*last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep);
+	return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not.  For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int					/* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+	xfs_inode_t	*ip,		/* incore inode */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*ep;		/* ptr to fork's extent */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	int		rval;		/* return value */
+	xfs_bmbt_irec_t s;		/* internal version of extent */
+
+#ifndef DEBUG
+	if (whichfork == XFS_DATA_FORK)
+		return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize;
+#endif	/* !DEBUG */
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+		return 0;
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		return 0;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ep = ifp->if_u1.if_extents;
+	xfs_bmbt_get_all(ep, &s);
+	rval = s.br_startoff == 0 && s.br_blockcount == 1;
+	if (rval && whichfork == XFS_DATA_FORK)
+		ASSERT(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
+	return rval;
+}
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by the caller; we just traverse the btree
+ * and copy the records in.  If the file system cannot contain unwritten
+ * extents, the records are checked to verify that none has the "state"
+ * flag set.
+ */
+int					/* error */
+xfs_bmap_read_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode */
+	int			whichfork) /* data or attr fork */
+{
+	xfs_bmbt_block_t	*block; /* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_exntfmt_t		exntf;	/* XFS_EXTFMT_NOSTATE, if checking */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_read_extents";
+#endif
+	xfs_extnum_t		i;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	xfs_bmbt_ptr_t		*pp;	/* pointer to block address */
+	/* REFERENCED */
+	xfs_extnum_t		room;	/* number of entries there's room for */
+	xfs_bmbt_rec_t		*trp;	/* target record pointer */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+					XFS_EXTFMT_INODE(ip);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use XFS_BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
+	level = INT_GET(block->bb_level, ARCH_CONVERT);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+	ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
+	bno = INT_GET(*pp, ARCH_CONVERT);
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+			return error;
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		XFS_WANT_CORRUPTED_GOTO(
+			XFS_BMAP_SANITY_CHECK(mp, block, level),
+			error0);
+		if (level == 0)
+			break;
+		pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
+			1, mp->m_bmap_dmxr[1]);
+#ifndef __KERNEL__
+		XFS_WANT_CORRUPTED_GOTO(
+			XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)),
+			error0);
+#else	/* additional, temporary, debugging code */
+		if (!(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)))) {
+			cmn_err(CE_NOTE,
+			"xfs_bmap_read_extents: FSB Sanity Check:");
+			if (!(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount))
+				cmn_err(CE_NOTE,
+					"bad AG number %d >= agcount %d",
+					XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)),
+					mp->m_sb.sb_agcount);
+			if (!(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks))
+				cmn_err(CE_NOTE,
+					"bad AG block %d >= agblocks %d",
+					XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)),
+					mp->m_sb.sb_agblocks);
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto error0;
+		}
+#endif
+		bno = INT_GET(*pp, ARCH_CONVERT);
+		xfs_trans_brelse(tp, bp);
+	}
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	room = ifp->if_bytes / (uint)sizeof(*trp);
+	trp = ifp->if_u1.if_extents;
+	i = 0;
+	/*
+	 * Loop over all leaf nodes.  Copy information to the extent list.
+	 */
+	for (;;) {
+		xfs_bmbt_rec_t	*frp;
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+
+		num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+		if (i + num_recs > room) {
+			ASSERT(i + num_recs <= room);
+			xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+				"corrupt dinode %Lu, (btree extents).  "
+				"Unmount and run xfs_repair.",
+				(unsigned long long) ip->i_ino);
+			goto error0;
+		}
+#ifndef __KERNEL__
+		XFS_WANT_CORRUPTED_GOTO(
+			XFS_BMAP_SANITY_CHECK(mp, block, 0),
+			error0);
+#else	/* additional, temporary, debugging code */
+		if (!(XFS_BMAP_SANITY_CHECK(mp, block, 0))) {
+			cmn_err(CE_NOTE,
+			"xfs_bmap_read_extents: BMAP Sanity Check:");
+			if (!(INT_GET(block->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC))
+				cmn_err(CE_NOTE,
+					"bb_magic 0x%x",
+					INT_GET(block->bb_magic, ARCH_CONVERT));
+			if (!(INT_GET(block->bb_level, ARCH_CONVERT) == level))
+				cmn_err(CE_NOTE,
+					"bb_level %d",
+					INT_GET(block->bb_level, ARCH_CONVERT));
+			if (!(INT_GET(block->bb_numrecs, ARCH_CONVERT) > 0))
+				cmn_err(CE_NOTE,
+					"bb_numrecs %d",
+					INT_GET(block->bb_numrecs, ARCH_CONVERT));
+			if (!(INT_GET(block->bb_numrecs, ARCH_CONVERT) <= (mp)->m_bmap_dmxr[(level) != 0]))
+				cmn_err(CE_NOTE,
+					"bb_numrecs %d > m_bmap_dmxr[] %d",
+					INT_GET(block->bb_numrecs, ARCH_CONVERT),
+					(mp)->m_bmap_dmxr[(level) != 0]);
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto error0;
+		}
+#endif
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+		nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+		if (nextbno != NULLFSBLOCK)
+			xfs_btree_reada_bufl(mp, nextbno, 1);
+		/*
+		 * Copy records into the extent list.
+		 */
+		frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
+			block, 1, mp->m_bmap_dmxr[0]);
+		bcopy(frp, trp, num_recs * sizeof(*frp));
+		if (exntf == XFS_EXTFMT_NOSTATE) {
+			/*
+			 * Check all attribute bmap btree records and
+			 * any "older" data bmap btree records for a
+			 * set bit in the "extent flag" position.
+			 */
+			if (xfs_check_nostate_extents(trp, num_recs)) {
+				goto error0;
+			}
+		}
+		trp += num_recs;
+		i += num_recs;
+		xfs_trans_brelse(tp, bp);
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+			return error;
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+	}
+	ASSERT(i == ifp->if_bytes / (uint)sizeof(*trp));
+	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+	xfs_bmap_trace_exlist(fname, ip, i, whichfork);
+	return 0;
+error0:
+	xfs_trans_brelse(tp, bp);
+	return XFS_ERROR(EFSCORRUPTED);
+}
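+
+/*
+ * Sketch: stripped of the corruption checks, the traversal above is the
+ * standard B+tree walk, down the leftmost spine and then across the
+ * chain of leaf siblings:
+ *
+ *	bno = first pointer in the incore root;
+ *	for each level below the root:
+ *		read the block at bno, bno = its first pointer;
+ *	for (; bno != NULLFSBLOCK; bno = leaf's bb_rightsib)
+ *		copy the leaf's records into if_extents;
+ */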
+
+#ifdef XFS_BMAP_TRACE
+/*
+ * Add bmap trace insert entries for all the contents of the extent list.
+ */
+void
+xfs_bmap_trace_exlist(
+	char		*fname,		/* function name */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	cnt,		/* count of entries in the list */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*base;		/* base of extent list */
+	xfs_bmbt_rec_t	*ep;		/* current entry in extent list */
+	xfs_extnum_t	idx;		/* extent list entry number */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_irec_t s;		/* extent list record */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(cnt == ifp->if_bytes / (uint)sizeof(*base));
+	base = ifp->if_u1.if_extents;
+	for (idx = 0, ep = base; idx < cnt; idx++, ep++) {
+		xfs_bmbt_get_all(ep, &s);
+		xfs_bmap_trace_insert(fname, "exlist", ip, idx, 1, &s, NULL,
+			whichfork);
+	}
+}
+#endif
+
+#ifdef DEBUG
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the caller's original parameters.  Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	xfs_bmbt_irec_t		*mval,
+	int			nmap,
+	int			ret_nmap)
+{
+	int			i;		/* index to map values */
+
+	ASSERT(ret_nmap <= nmap);
+
+	for (i = 0; i < ret_nmap; i++) {
+		ASSERT(mval[i].br_blockcount > 0);
+		if (!(flags & XFS_BMAPI_ENTIRE)) {
+			ASSERT(mval[i].br_startoff >= bno);
+			ASSERT(mval[i].br_blockcount <= len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+			       bno + len);
+		} else {
+			ASSERT(mval[i].br_startoff < bno + len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+			       bno);
+		}
+		ASSERT(i == 0 ||
+		       mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+		       mval[i].br_startoff);
+		if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY))
+			ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+			       mval[i].br_startblock != HOLESTARTBLOCK);
+		ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+		       mval[i].br_state == XFS_EXT_UNWRITTEN);
+	}
+}
+#endif /* DEBUG */
+
+/*
+ * Map file blocks to filesystem blocks.
+ * File range is given by the bno/len pair.
+ * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
+ * into a hole or past eof.
+ * Only allocates blocks from a single allocation group,
+ * to avoid locking problems.
+ * The returned value in "firstblock" from the first call in a transaction
+ * must be remembered and presented to subsequent calls in "firstblock".
+ * An upper bound for the number of blocks to be allocated is supplied to
+ * the first call in "total"; if no allocation group has that many free
+ * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
+ */
+int					/* error */
+xfs_bmapi(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode */
+	xfs_fileoff_t	bno,		/* starting file offs. mapped */
+	xfs_filblks_t	len,		/* length to map in file */
+	int		flags,		/* XFS_BMAPI_... */
+	xfs_fsblock_t	*firstblock,	/* first allocated block
+					   controls a.g. for allocs */
+	xfs_extlen_t	total,		/* total blocks needed */
+	xfs_bmbt_irec_t *mval,		/* output: map values */
+	int		*nmap,		/* i/o: mval size/count */
+	xfs_bmap_free_t *flist)		/* i/o: list extents to free */
+{
+	xfs_fsblock_t	abno;		/* allocated block number */
+	xfs_extlen_t	alen;		/* allocated extent length */
+	xfs_fileoff_t	aoff;		/* allocated file offset */
+	xfs_bmalloca_t	bma;		/* args for xfs_bmap_alloc */
+	char		contig;		/* allocation must be one extent */
+	xfs_btree_cur_t *cur;		/* bmap btree cursor */
+	char		delay;		/* this request is for delayed alloc */
+	xfs_fileoff_t	end;		/* end of mapped file region */
+	int		eof;		/* we've hit the end of extent list */
+	xfs_bmbt_rec_t	*ep;		/* extent list entry pointer */
+	int		error;		/* error return */
+	char		exact;		/* don't do all of wasdelayed extent */
+	xfs_bmbt_irec_t got;		/* current extent list record */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_extlen_t	indlen;		/* indirect blocks length */
+	char		inhole;		/* current location is hole in file */
+	xfs_extnum_t	lastx;		/* last useful extent number */
+	int		logflags;	/* flags for transaction logging */
+	xfs_extlen_t	minleft;	/* min blocks left after allocation */
+	xfs_extlen_t	minlen;		/* min allocation size */
+	xfs_mount_t	*mp;		/* xfs mount structure */
+	int		n;		/* current extent index */
+	int		nallocs;	/* number of extents alloc'd */
+	xfs_extnum_t	nextents;	/* number of extents in file */
+	xfs_fileoff_t	obno;		/* old block number (offset) */
+	xfs_bmbt_irec_t prev;		/* previous extent list record */
+	char		stateless;	/* ignore state flag set */
+	int		tmp_logflags;	/* temp flags holder */
+	char		trim;		/* output trimmed to match range */
+	char		userdata;	/* allocating non-metadata */
+	char		wasdelay;	/* old extent was delayed */
+	int		whichfork;	/* data or attr fork */
+	char		wr;		/* this is a write request */
+	char		rsvd;		/* OK to allocate reserved blocks */
+#ifdef DEBUG
+	xfs_fileoff_t	orig_bno;	/* original block number value */
+	int		orig_flags;	/* original flags arg value */
+	xfs_filblks_t	orig_len;	/* original value of len arg */
+	xfs_bmbt_irec_t *orig_mval;	/* original value of mval */
+	int		orig_nmap;	/* original value of *nmap */
+
+	orig_bno = bno;
+	orig_len = len;
+	orig_flags = flags;
+	orig_mval = mval;
+	orig_nmap = *nmap;
+#endif
+	ASSERT(*nmap >= 1);
+	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE));
+	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+		XFS_ATTR_FORK : XFS_DATA_FORK;
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) {
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	mp = ip->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	if ((wr = (flags & XFS_BMAPI_WRITE)) != 0)
+		XFS_STATS_INC(xfsstats.xs_blk_mapw);
+	else
+		XFS_STATS_INC(xfsstats.xs_blk_mapr);
+	delay = (flags & XFS_BMAPI_DELAY) != 0;
+	trim = (flags & XFS_BMAPI_ENTIRE) == 0;
+	userdata = (flags & XFS_BMAPI_METADATA) == 0;
+	exact = (flags & XFS_BMAPI_EXACT) != 0;
+	rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
+	contig = (flags & XFS_BMAPI_CONTIG) != 0;
+	/*
+	 * stateless is used to combine extents which
+	 * differ only due to the state of the extents.
+	 * This technique is used from xfs_getbmap()
+	 * when the caller does not wish to see the
+	 * separation (which is the default).
+	 *
+	 * This technique is also used when writing a
+	 * buffer which has been partially written,
+	 * (usually by being flushed during a chunkread),
+	 * to ensure one write takes place. This also
+	 * prevents a change in the xfs inode extents at
+	 * this time, intentionally. This change occurs
+	 * on completion of the write operation, in
+	 * xfs_strat_comp(), where the xfs_bmapi() call
+	 * is transactioned, and the extents combined.
+	 */
+	stateless = (flags & XFS_BMAPI_IGSTATE) != 0;
+	if (stateless && wr)	/* if writing unwritten space, no */
+		wr = 0;		/* allocations are allowed */
+	ASSERT(wr || !delay);
+	logflags = 0;
+	nallocs = 0;
+	cur = NULL;
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		ASSERT(wr && tp);
+		if ((error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
+				&logflags, whichfork)))
+			goto error0;
+	}
+	if (wr && *firstblock == NULLFSBLOCK) {
+		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
+			minleft = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1;
+		else
+			minleft = 1;
+	} else
+		minleft = 0;
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		goto error0;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	n = 0;
+	end = bno + len;
+	obno = bno;
+	bma.ip = NULL;
+	while (bno < end && n < *nmap) {
+		/*
+		 * Reading past eof, act as though there's a hole
+		 * up to end.
+		 */
+		if (eof && !wr)
+			got.br_startoff = end;
+		inhole = eof || got.br_startoff > bno;
+		wasdelay = wr && !inhole && !delay &&
+			ISNULLSTARTBLOCK(got.br_startblock);
+		/*
+		 * First, deal with the hole before the allocated space
+		 * that we found, if any.
+		 */
+		if (wr && (inhole || wasdelay)) {
+			/*
+			 * For the wasdelay case, we could also just
+			 * allocate the stuff asked for in this bmap call
+			 * but that wouldn't be as good.
+			 */
+			if (wasdelay && !exact) {
+				alen = (xfs_extlen_t)got.br_blockcount;
+				aoff = got.br_startoff;
+				if (lastx != NULLEXTNUM && lastx) {
+					ep = &ifp->if_u1.if_extents[lastx - 1];
+					xfs_bmbt_get_all(ep, &prev);
+				}
+			} else if (wasdelay) {
+				alen = (xfs_extlen_t)
+					XFS_FILBLKS_MIN(len,
+						(got.br_startoff +
+						 got.br_blockcount) - bno);
+				aoff = bno;
+			} else {
+				alen = (xfs_extlen_t)
+					XFS_FILBLKS_MIN(len, MAXEXTLEN);
+				if (!eof)
+					alen = (xfs_extlen_t)
+						XFS_FILBLKS_MIN(alen,
+							got.br_startoff - bno);
+				aoff = bno;
+			}
+			minlen = contig ? alen : 1;
+			if (delay) {
+				indlen = (xfs_extlen_t)
+					xfs_bmap_worst_indlen(ip, alen);
+				ASSERT(indlen > 0);
+				/*
+				 * Make a transaction-less quota reservation for
+				 * delayed allocation blocks. This number gets
+				 * adjusted later.
+				 * We return EDQUOT if we haven't allocated
+				 * blocks already inside this loop.
+				 */
+				if (XFS_IS_QUOTA_ON(ip->i_mount) &&
+				    xfs_trans_reserve_blkquota(NULL, ip,
+					    (long)alen)) {
+					if (n == 0) {
+						*nmap = 0;
+						ASSERT(cur == NULL);
+						return XFS_ERROR(EDQUOT);
+					}
+					break;
+				}
+				if (xfs_mod_incore_sb(ip->i_mount,
+						XFS_SBS_FDBLOCKS,
+						-(alen + indlen), rsvd)) {
+					if (XFS_IS_QUOTA_ON(ip->i_mount))
+						xfs_trans_unreserve_blkquota(
+							NULL, ip, (long)alen);
+					break;
+				}
+				ip->i_delayed_blks += alen;
+				abno = NULLSTARTBLOCK(indlen);
+			} else {
+				/*
+				 * If first time, allocate and fill in
+				 * once-only bma fields.
+				 */
+				if (bma.ip == NULL) {
+					bma.tp = tp;
+					bma.ip = ip;
+					bma.prevp = &prev;
+					bma.gotp = &got;
+					bma.total = total;
+					bma.userdata = 0;
+				}
+				/*
+				 * Indicate if this is the first user data
+				 * in the file, or just any user data.
+				 */
+				if (userdata) {
+					bma.userdata = (aoff == 0) ?
+						XFS_ALLOC_INITIAL_USER_DATA :
+						XFS_ALLOC_USERDATA;
+				}
+				/*
+				 * Fill in changeable bma fields.
+				 */
+				bma.eof = eof;
+				bma.firstblock = *firstblock;
+				bma.alen = alen;
+				bma.off = aoff;
+				bma.wasdel = wasdelay;
+				bma.minlen = minlen;
+				bma.low = flist->xbf_low;
+				bma.minleft = minleft;
+				/*
+				 * Only want to do the alignment at the
+				 * eof if it is userdata and allocation length
+				 * is larger than a stripe unit.
+				 */
+				if (mp->m_dalign && alen >= mp->m_dalign &&
+				    userdata && whichfork == XFS_DATA_FORK) {
+					if ((error = xfs_bmap_isaeof(ip, aoff,
+							whichfork, &bma.aeof)))
+						goto error0;
+				} else
+					bma.aeof = 0;
+				/*
+				 * Call allocator.
+				 */
+				if ((error = xfs_bmap_alloc(&bma)))
+					goto error0;
+				/*
+				 * Copy out result fields.
+				 */
+				abno = bma.rval;
+				if ((flist->xbf_low = bma.low))
+					minleft = 0;
+				alen = bma.alen;
+				aoff = bma.off;
+				ASSERT(*firstblock == NULLFSBLOCK ||
+				       XFS_FSB_TO_AGNO(ip->i_mount,
+					       *firstblock) ==
+				       XFS_FSB_TO_AGNO(ip->i_mount,
+					       bma.firstblock) ||
+				       (flist->xbf_low &&
+					XFS_FSB_TO_AGNO(ip->i_mount,
+						*firstblock) <
+					XFS_FSB_TO_AGNO(ip->i_mount,
+						bma.firstblock)));
+				*firstblock = bma.firstblock;
+				if (cur)
+					cur->bc_private.b.firstblock =
+						*firstblock;
+				if (abno == NULLFSBLOCK)
+					break;
+				if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
+					cur = xfs_btree_init_cursor(ip->i_mount,
+						tp, NULL, 0, XFS_BTNUM_BMAP,
+						ip, whichfork);
+					cur->bc_private.b.firstblock =
+						*firstblock;
+					cur->bc_private.b.flist = flist;
+				}
+				/*
+				 * Bump the number of extents we've allocated
+				 * in this call.
+				 */
+				nallocs++;
+			}
+			if (cur)
+				cur->bc_private.b.flags =
+					wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0;
+			got.br_startoff = aoff;
+			got.br_startblock = abno;
+			got.br_blockcount = alen;
+			got.br_state = XFS_EXT_NORM;	/* assume normal */
+			/*
+			 * Determine the state of the extent, and whether the
+			 * filesystem supports unwritten extents.  A wasdelay
+			 * extent has been initialized, so it shouldn't be
+			 * flagged as unwritten.
+			 */
+			if (wr && XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+				if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
+					got.br_state = XFS_EXT_UNWRITTEN;
+			}
+			error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
+				firstblock, flist, &tmp_logflags, whichfork,
+				rsvd);
+			logflags |= tmp_logflags;
+			if (error)
+				goto error0;
+			lastx = ifp->if_lastex;
+			ep = &ifp->if_u1.if_extents[lastx];
+			nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+			xfs_bmbt_get_all(ep, &got);
+			ASSERT(got.br_startoff <= aoff);
+			ASSERT(got.br_startoff + got.br_blockcount >=
+				aoff + alen);
+#ifdef DEBUG
+			if (delay) {
+				ASSERT(ISNULLSTARTBLOCK(got.br_startblock));
+				ASSERT(STARTBLOCKVAL(got.br_startblock) > 0);
+			}
+			ASSERT(got.br_state == XFS_EXT_NORM ||
+			       got.br_state == XFS_EXT_UNWRITTEN);
+#endif
+			/*
+			 * Fall down into the found allocated space case.
+			 */
+		} else if (inhole) {
+			/*
+			 * Reading in a hole.
+			 */
+			mval->br_startoff = bno;
+			mval->br_startblock = HOLESTARTBLOCK;
+			mval->br_blockcount =
+				XFS_FILBLKS_MIN(len, got.br_startoff - bno);
+			mval->br_state = XFS_EXT_NORM;
+			bno += mval->br_blockcount;
+			len -= mval->br_blockcount;
+			mval++;
+			n++;
+			continue;
+		}
+		/*
+		 * Then deal with the allocated space we found.
+		 */
+		ASSERT(ep != NULL);
+		if (trim && (got.br_startoff + got.br_blockcount > obno)) {
+			if (obno > bno)
+				bno = obno;
+			ASSERT((bno >= obno) || (n == 0));
+			ASSERT(bno < end);
+			mval->br_startoff = bno;
+			if (ISNULLSTARTBLOCK(got.br_startblock)) {
+				ASSERT(!wr || delay);
+				mval->br_startblock = DELAYSTARTBLOCK;
+			} else
+				mval->br_startblock =
+					got.br_startblock +
+					(bno - got.br_startoff);
+			/*
+			 * For the length, return the minimum of what we got
+			 * and what we asked for.  We can use the len variable
+			 * here because it is modified below and we could have
+			 * been here before, if the first part of the
+			 * allocation didn't overlap what was asked for.
+			 */
+			mval->br_blockcount =
+				XFS_FILBLKS_MIN(end - bno, got.br_blockcount -
+					(bno - got.br_startoff));
+			mval->br_state = got.br_state;
+			ASSERT(mval->br_blockcount <= len);
+		} else {
+			*mval = got;
+			if (ISNULLSTARTBLOCK(mval->br_startblock)) {
+				ASSERT(!wr || delay);
+				mval->br_startblock = DELAYSTARTBLOCK;
+			}
+		}
+
+		/*
+		 * Check if writing previously allocated but
+		 * unwritten extents.
+		 */
+		if (wr && mval->br_state == XFS_EXT_UNWRITTEN &&
+		    ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) {
+			/*
+			 * Modify (by adding) the state flag, if writing.
+			 */
+			ASSERT(mval->br_blockcount <= len);
+			if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
+				cur = xfs_btree_init_cursor(ip->i_mount,
+					tp, NULL, 0, XFS_BTNUM_BMAP,
+					ip, whichfork);
+				cur->bc_private.b.firstblock =
+					*firstblock;
+				cur->bc_private.b.flist = flist;
+			}
+			mval->br_state = XFS_EXT_NORM;
+			error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
+				firstblock, flist, &tmp_logflags, whichfork,
+				rsvd);
+			logflags |= tmp_logflags;
+			if (error)
+				goto error0;
+			lastx = ifp->if_lastex;
+			ep = &ifp->if_u1.if_extents[lastx];
+			nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+			xfs_bmbt_get_all(ep, &got);
+			/*
+			 * We may have combined previously unwritten
+			 * space with written space, so generate
+			 * another request.
+			 */
+			if (mval->br_blockcount < len)
+				continue;
+		}
+
+		ASSERT(!trim ||
+		       ((mval->br_startoff + mval->br_blockcount) <= end));
+		ASSERT(!trim || (mval->br_blockcount <= len) ||
+		       (mval->br_startoff < obno));
+		bno = mval->br_startoff + mval->br_blockcount;
+		len = end - bno;
+		if (n > 0 && mval->br_startoff == mval[-1].br_startoff) {
+			ASSERT(mval->br_startblock == mval[-1].br_startblock);
+			ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
+			ASSERT(mval->br_state == mval[-1].br_state);
+			mval[-1].br_blockcount = mval->br_blockcount;
+			mval[-1].br_state = mval->br_state;
+		} else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
+			   mval[-1].br_startblock != DELAYSTARTBLOCK &&
+			   mval[-1].br_startblock != HOLESTARTBLOCK &&
+			   mval->br_startblock ==
+			   mval[-1].br_startblock + mval[-1].br_blockcount &&
+			   (stateless || mval[-1].br_state == mval->br_state)) {
+			ASSERT(mval->br_startoff ==
+			       mval[-1].br_startoff + mval[-1].br_blockcount);
+			mval[-1].br_blockcount += mval->br_blockcount;
+		} else if (n > 0 &&
+			   mval->br_startblock == DELAYSTARTBLOCK &&
+			   mval[-1].br_startblock == DELAYSTARTBLOCK &&
+			   mval->br_startoff ==
+			   mval[-1].br_startoff + mval[-1].br_blockcount) {
+			mval[-1].br_blockcount += mval->br_blockcount;
+			mval[-1].br_state = mval->br_state;
+		} else if (!((n == 0) &&
+			     ((mval->br_startoff + mval->br_blockcount) <=
+			      obno))) {
+			mval++;
+			n++;
+		}
+		/*
+		 * If we're done, stop now.  Stop when we've allocated
+		 * XFS_BMAP_MAX_NMAP extents no matter what.  Otherwise
+		 * the transaction may get too big.
+		 */
+		if (bno >= end || n >= *nmap || nallocs >= *nmap)
+			break;
+		/*
+		 * Else go on to the next record.
+		 */
+		ep++;
+		lastx++;
+		if (lastx >= nextents) {
+			eof = 1;
+			prev = got;
+		} else
+			xfs_bmbt_get_all(ep, &got);
+	}
+	ifp->if_lastex = lastx;
+	*nmap = n;
+	/*
+	 * Transform from btree to extents, give it cur.
+	 */
+	if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+		ASSERT(wr && cur);
+		error = xfs_bmap_btree_to_extents(tp, ip, cur,
+			&tmp_logflags, whichfork, 0);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
+	error = 0;
+
+error0:
+	/*
+	 * Log everything.  Do this after conversion; there's no point in
+	 * logging the extent list if we've converted to btree format.
+	 */
+	if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		logflags &= ~XFS_ILOG_FEXT(whichfork);
+	else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
+		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+		logflags &= ~XFS_ILOG_FBROOT(whichfork);
+	/*
+	 * Log whatever the flags say, even on error.  Otherwise we might miss
+	 * a case where the data is changed, an error occurs, and the change
+	 * is not logged, so we fail to shut down when we should.
+	 */
+	if (logflags) {
+		ASSERT(tp && wr);
+		xfs_trans_log_inode(tp, ip, logflags);
+	}
+	if (cur) {
+		if (!error) {
+			ASSERT(*firstblock == NULLFSBLOCK ||
+			       XFS_FSB_TO_AGNO(ip->i_mount, *firstblock) ==
+			       XFS_FSB_TO_AGNO(ip->i_mount,
+				       cur->bc_private.b.firstblock) ||
+			       (flist->xbf_low &&
+				XFS_FSB_TO_AGNO(ip->i_mount, *firstblock) <
+				XFS_FSB_TO_AGNO(ip->i_mount,
+					cur->bc_private.b.firstblock)));
+			*firstblock = cur->bc_private.b.firstblock;
+		}
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	if (!error)
+		xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+			orig_nmap, *nmap);
+	return error;
+}
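+
+/*
+ * Usage sketch: a minimal read-only mapping call, mirroring the way
+ * xfs_getbmap() below calls xfs_bmapi() (no transaction, no allocation
+ * arguments); offset_fsb and count_fsb are hypothetical names for the
+ * range in filesystem blocks:
+ *
+ *	xfs_bmbt_irec_t	map[2];
+ *	int		nmap = 2;
+ *
+ *	error = xfs_bmapi(NULL, ip, offset_fsb, count_fsb, 0,
+ *			  NULL, 0, map, &nmap, NULL);
+ *
+ * On return nmap holds the number of mappings filled in; holes come
+ * back with br_startblock == HOLESTARTBLOCK.
+ */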
+
+/*
+ * Map file blocks to filesystem blocks, simple version.
+ * One block (extent) only, read-only.
+ * The fork is selected by the whichfork argument instead of the
+ * XFS_BMAPI_ATTRFORK flag; otherwise this behaves as if xfs_bmapi
+ * had been called with XFS_BMAPI_METADATA set and all other flags clear.
+ */
+int						/* error */
+xfs_bmapi_single(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode */
+	int		whichfork,	/* data or attr fork */
+	xfs_fsblock_t	*fsb,		/* output: mapped block */
+	xfs_fileoff_t	bno)		/* starting file offs. mapped */
+{
+	int		eof;		/* we've hit the end of extent list */
+	int		error;		/* error return */
+	xfs_bmbt_irec_t got;		/* current extent list record */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_extnum_t	lastx;		/* last useful extent number */
+	xfs_bmbt_irec_t prev;		/* previous extent list record */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) {
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+#endif
+	       return XFS_ERROR(EFSCORRUPTED);
+	}
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return XFS_ERROR(EIO);
+	XFS_STATS_INC(xfsstats.xs_blk_mapr);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	(void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	/*
+	 * Reading past eof, act as though there's a hole
+	 * up to end.
+	 */
+	if (eof || got.br_startoff > bno) {
+		*fsb = NULLFSBLOCK;
+		return 0;
+	}
+	ASSERT(!ISNULLSTARTBLOCK(got.br_startblock));
+	ASSERT(bno < got.br_startoff + got.br_blockcount);
+	*fsb = got.br_startblock + (bno - got.br_startoff);
+	ifp->if_lastex = lastx;
+	return 0;
+}
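+
+/*
+ * Usage sketch, with offset_fsb a hypothetical file offset in
+ * filesystem blocks; *fsb comes back as NULLFSBLOCK when the offset
+ * falls in a hole or past eof:
+ *
+ *	xfs_fsblock_t	fsb;
+ *
+ *	error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, offset_fsb);
+ *	if (!error && fsb != NULLFSBLOCK)
+ *		... fsb is the mapped filesystem block ...
+ */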
+
+/*
+ * Unmap (remove) blocks from a file.
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value.  *done is set when the entire requested range has been
+ * unmapped, and left clear when extents remain to be removed.
+ */
+int						/* error */
+xfs_bunmapi(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		bno,		/* starting offset to unmap */
+	xfs_filblks_t		len,		/* length to unmap in file */
+	int			flags,		/* misc flags */
+	xfs_extnum_t		nexts,		/* number of extents max */
+	xfs_fsblock_t		*firstblock,	/* first allocated block
+						   controls a.g. for allocs */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	int			*done)		/* set when unmap is complete */
+{
+	int			async;		/* xactions can be async */
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_irec_t		del;		/* extent being deleted */
+	int			eof;		/* is deleting at eof */
+	xfs_bmbt_rec_t		*ep;		/* extent list entry pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		extno;		/* extent number in list */
+	xfs_bmbt_irec_t		got;		/* current extent list entry */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	int			isrt;		/* freeing in rt area */
+	xfs_extnum_t		lastx;		/* last extent index used */
+	int			logflags;	/* transaction logging flags */
+	xfs_extlen_t		mod;		/* rt extent offset */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* size of extent list */
+	xfs_bmbt_irec_t		prev;		/* previous extent list entry */
+	xfs_fileoff_t		start;		/* first file offset deleted */
+	int			tmp_logflags;	/* partial logging flags */
+	int			wasdel;		/* was a delayed alloc extent */
+	int			whichfork;	/* data or attribute fork */
+	int			rsvd;		/* OK to allocate reserved blocks */
+	xfs_fsblock_t		sum;
+
+	xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address);
+	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+		XFS_ATTR_FORK : XFS_DATA_FORK;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	mp = ip->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+	async = flags & XFS_BMAPI_ASYNC;
+	rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
+	ASSERT(len > 0);
+	ASSERT(nexts >= 0);
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*done = 1;
+		return 0;
+	}
+	XFS_STATS_INC(xfsstats.xs_blk_unmap);
+	isrt = (whichfork == XFS_DATA_FORK) &&
+	       (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
+	start = bno;
+	bno = start + len - 1;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	/*
+	 * Check to see if the given block number is past the end of the
+	 * file; back up to the last block if so...
+	 */
+	if (eof) {
+		ep = &ifp->if_u1.if_extents[--lastx];
+		xfs_bmbt_get_all(ep, &got);
+		bno = got.br_startoff + got.br_blockcount - 1;
+	}
+	logflags = 0;
+	if (ifp->if_flags & XFS_IFBROOT) {
+		ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+		cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
+			whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+	} else
+		cur = NULL;
+	extno = 0;
+	while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
+	       (nexts == 0 || extno < nexts)) {
+		/*
+		 * Is the found extent after a hole in which bno lives?
+		 * Just back up to the previous extent, if so.
+		 */
+		if (got.br_startoff > bno) {
+			if (--lastx < 0)
+				break;
+			ep--;
+			xfs_bmbt_get_all(ep, &got);
+		}
+		/*
+		 * Is the last block of this extent before the range
+		 * we're supposed to delete?  If so, we're done.
+		 */
+		bno = XFS_FILEOFF_MIN(bno,
+			got.br_startoff + got.br_blockcount - 1);
+		if (bno < start)
+			break;
+		/*
+		 * Then deal with the (possibly delayed) allocated space
+		 * we found.
+		 */
+		ASSERT(ep != NULL);
+		del = got;
+		wasdel = ISNULLSTARTBLOCK(del.br_startblock);
+		if (got.br_startoff < start) {
+			del.br_startoff = start;
+			del.br_blockcount -= start - got.br_startoff;
+			if (!wasdel)
+				del.br_startblock += start - got.br_startoff;
+		}
+		if (del.br_startoff + del.br_blockcount > bno + 1)
+			del.br_blockcount = bno + 1 - del.br_startoff;
+		sum = del.br_startblock + del.br_blockcount;
+		if (isrt &&
+		    (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
+			/*
+			 * Realtime extent not lined up at the end.
+			 * The extent could have been split into written
+			 * and unwritten pieces, or we could just be
+			 * unmapping part of it.  But we can't really
+			 * get rid of part of a realtime extent.
+			 */
+			if (del.br_state == XFS_EXT_UNWRITTEN ||
+			    !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+				/*
+				 * This piece is unwritten, or we're not
+				 * using unwritten extents.  Skip over it.
+				 */
+				ASSERT(bno >= mod);
+				bno -= mod > del.br_blockcount ?
+					del.br_blockcount : mod;
+				if (bno < got.br_startoff) {
+					if (--lastx >= 0)
+						xfs_bmbt_get_all(--ep, &got);
+				}
+				continue;
+			}
+			/*
+			 * It's written, turn it unwritten.
+			 * This is better than zeroing it.
+			 */
+			ASSERT(del.br_state == XFS_EXT_NORM);
+			ASSERT(xfs_trans_get_block_res(tp) > 0);
+			/*
+			 * If this spans a realtime extent boundary,
+			 * chop it back to the start of the one we end at.
+			 */
+			if (del.br_blockcount > mod) {
+				del.br_startoff += del.br_blockcount - mod;
+				del.br_startblock += del.br_blockcount - mod;
+				del.br_blockcount = mod;
+			}
+			del.br_state = XFS_EXT_UNWRITTEN;
+			error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
+				firstblock, flist, &logflags, XFS_DATA_FORK, 0);
+			if (error)
+				goto error0;
+			goto nodelete;
+		}
+		if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
+			/*
+			 * Realtime extent is lined up at the end but not
+			 * at the front.  We'll get rid of full extents if
+			 * we can.
+			 */
+			mod = mp->m_sb.sb_rextsize - mod;
+			if (del.br_blockcount > mod) {
+				del.br_blockcount -= mod;
+				del.br_startoff += mod;
+				del.br_startblock += mod;
+			} else if ((del.br_startoff == start &&
+				    (del.br_state == XFS_EXT_UNWRITTEN ||
+				     xfs_trans_get_block_res(tp) == 0)) ||
+				   !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+				/*
+				 * Can't make it unwritten.  There isn't
+				 * a full extent here so just skip it.
+				 */
+				ASSERT(bno >= del.br_blockcount);
+				bno -= del.br_blockcount;
+				if (bno < got.br_startoff) {
+					if (--lastx >= 0)
+						xfs_bmbt_get_all(--ep, &got);
+				}
+				continue;
+			} else if (del.br_state == XFS_EXT_UNWRITTEN) {
+				/*
+				 * This one is already unwritten.
+				 * It must have a written left neighbor.
+				 * Unwrite the killed part of that one and
+				 * try again.
+				 */
+				ASSERT(lastx > 0);
+				xfs_bmbt_get_all(ep - 1, &prev);
+				ASSERT(prev.br_state == XFS_EXT_NORM);
+				ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock));
+				ASSERT(del.br_startblock ==
+				       prev.br_startblock + prev.br_blockcount);
+				if (prev.br_startoff < start) {
+					mod = start - prev.br_startoff;
+					prev.br_blockcount -= mod;
+					prev.br_startblock += mod;
+					prev.br_startoff = start;
+				}
+				prev.br_state = XFS_EXT_UNWRITTEN;
+				error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
+					&prev, firstblock, flist, &logflags,
+					XFS_DATA_FORK, 0);
+				if (error)
+					goto error0;
+				goto nodelete;
+			} else {
+				ASSERT(del.br_state == XFS_EXT_NORM);
+				del.br_state = XFS_EXT_UNWRITTEN;
+				error = xfs_bmap_add_extent(ip, lastx, &cur,
+					&del, firstblock, flist, &logflags,
+					XFS_DATA_FORK, 0);
+				if (error)
+					goto error0;
+				goto nodelete;
+			}
+		}
+		if (wasdel) {
+			ASSERT(STARTBLOCKVAL(del.br_startblock) > 0);
+			xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
+				(int)del.br_blockcount, rsvd);
+			if (XFS_IS_QUOTA_ON(ip->i_mount)) {
+				ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
+				ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+				if (!isrt)
+					xfs_trans_unreserve_blkquota(NULL, ip,
+					      (long)del.br_blockcount);
+				else
+					xfs_trans_unreserve_rtblkquota(NULL, ip,
+					      (long)del.br_blockcount);
+			}
+			ip->i_delayed_blks -= del.br_blockcount;
+			if (cur)
+				cur->bc_private.b.flags |=
+					XFS_BTCUR_BPRV_WASDEL;
+		} else if (cur)
+			cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
+		/*
+		 * If the directory code is running with no block
+		 * reservation, the deleted block is in the middle of its
+		 * extent, and the resulting insert of an extent would
+		 * cause transformation to btree format, then reject it.
+		 * The calling code will then swap blocks around instead.
+		 * We have to do this now, rather than waiting for the
+		 * conversion to btree format, since the transaction
+		 * will be dirty.
+		 */
+		if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
+		    XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+		    XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
+		    del.br_startoff > got.br_startoff &&
+		    del.br_startoff + del.br_blockcount <
+		    got.br_startoff + got.br_blockcount) {
+			error = XFS_ERROR(ENOSPC);
+			goto error0;
+		}
+		error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
+			flags, &tmp_logflags, whichfork, rsvd);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+		bno = del.br_startoff - 1;
+nodelete:
+		lastx = ifp->if_lastex;
+		/*
+		 * If not done go on to the next (previous) record.
+		 * Reset ep in case the extents array was re-alloced.
+		 */
+		ep = &ifp->if_u1.if_extents[lastx];
+		if (bno != (xfs_fileoff_t)-1 && bno >= start) {
+			if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) ||
+			    xfs_bmbt_get_startoff(ep) > bno) {
+				lastx--;
+				ep--;
+			}
+			if (lastx >= 0)
+				xfs_bmbt_get_all(ep, &got);
+			extno++;
+		}
+	}
+	ifp->if_lastex = lastx;
+	*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	/*
+	 * Convert to a btree if necessary.
+	 */
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
+			&cur, 0, &tmp_logflags, whichfork);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+	/*
+	 * transform from btree to extents, give it cur
+	 */
+	else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+		 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+		ASSERT(cur != NULL);
+		error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+			whichfork, async);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+	/*
+	 * transform from extents to local?
+	 */
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	error = 0;
+error0:
+	/*
+	 * Log everything.  Do this after conversion; there's no point in
+	 * logging the extent list if we've converted to btree format.
+	 */
+	if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		logflags &= ~XFS_ILOG_FEXT(whichfork);
+	else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
+		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+		logflags &= ~XFS_ILOG_FBROOT(whichfork);
+	/*
+	 * Log the inode even in the error case; if the transaction
+	 * is dirty we'll need to shut down the filesystem.
+	 */
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (cur) {
+		if (!error) {
+			*firstblock = cur->bc_private.b.firstblock;
+			cur->bc_private.b.allocated = 0;
+		}
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	return error;
+}
+
+/*
+ * Fcntl interface to xfs_bmapi.
+ */
+int						/* error code */
+xfs_getbmap(
+	bhv_desc_t		*bdp,		/* XFS behavior descriptor*/
+	struct getbmap		*bmv,		/* user bmap structure */
+	void			*ap,		/* pointer to user's array */
+	int			interface)	/* interface flags */
+{
+	__int64_t		bmvend;		/* last block requested */
+	int			error;		/* return value */
+	__int64_t		fixlen;		/* length for -1 case */
+	int			i;		/* extent number */
+	xfs_inode_t		*ip;		/* xfs incore inode pointer */
+	vnode_t			*vp;		/* corresponding vnode */
+	int			lock;		/* lock state */
+	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
+	xfs_mount_t		*mp;		/* file system mount point */
+	int			nex;		/* # of user extents can do */
+	int			nexleft;	/* # of user extents left */
+	int			subnex;		/* # of bmapi's can do */
+	int			nmap;		/* number of map entries */
+	struct getbmap		out;		/* output structure */
+	int			whichfork;	/* data or attr fork */
+	int			prealloced;	/* this is a file with
+						 * preallocated data space */
+	int			sh_unwritten;	/* true if unwritten */
+						/* extents listed separately */
+	int			bmapi_flags;	/* flags for xfs_bmapi */
+	__int32_t		oflags;		/* getbmapx bmv_oflags field */
+
+	ip = XFS_BHVTOI(bdp);
+	vp = BHV_TO_VNODE(bdp);
+
+	whichfork = interface & BMV_IF_ATTRFORK ?
+				XFS_ATTR_FORK : XFS_DATA_FORK;
+	sh_unwritten = (interface & BMV_IF_PREALLOC) != 0;
+
+	/*	If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
+	 *	generate a DMAPI read event.  Otherwise, if the DM_EVENT_READ
+	 *	bit is set for the file, generate a read event in order
+	 *	that the DMAPI application may do its thing before we return
+	 *	the extents.  Usually this means restoring user file data to
+	 *	regions of the file that look like holes.
+	 *
+	 *	The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
+	 *	BMV_IF_NO_DMAPI_READ so that read events are generated.
+	 *	If this were not true, callers of ioctl( XFS_IOC_GETBMAP )
+	 *	could misinterpret holes in a DMAPI file as true holes,
+	 *	when in fact they may represent offline user data.
+	 */
+	if (   (interface & BMV_IF_NO_DMAPI_READ) == 0
+	    && DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)
+	    && whichfork == XFS_DATA_FORK) {
+
+		error = xfs_dm_send_data_event(DM_EVENT_READ, bdp,
+				0, 0, 0, NULL);
+		if (error)
+			return XFS_ERROR(error);
+	}
+
+	if (whichfork == XFS_ATTR_FORK) {
+		if (XFS_IFORK_Q(ip)) {
+			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
+			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
+			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
+				return XFS_ERROR(EINVAL);
+		} else if (ip->i_d.di_aformat != 0 &&
+			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+			cmn_err(CE_NOTE,
+				"EFSCORRUPTED returned from file %s line %d",
+				__FILE__, __LINE__);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+	} else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+		   ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+		   ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+		return XFS_ERROR(EINVAL);
+
+	mp = ip->i_mount;
+
+	if (whichfork == XFS_DATA_FORK) {
+		if (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) {
+			prealloced = 1;
+			fixlen = XFS_MAX_FILE_OFFSET;
+		} else {
+			prealloced = 0;
+			fixlen = ip->i_d.di_size;
+		}
+	} else {
+		prealloced = 0;
+		fixlen = 1LL << 32;
+	}
+
+	if (bmv->bmv_length == -1) {
+		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
+		bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset),
+					(__int64_t)0);
+	} else if (bmv->bmv_length < 0)
+		return XFS_ERROR(EINVAL);
+	if (bmv->bmv_length == 0) {
+		bmv->bmv_entries = 0;
+		return 0;
+	}
+
+	nex = bmv->bmv_count - 1;
+
+	if (nex <= 0)
+		return XFS_ERROR(EINVAL);
+
+	bmvend = bmv->bmv_offset + bmv->bmv_length;
+
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+	if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) {
+
+		VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
+	}
+
+	ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
+
+	lock = xfs_ilock_map_shared(ip);
+
+	/*
+	 * Don't let nex be bigger than the number of extents
+	 * we can have assuming alternating holes and real extents.
+	 */
+	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
+		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
+
+	bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
+			((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE);
+
+	subnex = 16;	/* XXXjtk - need a #define?	*/
+
+	/*
+	 * Allocate enough space to handle "subnex" maps at a time.
+	 */
+	map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP);
+
+	bmv->bmv_entries = 0;
+
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) {
+		error = 0;
+		goto unlock_and_return;
+	}
+
+	nexleft = nex;
+
+	do {
+	    if (nexleft > subnex)
+		    nmap = subnex;
+	    else
+		    nmap = nexleft;
+
+	    error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+					XFS_BB_TO_FSB(mp, bmv->bmv_length),
+					bmapi_flags, NULL, 0,
+					map, &nmap, NULL);
+	    ASSERT(nmap <= subnex);
+
+	    if (error)
+		    goto unlock_and_return;
+
+	    for (error = i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+		nexleft--;
+
+		oflags = 0;
+
+		out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
+		out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+
+		ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
+
+		if (   prealloced
+		    && map[i].br_startblock == HOLESTARTBLOCK
+		    && out.bmv_offset + out.bmv_length == bmvend) {
+			/*
+			 * came to hole at end of file
+			 */
+			goto unlock_and_return;
+		} else {
+			if (map[i].br_startblock == HOLESTARTBLOCK)
+				out.bmv_block = -1;
+			else
+				out.bmv_block =
+					XFS_FSB_TO_DB(ip, map[i].br_startblock);
+
+			/* return either a getbmap or a getbmapx structure. */
+
+			if (interface & BMV_IF_EXTENDED) {
+				struct	getbmapx	outx;
+
+				GETBMAP_CONVERT(out,outx);
+
+				outx.bmv_oflags = oflags;
+				outx.bmv_unused1 = outx.bmv_unused2 = 0;
+
+				if (copy_to_user(ap, &outx, sizeof(outx))) {
+					error = XFS_ERROR(EFAULT);
+					goto unlock_and_return;
+				}
+			} else {
+				if (copy_to_user(ap, &out, sizeof(out))) {
+					error = XFS_ERROR(EFAULT);
+					goto unlock_and_return;
+				}
+			}
+
+			bmv->bmv_offset = out.bmv_offset + out.bmv_length;
+			bmv->bmv_length = MAX( (__int64_t)0,
+					(__int64_t)(bmvend - bmv->bmv_offset) );
+
+			bmv->bmv_entries++;
+
+			if (interface & BMV_IF_EXTENDED)
+				ap = (void *)((struct getbmapx *)ap + 1);
+			else
+				ap = (void *)((struct getbmap *)ap + 1);
+		}
+	    }
+	} while (nmap && nexleft && bmv->bmv_length);
+
+unlock_and_return:
+	xfs_iunlock_map_shared(ip, lock);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	kmem_free(map, subnex * sizeof(*map));
+
+	return error;
+}
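+
+/*
+ * A sketch of how the getbmap interface is typically driven from
+ * user space (illustration only, assuming fd is open on an XFS
+ * file): the first array element acts as a header describing the
+ * range to map, and the remaining bmv_count - 1 elements receive
+ * the mappings.  All offsets and lengths are in 512-byte blocks.
+ *
+ *	struct getbmap	bmv[17];
+ *
+ *	bmv[0].bmv_offset  = 0;
+ *	bmv[0].bmv_length  = -1;	[map the entire file]
+ *	bmv[0].bmv_count   = 17;	[header plus up to 16 extents]
+ *	bmv[0].bmv_entries = 0;
+ *	if (ioctl(fd, XFS_IOC_GETBMAP, bmv) == 0)
+ *		[bmv[0].bmv_entries mappings follow in bmv[1..];
+ *		 bmv_block == -1 marks a hole]
+ */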
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ */
+int					/* error */
+xfs_bmap_isaeof(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fileoff_t	off,		/* file offset in fsblocks */
+	int		whichfork,	/* data or attribute fork */
+	char		*aeof)		/* return value */
+{
+	int		error;		/* error return value */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_rec_t	*lastrec;	/* extent list entry pointer */
+	xfs_extnum_t	nextents;	/* size of extent list */
+	xfs_bmbt_irec_t s;		/* expanded extent list entry */
+
+	ASSERT(whichfork == XFS_DATA_FORK);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(NULL, ip, whichfork)))
+		return error;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*aeof = 1;
+		return 0;
+	}
+	/*
+	 * Go to the last extent
+	 */
+	lastrec = &ifp->if_u1.if_extents[nextents - 1];
+	xfs_bmbt_get_all(lastrec, &s);
+	/*
+	 * Check we are allocating in the last extent (for delayed allocations)
+	 * or past the last extent for non-delayed allocations.
+	 */
+	*aeof = (off >= s.br_startoff &&
+		 off < s.br_startoff + s.br_blockcount &&
+		 ISNULLSTARTBLOCK(s.br_startblock)) ||
+		off >= s.br_startoff + s.br_blockcount;
+	return 0;
+}
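+
+/*
+ * For example: if the last extent is a delayed allocation covering
+ * file blocks [100, 110), then off == 105 sets *aeof (we are filling
+ * in the delayed extent at end of file), as does off == 110 or
+ * beyond (allocating past the last extent).  With a real
+ * (non-delayed) last extent, off == 105 leaves *aeof clear.
+ */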
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary.
+ */
+int					/* error */
+xfs_bmap_eof(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fileoff_t	endoff,		/* file offset in fsblocks */
+	int		whichfork,	/* data or attribute fork */
+	int		*eof)		/* result value */
+{
+	xfs_fsblock_t	blockcount;	/* extent block count */
+	int		error;		/* error return value */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_rec_t	*lastrec;	/* extent list entry pointer */
+	xfs_extnum_t	nextents;	/* size of extent list */
+	xfs_fileoff_t	startoff;	/* extent starting file offset */
+
+	ASSERT(whichfork == XFS_DATA_FORK);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(NULL, ip, whichfork)))
+		return error;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*eof = 1;
+		return 0;
+	}
+	/*
+	 * Go to the last extent
+	 */
+	lastrec = &ifp->if_u1.if_extents[nextents - 1];
+	startoff = xfs_bmbt_get_startoff(lastrec);
+	blockcount = xfs_bmbt_get_blockcount(lastrec);
+	*eof = endoff >= startoff + blockcount;
+	return 0;
+}
+
+#ifdef XFSDEBUG
+/*
+ * Check that the extents list for the inode ip is in the right order.
+ */
+STATIC void
+xfs_bmap_check_extents(
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t		*base;		/* base of extents list */
+	xfs_bmbt_rec_t		*ep;		/* current extent entry */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_extnum_t		nextents;	/* number of extents in list */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	base = ifp->if_u1.if_extents;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (ep = base; ep < &base[nextents - 1]; ep++) {
+		xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
+			(void *)(ep + 1));
+	}
+}
+
+STATIC
+xfs_buf_t *
+xfs_bmap_get_bp(
+	xfs_btree_cur_t		*cur,
+	xfs_fsblock_t		bno)
+{
+	int i;
+	xfs_buf_t *bp;
+
+	if (!cur)
+		return(NULL);
+
+	bp = NULL;
+	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+		bp = cur->bc_bufs[i];
+		if (!bp) break;
+		if (XFS_BUF_ADDR(bp) == bno)
+			break;	/* Found it */
+	}
+	if (i == XFS_BTREE_MAXLEVELS)
+		bp = NULL;
+
+	if (!bp) { /* Chase down all the log items to see if the bp is there */
+		xfs_log_item_chunk_t	*licp;
+		xfs_trans_t		*tp;
+
+		tp = cur->bc_tp;
+		licp = &tp->t_items;
+		while (!bp && licp != NULL) {
+			if (XFS_LIC_ARE_ALL_FREE(licp)) {
+				licp = licp->lic_next;
+				continue;
+			}
+			for (i = 0; i < licp->lic_unused; i++) {
+				xfs_log_item_desc_t	*lidp;
+				xfs_log_item_t		*lip;
+				xfs_buf_log_item_t	*bip;
+				xfs_buf_t		*lbp;
+
+				if (XFS_LIC_ISFREE(licp, i)) {
+					continue;
+				}
+
+				lidp = XFS_LIC_SLOT(licp, i);
+				lip = lidp->lid_item;
+				if (lip->li_type != XFS_LI_BUF)
+					continue;
+
+				bip = (xfs_buf_log_item_t *)lip;
+				lbp = bip->bli_buf;
+
+				if (XFS_BUF_ADDR(lbp) == bno) {
+					bp = lbp;
+					break; /* Found it */
+				}
+			}
+			licp = licp->lic_next;
+		}
+	}
+	return(bp);
+}
+
+void
+xfs_check_block(
+	xfs_bmbt_block_t	*block,
+	xfs_mount_t		*mp,
+	int			root,
+	short			sz)
+{
+	int			i, j, dmxr;
+	xfs_bmbt_ptr_t		*pp, *thispa;	/* pointer to block address */
+	xfs_bmbt_key_t		*prevp, *keyp;
+
+	ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
+
+	prevp = NULL;
+	for (i = 1; i <= INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) {
+		dmxr = mp->m_bmap_dmxr[0];
+
+		if (root) {
+			keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
+		} else {
+			keyp = XFS_BTREE_KEY_ADDR(mp->m_sb.sb_blocksize,
+				xfs_bmbt, block, i, dmxr);
+		}
+
+		if (prevp) {
+			xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp);
+		}
+		prevp = keyp;
+
+		/*
+		 * Compare the block numbers to see if there are dups.
+		 */
+
+		if (root) {
+			pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
+		} else {
+			pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
+				xfs_bmbt, block, i, dmxr);
+		}
+		for (j = i+1; j <= INT_GET(block->bb_numrecs, ARCH_CONVERT); j++) {
+			if (root) {
+				thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz);
+			} else {
+				thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
+					xfs_bmbt, block, j, dmxr);
+			}
+			if (INT_GET(*thispa, ARCH_CONVERT) == INT_GET(*pp, ARCH_CONVERT)) {
+				printk("xfs_check_block: thispa(%d) == pp(%d) %Ld\n",
+						j, i, INT_GET(*thispa, ARCH_CONVERT));
+				panic("xfs_check_block: ptrs are equal in node\n");
+			}
+		}
+	}
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.
+ */
+
+STATIC void
+xfs_bmap_check_leaf_extents(
+	xfs_btree_cur_t		*cur,	/* btree cursor or null */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_block_t	*block; /* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_extnum_t		i=0;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	xfs_bmbt_ptr_t		*pp;	/* pointer to block address */
+	xfs_bmbt_rec_t		*ep, *lastp;	/* extent pointers in block entry */
+	int			bp_release = 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+		return;
+	}
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
+	level = INT_GET(block->bb_level, ARCH_CONVERT);
+	xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+	ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
+	bno = INT_GET(*pp, ARCH_CONVERT);
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		/* See if buf is in cur first */
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (bp) {
+			bp_release = 0;
+		} else {
+			bp_release = 1;
+		}
+		if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+			goto error_norelse;
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		XFS_WANT_CORRUPTED_GOTO(
+			XFS_BMAP_SANITY_CHECK(mp, block, level),
+			error0);
+		if (level == 0)
+			break;
+
+		/*
+		 * Check this block for basic sanity (increasing keys and
+		 * no duplicate blocks).
+		 */
+
+		xfs_check_block(block, mp, 0, 0);
+		pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
+			1, mp->m_bmap_dmxr[1]);
+		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0);
+		bno = INT_GET(*pp, ARCH_CONVERT);
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+	}
+
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	i = 0;
+
+	/*
+	 * Loop over all leaf nodes, checking that all extents are in
+	 * the right order.
+	 */
+	lastp = NULL;
+	for (;;) {
+		xfs_bmbt_rec_t	*frp;
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+
+
+		num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+
+		nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+
+		/*
+		 * Check all the extents to make sure they are OK.
+		 * If we had a previous block, the last entry should
+		 * conform with the first entry in this one.
+		 */
+
+		frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
+			block, 1, mp->m_bmap_dmxr[0]);
+
+		for (ep = frp; ep < frp + (num_recs - 1); ep++) {
+			if (lastp) {
+				xfs_btree_check_rec(XFS_BTNUM_BMAP,
+					(void *)lastp, (void *)ep);
+			}
+			xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
+				(void *)(ep + 1));
+		}
+		lastp = frp + num_recs - 1; /* For the next iteration */
+
+		i += num_recs;
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (bp) {
+			bp_release = 0;
+		} else {
+			bp_release = 1;
+		}
+		if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+			goto error_norelse;
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+	}
+	if (bp_release) {
+		bp_release = 0;
+		xfs_trans_brelse(NULL, bp);
+	}
+	return;
+
+error0:
+	printk("at error0\n");
+	if (bp_release)
+		xfs_trans_brelse(NULL, bp);
+error_norelse:
+	printk("xfs_bmap_check_leaf_extents: BAD after btree leaves for %d extents\n", i);
+	panic("xfs_bmap_check_leaf_extents: CORRUPTED BTREE OR SOMETHING");
+	return;
+}
+#endif
+
+/*
+ * Count fsblocks of the given fork.
+ */
+int						/* error */
+xfs_bmap_count_blocks(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode */
+	int			whichfork,	/* data or attr fork */
+	int			*count)		/* out: count of blocks */
+{
+	xfs_bmbt_block_t	*block; /* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	xfs_bmbt_ptr_t		*pp;	/* pointer to block address */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS) {
+		if (xfs_bmap_count_leaves(ifp->if_u1.if_extents,
+			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+			count) < 0) {
+				cmn_err(CE_NOTE,
+					"EFSCORRUPTED returned from file %s line %d",
+					__FILE__, __LINE__);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		return 0;
+	}
+
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	block = ifp->if_broot;
+	ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
+	level = INT_GET(block->bb_level, ARCH_CONVERT);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+	ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
+	bno = INT_GET(*pp, ARCH_CONVERT);
+
+	if (xfs_bmap_count_tree(mp, tp, bno, level, count) < 0) {
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	return 0;
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks in use.
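+ * Each level's blocks are counted by walking the bb_rightsib chain,
+ * and the recursion then descends through the leftmost block only,
+ * so every block is visited exactly once.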
+ */
+int					/* error */
+xfs_bmap_count_tree(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_fsblock_t	blockno,	/* file system block number */
+	int		levelin,	/* level in btree */
+	int		*count)		/* Count of blocks */
+{
+	int			error;
+	xfs_buf_t		*bp, *nbp;
+	int			level = levelin;
+	xfs_bmbt_ptr_t		*pp;
+	xfs_fsblock_t		bno = blockno;
+	xfs_fsblock_t		nextbno;
+	xfs_bmbt_block_t	*block, *nextblock;
+	int			numrecs;
+	xfs_bmbt_rec_t		*frp;
+
+	if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
+		return error;
+	*count += 1;
+	block = XFS_BUF_TO_BMBT_BLOCK(bp);
+
+	if (--level) {
+		/* Not at node above leaves, count this level of nodes */
+		nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+		while (nextbno != NULLFSBLOCK) {
+			if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
+				0, &nbp, XFS_BMAP_BTREE_REF)))
+				return error;
+			*count += 1;
+			nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp);
+			nextbno = INT_GET(nextblock->bb_rightsib, ARCH_CONVERT);
+			xfs_trans_brelse(tp, nbp);
+		}
+
+		/* Dive to the next level */
+		pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
+			xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+		bno = INT_GET(*pp, ARCH_CONVERT);
+		if ((error =
+		     xfs_bmap_count_tree(mp, tp, bno, level, count)) < 0) {
+			xfs_trans_brelse(tp, bp);
+			cmn_err(CE_NOTE,
+				"EFSCORRUPTED returned from file %s line %d",
+				__FILE__, __LINE__);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		xfs_trans_brelse(tp, bp);
+	} else {
+		/* count all level 1 nodes and their leaves */
+		for (;;) {
+			nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+			numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+			frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize,
+				xfs_bmbt, block, 1, mp->m_bmap_dmxr[0]);
+			if (xfs_bmap_count_leaves(frp, numrecs, count) < 0) {
+				xfs_trans_brelse(tp, bp);
+				cmn_err(CE_NOTE,
+					"EFSCORRUPTED returned from file %s line %d",
+					__FILE__, __LINE__);
+				return XFS_ERROR(EFSCORRUPTED);
+			}
+			xfs_trans_brelse(tp, bp);
+			if (nextbno == NULLFSBLOCK)
+				break;
+			bno = nextbno;
+			if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+				return error;
+			*count += 1;
+			block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Count leaf blocks given a pointer to an extent list.
+ */
+int
+xfs_bmap_count_leaves(
+	xfs_bmbt_rec_t		*frp,
+	int			numrecs,
+	int			*count)
+{
+	int		b;
+
+	for (b = 1; b <= numrecs; b++, frp++)
+		*count += xfs_bmbt_get_blockcount(frp);
+	return 0;
+}
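+
+/*
+ * Sketch of a caller (illustration; tp and ip are assumed to be set
+ * up and locked by the caller).  Note that these routines only ever
+ * increment *count, so it must be zeroed first:
+ *
+ *	int	count = 0;
+ *	int	error;
+ *
+ *	error = xfs_bmap_count_blocks(tp, ip, XFS_DATA_FORK, &count);
+ *	[on success, count holds the fsblocks used by the data fork,
+ *	 both btree blocks and mapped extents]
+ */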
+
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bmap.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bmap.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bmap.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bmap.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BMAP_H__
+#define __XFS_BMAP_H__
+
+struct getbmap;
+struct xfs_bmbt_irec;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * List of extents to be free "later".
+ * The list is kept sorted on xbf_startblock.
+ */
+typedef struct xfs_bmap_free_item
+{
+	xfs_fsblock_t		xbfi_startblock;/* starting fs block number */
+	xfs_extlen_t		xbfi_blockcount;/* number of blocks in extent */
+	struct xfs_bmap_free_item *xbfi_next;	/* link to next entry */
+} xfs_bmap_free_item_t;
+
+/*
+ * Header for free extent list.
+ */
+typedef struct xfs_bmap_free
+{
+	xfs_bmap_free_item_t	*xbf_first;	/* list of to-be-free extents */
+	int			xbf_count;	/* count of items on list */
+	int			xbf_low;	/* kludge: alloc in low mode */
+} xfs_bmap_free_t;
+
+#define XFS_BMAP_MAX_NMAP	4
+
+/*
+ * Flags for xfs_bmapi
+ */
+#define XFS_BMAPI_WRITE		0x001	/* write operation: allocate space */
+#define XFS_BMAPI_DELAY		0x002	/* delayed write operation */
+#define XFS_BMAPI_ENTIRE	0x004	/* return entire extent, not trimmed */
+#define XFS_BMAPI_METADATA	0x008	/* mapping metadata not user data */
+#define XFS_BMAPI_EXACT		0x010	/* allocate only to spec'd bounds */
+#define XFS_BMAPI_ATTRFORK	0x020	/* use attribute fork not data */
+#define XFS_BMAPI_ASYNC		0x040	/* bunmapi xactions can be async */
+#define XFS_BMAPI_RSVBLOCKS	0x080	/* OK to alloc. reserved data blocks */
+#define XFS_BMAPI_PREALLOC	0x100	/* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE	0x200	/* Ignore state - */
+					/* combine contig. space */
+#define XFS_BMAPI_CONTIG	0x400	/* must allocate only one extent */
+#define XFS_BMAPI_DIRECT_IO	0x800	/* Flag from cxfs client, not used
+					 * by xfs directly. Indicates alloc
+					 * request is for direct I/O not
+					 * extent conversion by server */
+
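+/*
+ * These flags combine: a buffered write reserving delayed-allocation
+ * blocks would typically pass (XFS_BMAPI_WRITE | XFS_BMAPI_DELAY),
+ * while a plain read-only mapping passes no flags at all.  (Common
+ * usage only, not an exhaustive list.)
+ */
+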
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAPI_AFLAG)
+int xfs_bmapi_aflag(int w);
+#define XFS_BMAPI_AFLAG(w)	xfs_bmapi_aflag(w)
+#else
+#define XFS_BMAPI_AFLAG(w)	((w) == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0)
+#endif
+
+/*
+ * Special values for xfs_bmbt_irec_t br_startblock field.
+ */
+#define DELAYSTARTBLOCK		((xfs_fsblock_t)-1LL)
+#define HOLESTARTBLOCK		((xfs_fsblock_t)-2LL)
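+
+/*
+ * (xfs_getbmap(), for example, reports a HOLESTARTBLOCK mapping to
+ * user space as bmv_block == -1, while DELAYSTARTBLOCK marks a
+ * delayed allocation that does not yet have a real disk block.)
+ */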
+
+/*
+ * Trace operations for bmap extent tracing
+ */
+#define XFS_BMAP_KTRACE_DELETE	1
+#define XFS_BMAP_KTRACE_INSERT	2
+#define XFS_BMAP_KTRACE_PRE_UP	3
+#define XFS_BMAP_KTRACE_POST_UP 4
+
+#define XFS_BMAP_TRACE_SIZE	4096	/* size of global trace buffer */
+#define XFS_BMAP_KTRACE_SIZE	32	/* size of per-inode trace buffer */
+
+#if defined(XFS_ALL_TRACE)
+#define XFS_BMAP_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef	XFS_BMAP_TRACE
+#endif
+
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_INIT)
+void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp);
+#define XFS_BMAP_INIT(flp,fbp)	xfs_bmap_init(flp,fbp)
+#else
+#define XFS_BMAP_INIT(flp,fbp)	\
+	((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
+	 (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK)
+#endif
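+
+/*
+ * Typical allocation pattern (a sketch of how the pieces fit
+ * together, not a complete function; tp, ip, bno, len, total, mval
+ * and nmap are assumed set up by the caller).  The free list and
+ * firstblock are initialized once, threaded through xfs_bmapi(),
+ * and resolved by xfs_bmap_finish() at the end of the transaction:
+ *
+ *	xfs_bmap_free_t	flist;
+ *	xfs_fsblock_t	firstfsb;
+ *	int		committed, error;
+ *
+ *	XFS_BMAP_INIT(&flist, &firstfsb);
+ *	error = xfs_bmapi(tp, ip, bno, len, XFS_BMAPI_WRITE,
+ *			&firstfsb, total, mval, &nmap, &flist);
+ *	[check error, then...]
+ *	error = xfs_bmap_finish(&tp, &flist, firstfsb, &committed);
+ */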
+
+/*
+ * Argument structure for xfs_bmap_alloc.
+ */
+typedef struct xfs_bmalloca {
+	xfs_fsblock_t		firstblock; /* i/o first block allocated */
+	xfs_fsblock_t		rval;	/* starting block of new extent */
+	xfs_fileoff_t		off;	/* offset in file filling in */
+	struct xfs_trans	*tp;	/* transaction pointer */
+	struct xfs_inode	*ip;	/* incore inode pointer */
+	struct xfs_bmbt_irec	*prevp; /* extent before the new one */
+	struct xfs_bmbt_irec	*gotp;	/* extent after, or delayed */
+	xfs_extlen_t		alen;	/* i/o length asked/allocated */
+	xfs_extlen_t		total;	/* total blocks needed for xaction */
+	xfs_extlen_t		minlen; /* minimum allocation size (blocks) */
+	xfs_extlen_t		minleft; /* amount must be left after alloc */
+	char			eof;	/* set if allocating past last extent */
+	char			wasdel; /* replacing a delayed allocation */
+	char			userdata;/* set if is user data */
+	char			low;	/* low on space, using seq'l ags */
+	char			aeof;	/* allocated space at eof */
+} xfs_bmalloca_t;
+
+#ifdef __KERNEL__
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int					/* error code */
+xfs_bmap_add_attrfork(
+	struct xfs_inode	*ip,	/* incore inode pointer */
+	int			rsvd);	/* flag for reserved block allocation */
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+xfs_bmap_add_free(
+	xfs_fsblock_t		bno,		/* fs block number of extent */
+	xfs_filblks_t		len,		/* length of extent */
+	xfs_bmap_free_t		*flist,		/* list of extents */
+	struct xfs_mount	*mp);		/* mount point structure */
+
+/*
+ * Routine to clean up the free list data structure when
+ * an error occurs during a transaction.
+ */
+void
+xfs_bmap_cancel(
+	xfs_bmap_free_t		*flist);	/* free list to clean up */
+
+/*
+ * Routine to check if a specified inode is swap capable.
+ */
+int
+xfs_bmap_check_swappable(
+	struct xfs_inode	*ip);		/* incore inode */
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem.	Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.
+ *
+ * Return 1 if the given transaction was committed and a new one allocated,
+ * and 0 otherwise.
+ */
+int						/* error */
+xfs_bmap_finish(
+	struct xfs_trans	**tp,		/* transaction pointer addr */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	xfs_fsblock_t		firstblock,	/* controlled a.g. for allocs */
+	int			*committed);	/* xact committed or not */
+
+/*
+ * Returns the file-relative block number of the first unused block in the file.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ */
+int						/* error */
+xfs_bmap_first_unused(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_extlen_t		len,		/* size of hole to find */
+	xfs_fileoff_t		*unused,	/* unused block num */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Returns the file-relative block number of the last block + 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent list.
+ * Returns 0 for local files, as they do not have an extent list.
+ */
+int						/* error */
+xfs_bmap_last_before(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		*last_block,	/* last block */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file.  This is not based on i_size, it is based on the extent list.
+ * Returns 0 for local files, as they do not have an extent list.
+ */
+int						/* error */
+xfs_bmap_last_offset(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		*unused,	/* last block num */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not.  For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int
+xfs_bmap_one_block(
+	struct xfs_inode	*ip,		/* incore inode */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Read in the extents to if_u1.if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in.
+ */
+int						/* error */
+xfs_bmap_read_extents(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	int			whichfork);	/* data or attr fork */
+
+#if defined(XFS_BMAP_TRACE)
+/*
+ * Add bmap trace insert entries for all the contents of the extent list.
+ */
+void
+xfs_bmap_trace_exlist(
+	char			*fname,		/* function name */
+	struct xfs_inode	*ip,		/* incore inode pointer */
+	xfs_extnum_t		cnt,		/* count of entries in list */
+	int			whichfork);	/* data or attr fork */
+#else
+#define xfs_bmap_trace_exlist(f,ip,c,w)
+#endif
+
+/*
+ * Map file blocks to filesystem blocks.
+ * File range is given by the bno/len pair.
+ * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
+ * into a hole or past eof.
+ * Only allocates blocks from a single allocation group,
+ * to avoid locking problems.
+ * The returned value in "firstblock" from the first call in a transaction
+ * must be remembered and presented to subsequent calls in "firstblock".
+ * An upper bound for the number of blocks to be allocated is supplied to
+ * the first call in "total"; if no allocation group has that many free
+ * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
+ */
+int						/* error */
+xfs_bmapi(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		bno,		/* starting file offs. mapped */
+	xfs_filblks_t		len,		/* length to map in file */
+	int			flags,		/* XFS_BMAPI_... */
+	xfs_fsblock_t		*firstblock,	/* first allocated block
+						   controls a.g. for allocs */
+	xfs_extlen_t		total,		/* total blocks needed */
+	struct xfs_bmbt_irec	*mval,		/* output: map values */
+	int			*nmap,		/* i/o: mval size/count */
+	xfs_bmap_free_t		*flist);	/* i/o: list extents to free */
+
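+/*
+ * For a read-only mapping no transaction, firstblock, total or free
+ * list is needed; this mirrors the lookup made by xfs_getbmap():
+ *
+ *	xfs_bmbt_irec_t	mval[XFS_BMAP_MAX_NMAP];
+ *	int		nmap = XFS_BMAP_MAX_NMAP;
+ *
+ *	error = xfs_bmapi(NULL, ip, bno, len, 0, NULL, 0,
+ *			mval, &nmap, NULL);
+ */
+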
+/*
+ * Map file blocks to filesystem blocks, simple version.
+ * One block only, read-only.
+ * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
+ * For the other flag values, the effect is as if XFS_BMAPI_METADATA
+ * was set and all the others were clear.
+ */
+int						/* error */
+xfs_bmapi_single(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	int			whichfork,	/* data or attr fork */
+	xfs_fsblock_t		*fsb,		/* output: mapped block */
+	xfs_fileoff_t		bno);		/* starting file offs. mapped */
+
+/*
+ * Unmap (remove) blocks from a file.
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value.	If not all extents in the block range can be removed then
+ * *done is set.
+ */
+int						/* error */
+xfs_bunmapi(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		bno,		/* starting offset to unmap */
+	xfs_filblks_t		len,		/* length to unmap in file */
+	int			flags,		/* XFS_BMAPI_... */
+	xfs_extnum_t		nexts,		/* number of extents max */
+	xfs_fsblock_t		*firstblock,	/* first allocated block
+						   controls a.g. for allocs */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	int			*done);		/* set if not done yet */
+
+/*
+ * Fcntl interface to xfs_bmapi.
+ */
+int						/* error code */
+xfs_getbmap(
+	bhv_desc_t		*bdp,		/* XFS behavior descriptor*/
+	struct getbmap		*bmv,		/* user bmap structure */
+	void			*ap,		/* pointer to user's array */
+	int			iflags);	/* interface flags */
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ */
+int
+xfs_bmap_isaeof(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		off,
+	int			whichfork,
+	char			*aeof);
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary.
+ */
+int
+xfs_bmap_eof(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		endoff,
+	int			whichfork,
+	int			*eof);
+
+/*
+ * Count fsblocks of the given fork.
+ */
+int
+xfs_bmap_count_blocks(
+	xfs_trans_t		*tp,
+	xfs_inode_t		*ip,
+	int			whichfork,
+	int			*count);
+
+/*
+ * Check an extent list, which has just been read, for
+ * any bit in the extent flag field.
+ */
+int
+xfs_check_nostate_extents(
+	xfs_bmbt_rec_t		*ep,
+	xfs_extnum_t		num);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_BMAP_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bmap_btree.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bmap_btree.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bmap_btree.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bmap_btree.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,2634 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+#ifdef DEBUG
+ktrace_t	*xfs_bmbt_trace_buf;
+#endif
+
+/*
+ * Prototypes for internal btree functions.
+ */
+
+
+STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *, int);
+STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
+		xfs_bmbt_key_t *, xfs_btree_cur_t **, int *);
+STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
+
+
+#if defined(XFS_BMBT_TRACE)
+/*
+ * Add a trace buffer entry for the arguments given to the routine,
+ * generic form.
+ */
+STATIC void
+xfs_bmbt_trace_enter(
+	char		*func,
+	xfs_btree_cur_t *cur,
+	char		*s,
+	int		type,
+	int		line,
+	__psunsigned_t	a0,
+	__psunsigned_t	a1,
+	__psunsigned_t	a2,
+	__psunsigned_t	a3,
+	__psunsigned_t	a4,
+	__psunsigned_t	a5,
+	__psunsigned_t	a6,
+	__psunsigned_t	a7,
+	__psunsigned_t	a8,
+	__psunsigned_t	a9,
+	__psunsigned_t	a10)
+{
+	xfs_inode_t	*ip;
+	int		whichfork;
+
+	ip = cur->bc_private.b.ip;
+	whichfork = cur->bc_private.b.whichfork;
+	ktrace_enter(xfs_bmbt_trace_buf,
+		(void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+		(void *)func, (void *)s, (void *)ip, (void *)cur,
+		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
+		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
+		(void *)a8, (void *)a9, (void *)a10);
+	ASSERT(ip->i_btrace);
+	ktrace_enter(ip->i_btrace,
+		(void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+		(void *)func, (void *)s, (void *)ip, (void *)cur,
+		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
+		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
+		(void *)a8, (void *)a9, (void *)a10);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
+ */
+STATIC void
+xfs_bmbt_trace_argbi(
+	char		*func,
+	xfs_btree_cur_t *cur,
+	xfs_buf_t	*b,
+	int		i,
+	int		line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
+		(__psunsigned_t)b, i, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
+ */
+STATIC void
+xfs_bmbt_trace_argbii(
+	char		*func,
+	xfs_btree_cur_t *cur,
+	xfs_buf_t	*b,
+	int		i0,
+	int		i1,
+	int		line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
+		(__psunsigned_t)b, i0, i1, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for 3 block-length args
+ * and an integer arg.
+ */
+STATIC void
+xfs_bmbt_trace_argfffi(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	xfs_dfiloff_t		o,
+	xfs_dfsbno_t		b,
+	xfs_dfilblks_t		i,
+	int			j,
+	int			line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
+		o >> 32, (int)o, b >> 32, (int)b,
+		i >> 32, (int)i, (int)j, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for one integer arg.
+ */
+STATIC void
+xfs_bmbt_trace_argi(
+	char		*func,
+	xfs_btree_cur_t *cur,
+	int		i,
+	int		line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
+		i, 0, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, key.
+ */
+STATIC void
+xfs_bmbt_trace_argifk(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	int			i,
+	xfs_fsblock_t		f,
+	xfs_bmbt_key_t		*k,
+	int			line)
+{
+	xfs_dfsbno_t		d;
+	xfs_dfiloff_t		o;
+
+	d = (xfs_dfsbno_t)f;
+	o = INT_GET(k->br_startoff, ARCH_CONVERT);
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
+		i, d >> 32, (int)d, o >> 32,
+		(int)o, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, rec.
+ */
+STATIC void
+xfs_bmbt_trace_argifr(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	int			i,
+	xfs_fsblock_t		f,
+	xfs_bmbt_rec_t		*r,
+	int			line)
+{
+	xfs_dfsbno_t		b;
+	xfs_dfilblks_t		c;
+	xfs_dfsbno_t		d;
+	xfs_dfiloff_t		o;
+	xfs_bmbt_irec_t		s;
+
+	d = (xfs_dfsbno_t)f;
+	xfs_bmbt_get_all(r, &s);
+	o = (xfs_dfiloff_t)s.br_startoff;
+	b = (xfs_dfsbno_t)s.br_startblock;
+	c = s.br_blockcount;
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
+		i, d >> 32, (int)d, o >> 32,
+		(int)o, b >> 32, (int)b, c >> 32,
+		(int)c, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, key.
+ */
+STATIC void
+xfs_bmbt_trace_argik(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	int			i,
+	xfs_bmbt_key_t		*k,
+	int			line)
+{
+	xfs_dfiloff_t		o;
+
+	o = INT_GET(k->br_startoff, ARCH_CONVERT);
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
+		i, o >> 32, (int)o, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for the cursor/operation.
+ */
+STATIC void
+xfs_bmbt_trace_cursor(
+	char		*func,
+	xfs_btree_cur_t *cur,
+	char		*s,
+	int		line)
+{
+	xfs_bmbt_rec_t	r;
+
+	xfs_bmbt_set_all(&r, &cur->bc_rec.b);
+	xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
+		(cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
+		cur->bc_private.b.allocated,
+#if BMBT_USE_64
+		INT_GET(r.l0, ARCH_CONVERT) >> 32, (int)INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT) >> 32, (int)INT_GET(r.l1, ARCH_CONVERT),
+#else
+		INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT), INT_GET(r.l2, ARCH_CONVERT), INT_GET(r.l3, ARCH_CONVERT),
+#endif
+		(unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
+		(unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
+		(cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
+		(cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
+}
+
+#define XFS_BMBT_TRACE_ARGBI(c,b,i)	\
+	xfs_bmbt_trace_argbi(fname, c, b, i, __LINE__)
+#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)	\
+	xfs_bmbt_trace_argbii(fname, c, b, i, j, __LINE__)
+#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)	\
+	xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__)
+#define XFS_BMBT_TRACE_ARGI(c,i)	\
+	xfs_bmbt_trace_argi(fname, c, i, __LINE__)
+#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k)	\
+	xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__)
+#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)	\
+	xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__)
+#define XFS_BMBT_TRACE_ARGIK(c,i,k)	\
+	xfs_bmbt_trace_argik(fname, c, i, k, __LINE__)
+#define XFS_BMBT_TRACE_CURSOR(c,s)	\
+	xfs_bmbt_trace_cursor(fname, c, s, __LINE__)
+static char	ARGS[] = "args";
+static char	ENTRY[] = "entry";
+static char	ERROR[] = "error";
+#undef EXIT
+static char	EXIT[] = "exit";
+#else
+#define XFS_BMBT_TRACE_ARGBI(c,b,i)
+#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
+#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
+#define XFS_BMBT_TRACE_ARGI(c,i)
+#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k)
+#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
+#define XFS_BMBT_TRACE_ARGIK(c,i,k)
+#define XFS_BMBT_TRACE_CURSOR(c,s)
+#endif	/* XFS_BMBT_TRACE */
+
+
+/*
+ * Internal functions.
+ */
+
+/*
+ * Delete record pointed to by cur/level.
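+ *
+ * On return *stat is 0 if nothing was deleted, 1 if the record was
+ * deleted, and 2 if the record was deleted and its block was merged
+ * with a sibling, in which case the caller must also delete the
+ * key/pointer for the dead block at the next level up.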
+ */
+STATIC int					/* error */
+xfs_bmbt_delrec(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	int			async,		/* deletion can be async */
+	int			*stat)		/* success/failure */
+{
+	xfs_bmbt_block_t	*block;		/* bmap btree block */
+	xfs_fsblock_t		bno;		/* fs-relative block number */
+	xfs_buf_t		*bp;		/* buffer for block */
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_delrec";
+#endif
+	int			i;		/* loop counter */
+	int			j;		/* temp state */
+	xfs_bmbt_key_t		key;		/* bmap btree key */
+	xfs_bmbt_key_t		*kp=NULL;	/* pointer to bmap btree key */
+	xfs_fsblock_t		lbno;		/* left sibling block number */
+	xfs_buf_t		*lbp;		/* left buffer pointer */
+	xfs_bmbt_block_t	*left;		/* left btree block */
+	xfs_bmbt_key_t		*lkp;		/* left btree key */
+	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
+	int			lrecs=0;	/* left record count */
+	xfs_bmbt_rec_t		*lrp;		/* left record pointer */
+	xfs_mount_t		*mp;		/* file system mount point */
+	xfs_bmbt_ptr_t		*pp;		/* pointer to bmap block addr */
+	int			ptr;		/* key/record index */
+	xfs_fsblock_t		rbno;		/* right sibling block number */
+	xfs_buf_t		*rbp;		/* right buffer pointer */
+	xfs_bmbt_block_t	*right;		/* right btree block */
+	xfs_bmbt_key_t		*rkp;		/* right btree key */
+	xfs_bmbt_rec_t		*rp;		/* pointer to bmap btree rec */
+	xfs_bmbt_ptr_t		*rpp;		/* right address pointer */
+	xfs_bmbt_block_t	*rrblock;	/* right-right btree block */
+	xfs_buf_t		*rrbp;		/* right-right buffer pointer */
+	int			rrecs=0;	/* right record count */
+	xfs_bmbt_rec_t		*rrp;		/* right record pointer */
+	xfs_btree_cur_t		*tcur;		/* temporary btree cursor */
+	int			numrecs;	/* temporary numrec count */
+	int			numlrecs, numrrecs;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, level);
+	ptr = cur->bc_ptrs[level];
+	tcur = (xfs_btree_cur_t *)0;
+	if (ptr == 0) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	block = xfs_bmbt_get_block(cur, level, &bp);
+	numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		goto error0;
+	}
+#endif
+	if (ptr > numrecs) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	XFS_STATS_INC(xfsstats.xs_bmbt_delrec);
+	if (level > 0) {
+		kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
+		pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = ptr; i < numrecs; i++) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				goto error0;
+			}
+		}
+#endif
+		if (ptr < numrecs) {
+			ovbcopy(&kp[ptr], &kp[ptr - 1],
+				(numrecs - ptr) * sizeof(*kp));
+			ovbcopy(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */
+				(numrecs - ptr) * sizeof(*pp));
+			xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
+			xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
+		}
+	} else {
+		rp = XFS_BMAP_REC_IADDR(block, 1, cur);
+		if (ptr < numrecs) {
+			ovbcopy(&rp[ptr], &rp[ptr - 1],
+				(numrecs - ptr) * sizeof(*rp));
+			xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
+		}
+		if (ptr == 1) {
+			INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(rp));
+			kp = &key;
+		}
+	}
+	numrecs--;
+	INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
+	xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
+	/*
+	 * We're at the root level.
+	 * First, shrink the root block in-memory.
+	 * Try to get rid of the next level down.
+	 * If we can't then there's nothing left to do.
+	 */
+	if (level == cur->bc_nlevels - 1) {
+		xfs_iroot_realloc(cur->bc_private.b.ip, -1,
+			cur->bc_private.b.whichfork);
+		if ((error = xfs_bmbt_killroot(cur, async))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		goto error0;
+	}
+	if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
+		if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+	lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
+	/*
+	 * One child of root, need to get a chance to copy its contents
+	 * into the root and delete it. Can't go up to next level,
+	 * there's nothing to delete there.
+	 */
+	if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
+	    level == cur->bc_nlevels - 2) {
+		if ((error = xfs_bmbt_killroot(cur, async))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
+	if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		goto error0;
+	}
+	bno = NULLFSBLOCK;
+	if (rbno != NULLFSBLOCK) {
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_bmbt_increment(tcur, level, &i))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		rbp = tcur->bc_bufs[level];
+		right = XFS_BUF_TO_BMBT_BLOCK(rbp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+#endif
+		bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
+		if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
+		    XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
+			if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				goto error0;
+			}
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_BMAP_BLOCK_IMINRECS(level, tcur));
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				tcur = NULL;
+				if (level > 0) {
+					if ((error = xfs_bmbt_decrement(cur,
+							level, &i))) {
+						XFS_BMBT_TRACE_CURSOR(cur,
+							ERROR);
+						goto error0;
+					}
+				}
+				XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+				*stat = 1;
+				return 0;
+			}
+		}
+		rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
+		if (lbno != NULLFSBLOCK) {
+			i = xfs_btree_firstrec(tcur, level);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				goto error0;
+			}
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		}
+	}
+	if (lbno != NULLFSBLOCK) {
+		i = xfs_btree_firstrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * decrement to last in block
+		 */
+		if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		i = xfs_btree_firstrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		lbp = tcur->bc_bufs[level];
+		left = XFS_BUF_TO_BMBT_BLOCK(lbp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+#endif
+		bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
+		if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
+		    XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
+			if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				goto error0;
+			}
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_BMAP_BLOCK_IMINRECS(level, tcur));
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				tcur = NULL;
+				if (level == 0)
+					cur->bc_ptrs[0]++;
+				XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+				*stat = 1;
+				return 0;
+			}
+		}
+		lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	tcur = NULL;
+	mp = cur->bc_mp;
+	ASSERT(bno != NULLFSBLOCK);
+	if (lbno != NULLFSBLOCK &&
+	    lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
+		rbno = bno;
+		right = block;
+		rbp = bp;
+		if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		left = XFS_BUF_TO_BMBT_BLOCK(lbp);
+		if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+	} else if (rbno != NULLFSBLOCK &&
+		   rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+		   XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
+		lbno = bno;
+		left = block;
+		lbp = bp;
+		if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		right = XFS_BUF_TO_BMBT_BLOCK(rbp);
+		if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	} else {
+		if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	numlrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	numrrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
+	if (level > 0) {
+		lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
+		lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
+		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
+		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < numrrecs; i++) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				goto error0;
+			}
+		}
+#endif
+		bcopy(rkp, lkp, numrrecs * sizeof(*lkp));
+		bcopy(rpp, lpp, numrrecs * sizeof(*lpp));
+		xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
+		xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
+	} else {
+		lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
+		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
+		bcopy(rrp, lrp, numrrecs * sizeof(*lrp));
+		xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
+	}
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, numrrecs);
+	left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */
+	xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
+		if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
+				INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rrbp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
+		if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
+		xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+	}
+	xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
+		cur->bc_private.b.flist, mp);
+	if (!async)
+		xfs_trans_set_sync(cur->bc_tp);
+	cur->bc_private.b.ip->i_d.di_nblocks--;
+	xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+	if (XFS_IS_QUOTA_ON(mp) &&
+	    cur->bc_private.b.ip->i_ino != mp->m_sb.sb_uquotino &&
+	    cur->bc_private.b.ip->i_ino != mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(cur->bc_tp, cur->bc_private.b.ip,
+			XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_binval(cur->bc_tp, rbp);
+	if (bp != lbp) {
+		cur->bc_bufs[level] = lbp;
+		cur->bc_ptrs[level] += lrecs;
+		cur->bc_ra[level] = 0;
+	} else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		goto error0;
+	}
+	if (level > 0)
+		cur->bc_ptrs[level]--;
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 2;
+	return 0;
+
+error0:
+	if (tcur)
+		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+#ifdef XFSDEBUG
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_bmbt_get_rec(
+	xfs_btree_cur_t		*cur,
+	xfs_fileoff_t		*off,
+	xfs_fsblock_t		*bno,
+	xfs_filblks_t		*len,
+	xfs_exntst_t		*state,
+	int			*stat)
+{
+	xfs_bmbt_block_t	*block;
+	xfs_buf_t		*bp;
+#ifdef DEBUG
+	int			error;
+#endif
+	int			ptr;
+	xfs_bmbt_rec_t		*rp;
+
+	block = xfs_bmbt_get_block(cur, 0, &bp);
+	ptr = cur->bc_ptrs[0];
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, 0, bp)))
+		return error;
+#endif
+	if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
+		*stat = 0;
+		return 0;
+	}
+	rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
+	*off = xfs_bmbt_get_startoff(rp);
+	*bno = xfs_bmbt_get_startblock(rp);
+	*len = xfs_bmbt_get_blockcount(rp);
+	*state = xfs_bmbt_get_state(rp);
+	*stat = 1;
+	return 0;
+}
+#endif
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
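+ *
+ * On return, *bnop is NULLFSBLOCK if the insert was handled at this
+ * level; otherwise the block was split, *bnop names the new right
+ * sibling, *recp carries the record to insert at the next level up,
+ * and *curp may carry a new cursor for the caller to use.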
+ */
+STATIC int					/* error */
+xfs_bmbt_insrec(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	xfs_fsblock_t		*bnop,
+	xfs_bmbt_rec_t		*recp,
+	xfs_btree_cur_t		**curp,
+	int			*stat)		/* no-go/done/continue */
+{
+	xfs_bmbt_block_t	*block;		/* bmap btree block */
+	xfs_buf_t		*bp;		/* buffer for block */
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_insrec";
+#endif
+	int			i;		/* loop index */
+	xfs_bmbt_key_t		key;		/* bmap btree key */
+	xfs_bmbt_key_t		*kp=NULL;	/* pointer to bmap btree key */
+	int			logflags;	/* inode logging flags */
+	xfs_fsblock_t		nbno;		/* new block number */
+	struct xfs_btree_cur	*ncur;		/* new btree cursor */
+	xfs_bmbt_key_t		nkey;		/* new btree key value */
+	xfs_bmbt_rec_t		nrec;		/* new record to insert */
+	int			optr;		/* old key/record index */
+	xfs_bmbt_ptr_t		*pp;		/* pointer to bmap block addr */
+	int			ptr;		/* key/record index */
+	xfs_bmbt_rec_t		*rp=NULL;	/* pointer to bmap btree rec */
+	int			numrecs;
+
+	ASSERT(level < cur->bc_nlevels);
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
+	ncur = (xfs_btree_cur_t *)0;
+	INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(recp));
+	optr = ptr = cur->bc_ptrs[level];
+	if (ptr == 0) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	XFS_STATS_INC(xfsstats.xs_bmbt_insrec);
+	block = xfs_bmbt_get_block(cur, level, &bp);
+	numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	if (ptr <= numrecs) {
+		if (level == 0) {
+			rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
+			xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
+		} else {
+			kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
+			xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
+		}
+	}
+#endif
+	nbno = NULLFSBLOCK;
+	if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
+		if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
+			/*
+			 * A root block, that can be made bigger.
+			 */
+			xfs_iroot_realloc(cur->bc_private.b.ip, 1,
+				cur->bc_private.b.whichfork);
+			block = xfs_bmbt_get_block(cur, level, &bp);
+		} else if (level == cur->bc_nlevels - 1) {
+			if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
+			    *stat == 0) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+			xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+				logflags);
+			block = xfs_bmbt_get_block(cur, level, &bp);
+		} else {
+			if ((error = xfs_bmbt_rshift(cur, level, &i))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+			if (i) {
+				/* nothing */
+			} else {
+				if ((error = xfs_bmbt_lshift(cur, level, &i))) {
+					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+					return error;
+				}
+				if (i) {
+					optr = ptr = cur->bc_ptrs[level];
+				} else {
+					if ((error = xfs_bmbt_split(cur, level,
+							&nbno, &nkey, &ncur,
+							&i))) {
+						XFS_BMBT_TRACE_CURSOR(cur,
+							ERROR);
+						return error;
+					}
+					if (i) {
+						block = xfs_bmbt_get_block(
+							    cur, level, &bp);
+#ifdef DEBUG
+						if ((error =
+						    xfs_btree_check_lblock(cur,
+							    block, level, bp))) {
+							XFS_BMBT_TRACE_CURSOR(
+								cur, ERROR);
+							return error;
+						}
+#endif
+						ptr = cur->bc_ptrs[level];
+						xfs_bmbt_set_allf(&nrec,
+							nkey.br_startoff, 0, 0,
+							XFS_EXT_NORM);
+					} else {
+						XFS_BMBT_TRACE_CURSOR(cur,
+							EXIT);
+						*stat = 0;
+						return 0;
+					}
+				}
+			}
+		}
+	}
+	numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+	if (level > 0) {
+		kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
+		pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = numrecs; i >= ptr; i--) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT),
+					level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+		}
+#endif
+		ovbcopy(&kp[ptr - 1], &kp[ptr],
+			(numrecs - ptr + 1) * sizeof(*kp));
+		ovbcopy(&pp[ptr - 1], &pp[ptr], /* INT_: direct copy */
+			(numrecs - ptr + 1) * sizeof(*pp));
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop,
+				level))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		kp[ptr - 1] = key;
+		INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
+		numrecs++;
+		INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
+		xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
+		xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
+	} else {
+		rp = XFS_BMAP_REC_IADDR(block, 1, cur);
+		ovbcopy(&rp[ptr - 1], &rp[ptr],
+			(numrecs - ptr + 1) * sizeof(*rp));
+		rp[ptr - 1] = *recp;
+		numrecs++;
+		INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
+		xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
+	}
+	xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
+#ifdef DEBUG
+	if (ptr < numrecs) {
+		if (level == 0)
+			xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
+				rp + ptr);
+		else
+			xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
+				kp + ptr);
+	}
+#endif
+	if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	*bnop = nbno;
+	if (nbno != NULLFSBLOCK) {
+		*recp = nrec;
+		*curp = ncur;
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+}
+
+STATIC int
+xfs_bmbt_killroot(
+	xfs_btree_cur_t		*cur,
+	int			async)
+{
+	xfs_bmbt_block_t	*block;
+	xfs_bmbt_block_t	*cblock;
+	xfs_buf_t		*cbp;
+	xfs_bmbt_key_t		*ckp;
+	xfs_bmbt_ptr_t		*cpp;
+#ifdef DEBUG
+	int			error;
+#endif
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_killroot";
+#endif
+	int			i;
+	xfs_bmbt_key_t		*kp;
+	xfs_inode_t		*ip;
+	xfs_ifork_t		*ifp;
+	int			level;
+	xfs_bmbt_ptr_t		*pp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	level = cur->bc_nlevels - 1;
+	ASSERT(level >= 1);
+	/*
+	 * Don't deal with the case where the root block needs to be a leaf.
+	 * We're just going to turn the thing back into extents anyway.
+	 */
+	if (level == 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		return 0;
+	}
+	block = xfs_bmbt_get_block(cur, level, &cbp);
+	/*
+	 * Give up if the root has multiple children.
+	 */
+	if (INT_GET(block->bb_numrecs, ARCH_CONVERT) != 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		return 0;
+	}
+	/*
+	 * Only do this if the next level down will fit in the inode root.
+	 * The child block's data is copied up into the inode, so instead
+	 * of freeing the root we free the next level down.
+	 */
+	cbp = cur->bc_bufs[level - 1];
+	cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
+	if (INT_GET(cblock->bb_numrecs, ARCH_CONVERT) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		return 0;
+	}
+	ASSERT(INT_GET(cblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO);
+	ASSERT(INT_GET(cblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO);
+	ip = cur->bc_private.b.ip;
+	ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
+	ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
+	       XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
+	i = (int)(INT_GET(cblock->bb_numrecs, ARCH_CONVERT) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
+	if (i) {
+		xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
+		block = ifp->if_broot;
+	}
+	INT_MOD(block->bb_numrecs, ARCH_CONVERT, i);
+	ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) == INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
+	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
+	ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
+	bcopy(ckp, kp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * sizeof(*kp));
+	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+	cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
+#ifdef DEBUG
+	for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) {
+		if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+	}
+#endif
+	bcopy(cpp, pp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * sizeof(*pp));
+	xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
+		cur->bc_private.b.flist, cur->bc_mp);
+	if (!async)
+		xfs_trans_set_sync(cur->bc_tp);
+	ip->i_d.di_nblocks--;
+	if (XFS_IS_QUOTA_ON(cur->bc_mp) &&
+	    ip->i_ino != cur->bc_mp->m_sb.sb_uquotino &&
+	    ip->i_ino != cur->bc_mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(cur->bc_tp, ip, XFS_TRANS_DQ_BCOUNT,
+			-1L);
+	xfs_trans_binval(cur->bc_tp, cbp);
+	cur->bc_bufs[level - 1] = NULL;
+	INT_MOD(block->bb_level, ARCH_CONVERT, -1);
+	xfs_trans_log_inode(cur->bc_tp, ip,
+		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+	cur->bc_nlevels--;
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	return 0;
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_bmbt_log_keys(
+	xfs_btree_cur_t *cur,
+	xfs_buf_t	*bp,
+	int		kfirst,
+	int		klast)
+{
+#ifdef XFS_BMBT_TRACE
+	static char	fname[] = "xfs_bmbt_log_keys";
+#endif
+	xfs_trans_t	*tp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
+	tp = cur->bc_tp;
+	if (bp) {
+		xfs_bmbt_block_t	*block;
+		int			first;
+		xfs_bmbt_key_t		*kp;
+		int			last;
+
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
+		first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
+		last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
+		xfs_trans_log_buf(tp, bp, first, last);
+	} else {
+		xfs_inode_t		 *ip;
+
+		ip = cur->bc_private.b.ip;
+		xfs_trans_log_inode(tp, ip,
+			XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+}
+
+/*
+ * Log pointer values from the btree block.
+ */
+STATIC void
+xfs_bmbt_log_ptrs(
+	xfs_btree_cur_t *cur,
+	xfs_buf_t	*bp,
+	int		pfirst,
+	int		plast)
+{
+#ifdef XFS_BMBT_TRACE
+	static char	fname[] = "xfs_bmbt_log_ptrs";
+#endif
+	xfs_trans_t	*tp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
+	tp = cur->bc_tp;
+	if (bp) {
+		xfs_bmbt_block_t	*block;
+		int			first;
+		int			last;
+		xfs_bmbt_ptr_t		*pp;
+
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
+		first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
+		last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
+		xfs_trans_log_buf(tp, bp, first, last);
+	} else {
+		xfs_inode_t		*ip;
+
+		ip = cur->bc_private.b.ip;
+		xfs_trans_log_inode(tp, ip,
+			XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+}
+
+/*
+ * Look up the record.  The cursor is made to point to it, based on dir.
+ */
+STATIC int				/* error */
+xfs_bmbt_lookup(
+	xfs_btree_cur_t		*cur,
+	xfs_lookup_t		dir,
+	int			*stat)		/* success/failure */
+{
+	xfs_bmbt_block_t	*block=NULL;
+	xfs_buf_t		*bp;
+	xfs_daddr_t		d;
+	xfs_sfiloff_t		diff;
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char	fname[] = "xfs_bmbt_lookup";
+#endif
+	xfs_fsblock_t		fsbno=0;
+	int			high;
+	int			i;
+	int			keyno=0;
+	xfs_bmbt_key_t		*kkbase=NULL;
+	xfs_bmbt_key_t		*kkp;
+	xfs_bmbt_rec_t		*krbase=NULL;
+	xfs_bmbt_rec_t		*krp;
+	int			level;
+	int			low;
+	xfs_mount_t		*mp;
+	xfs_bmbt_ptr_t		*pp;
+	xfs_bmbt_irec_t		*rp;
+	xfs_fileoff_t		startoff;
+	xfs_trans_t		*tp;
+
+	XFS_STATS_INC(xfsstats.xs_bmbt_lookup);
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, (int)dir);
+	tp = cur->bc_tp;
+	mp = cur->bc_mp;
+	rp = &cur->bc_rec.b;
+	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+		if (level < cur->bc_nlevels - 1) {
+			d = XFS_FSB_TO_DADDR(mp, fsbno);
+			bp = cur->bc_bufs[level];
+			if (bp && XFS_BUF_ADDR(bp) != d)
+				bp = (xfs_buf_t *)0;
+			if (!bp) {
+				if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
+						0, &bp, XFS_BMAP_BTREE_REF))) {
+					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+					return error;
+				}
+				xfs_btree_setbuf(cur, level, bp);
+				block = XFS_BUF_TO_BMBT_BLOCK(bp);
+				if ((error = xfs_btree_check_lblock(cur, block,
+						level, bp))) {
+					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+					return error;
+				}
+			} else
+				block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		} else
+			block = xfs_bmbt_get_block(cur, level, &bp);
+		if (diff == 0)
+			keyno = 1;
+		else {
+			if (level > 0)
+				kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
+			else
+				krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
+			low = 1;
+			if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
+				ASSERT(level == 0);
+				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+				XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+				*stat = 0;
+				return 0;
+			}
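+			/*
+			 * Binary search the keys (or records at level 0)
+			 * for the target startoff.
+			 */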
+			while (low <= high) {
+				XFS_STATS_INC(xfsstats.xs_bmbt_compare);
+				keyno = (low + high) >> 1;
+				if (level > 0) {
+					kkp = kkbase + keyno - 1;
+					startoff = INT_GET(kkp->br_startoff, ARCH_CONVERT);
+				} else {
+					krp = krbase + keyno - 1;
+					startoff = xfs_bmbt_get_startoff(krp);
+				}
+				diff = (xfs_sfiloff_t)
+						(startoff - rp->br_startoff);
+				if (diff < 0)
+					low = keyno + 1;
+				else if (diff > 0)
+					high = keyno - 1;
+				else
+					break;
+			}
+		}
+		if (level > 0) {
+			if (diff > 0 && --keyno < 1)
+				keyno = 1;
+			pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
+#ifdef DEBUG
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+#endif
+			fsbno = INT_GET(*pp, ARCH_CONVERT);
+			cur->bc_ptrs[level] = keyno;
+		}
+	}
+	if (dir != XFS_LOOKUP_LE && diff < 0) {
+		keyno++;
+		/*
+		 * If ge search and we went off the end of the block, but it's
+		 * not the last block, we're in the wrong block.
+		 */
+		if (dir == XFS_LOOKUP_GE && keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
+		    INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
+			cur->bc_ptrs[0] = keyno;
+			if ((error = xfs_bmbt_increment(cur, 0, &i))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+			XFS_WANT_CORRUPTED_RETURN(i == 1);
+			XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+			*stat = 1;
+			return 0;
+		}
+	}
+	else if (dir == XFS_LOOKUP_LE && diff > 0)
+		keyno--;
+	cur->bc_ptrs[0] = keyno;
+	if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+	} else {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
+	}
+	return 0;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int					/* error */
+xfs_bmbt_lshift(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_lshift";
+#endif
+#ifdef DEBUG
+	int			i;		/* loop counter */
+#endif
+	xfs_bmbt_key_t		key;		/* bmap btree key */
+	xfs_buf_t		*lbp;		/* left buffer pointer */
+	xfs_bmbt_block_t	*left;		/* left btree block */
+	xfs_bmbt_key_t		*lkp=NULL;	/* left btree key */
+	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
+	int			lrecs;		/* left record count */
+	xfs_bmbt_rec_t		*lrp=NULL;	/* left record pointer */
+	xfs_mount_t		*mp;		/* file system mount point */
+	xfs_buf_t		*rbp;		/* right buffer pointer */
+	xfs_bmbt_block_t	*right;		/* right btree block */
+	xfs_bmbt_key_t		*rkp=NULL;	/* right btree key */
+	xfs_bmbt_ptr_t		*rpp=NULL;	/* right address pointer */
+	xfs_bmbt_rec_t		*rrp=NULL;	/* right record pointer */
+	int			rrecs;		/* right record count */
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, level);
+	if (level == cur->bc_nlevels - 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	rbp = cur->bc_bufs[level];
+	right = XFS_BUF_TO_BMBT_BLOCK(rbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	if (cur->bc_ptrs[level] <= 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	mp = cur->bc_mp;
+	if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0,
+			&lbp, XFS_BMAP_BTREE_REF))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
+	if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
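+	/*
+	 * The left block has room: move the right block's first entry
+	 * to the new last slot of the left block.
+	 */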
+	lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
+	if (level > 0) {
+		lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
+		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
+		*lkp = *rkp;
+		xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
+		lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
+		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		*lpp = *rpp; /* INT_: direct copy */
+		xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
+	} else {
+		lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
+		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
+		*lrp = *rrp;
+		xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
+	}
+	INT_SET(left->bb_numrecs, ARCH_CONVERT, lrecs);
+	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
+#ifdef DEBUG
+	if (level > 0)
+		xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
+	else
+		xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
+#endif
+	rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1;
+	INT_SET(right->bb_numrecs, ARCH_CONVERT, rrecs);
+	xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
+	if (level > 0) {
+#ifdef DEBUG
+		for (i = 0; i < rrecs; i++) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
+					level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+		}
+#endif
+		ovbcopy(rkp + 1, rkp, rrecs * sizeof(*rkp));
+		ovbcopy(rpp + 1, rpp, rrecs * sizeof(*rpp));
+		xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
+		xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
+	} else {
+		ovbcopy(rrp + 1, rrp, rrecs * sizeof(*rrp));
+		xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
+		INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(rrp));
+		rkp = &key;
+	}
+	if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	cur->bc_ptrs[level]--;
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int					/* error */
+xfs_bmbt_rshift(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_rshift";
+#endif
+	int			i;		/* loop counter */
+	xfs_bmbt_key_t		key;		/* bmap btree key */
+	xfs_buf_t		*lbp;		/* left buffer pointer */
+	xfs_bmbt_block_t	*left;		/* left btree block */
+	xfs_bmbt_key_t		*lkp;		/* left btree key */
+	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
+	xfs_bmbt_rec_t		*lrp;		/* left record pointer */
+	xfs_mount_t		*mp;		/* file system mount point */
+	xfs_buf_t		*rbp;		/* right buffer pointer */
+	xfs_bmbt_block_t	*right;		/* right btree block */
+	xfs_bmbt_key_t		*rkp;		/* right btree key */
+	xfs_bmbt_ptr_t		*rpp;		/* right address pointer */
+	xfs_bmbt_rec_t		*rrp=NULL;	/* right record pointer */
+	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, level);
+	if (level == cur->bc_nlevels - 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	lbp = cur->bc_bufs[level];
+	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	mp = cur->bc_mp;
+	if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
+			&rbp, XFS_BMAP_BTREE_REF))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	right = XFS_BUF_TO_BMBT_BLOCK(rbp);
+	if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
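+	/*
+	 * The right block has room: shift its entries right one slot and
+	 * copy the left block's last entry into slot 1.
+	 */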
+	if (level > 0) {
+		lkp = XFS_BMAP_KEY_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		lpp = XFS_BMAP_PTR_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
+		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+		}
+#endif
+		ovbcopy(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		ovbcopy(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		*rkp = *lkp;
+		*rpp = *lpp; /* INT_: direct copy */
+		xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		xfs_bmbt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+	} else {
+		lrp = XFS_BMAP_REC_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
+		ovbcopy(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		*rrp = *lrp;
+		xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(rrp));
+		rkp = &key;
+	}
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
+	INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+#ifdef DEBUG
+	if (level > 0)
+		xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
+	else
+		xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
+#endif
+	xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
+	if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	i = xfs_btree_lastrec(tcur, level);
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	if ((error = xfs_bmbt_increment(tcur, level, &i))) {
+		XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
+		goto error1;
+	}
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
+		XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
+		goto error1;
+	}
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+error0:
+	XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+error1:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Determine the extent state.
+ */
+/* ARGSUSED */
+STATIC xfs_exntst_t
+xfs_extent_state(
+	xfs_filblks_t		blks,
+	int			extent_flag)
+{
+	if (extent_flag) {
+		ASSERT(blks != 0);	/* saved for DMIG */
+		return XFS_EXT_UNWRITTEN;
+	}
+	return XFS_EXT_NORM;
+}
+
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and its first record (to be inserted into parent).
+ */
+STATIC int					/* error */
+xfs_bmbt_split(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	xfs_fsblock_t		*bnop,
+	xfs_bmbt_key_t		*keyp,
+	xfs_btree_cur_t		**curp,
+	int			*stat)		/* success/failure */
+{
+	xfs_alloc_arg_t		args;		/* block allocation args */
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_split";
+#endif
+	int			i;		/* loop counter */
+	xfs_fsblock_t		lbno;		/* left sibling block number */
+	xfs_buf_t		*lbp;		/* left buffer pointer */
+	xfs_bmbt_block_t	*left;		/* left btree block */
+	xfs_bmbt_key_t		*lkp;		/* left btree key */
+	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
+	xfs_bmbt_rec_t		*lrp;		/* left record pointer */
+	xfs_buf_t		*rbp;		/* right buffer pointer */
+	xfs_bmbt_block_t	*right;		/* right btree block */
+	xfs_bmbt_key_t		*rkp;		/* right btree key */
+	xfs_bmbt_ptr_t		*rpp;		/* right address pointer */
+	xfs_bmbt_block_t	*rrblock;	/* right-right btree block */
+	xfs_buf_t		*rrbp;		/* right-right buffer pointer */
+	xfs_bmbt_rec_t		*rrp;		/* right record pointer */
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp);
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	lbp = cur->bc_bufs[level];
+	lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
+	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
+	args.fsbno = cur->bc_private.b.firstblock;
+	if (args.fsbno == NULLFSBLOCK) {
+		args.fsbno = lbno;
+		args.type = XFS_ALLOCTYPE_START_BNO;
+	} else if (cur->bc_private.b.flist->xbf_low)
+		args.type = XFS_ALLOCTYPE_FIRST_AG;
+	else
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	args.mod = args.minleft = args.alignment = args.total = args.isfl =
+		args.userdata = args.minalignslop = 0;
+	args.minlen = args.maxlen = args.prod = 1;
+	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+	if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return XFS_ERROR(ENOSPC);
+	}
+	if ((error = xfs_alloc_vextent(&args))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	if (args.fsbno == NULLFSBLOCK) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	cur->bc_private.b.ip->i_d.di_nblocks++;
+	xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+	if (XFS_IS_QUOTA_ON(args.mp) &&
+	    cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_uquotino &&
+	    cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+			XFS_TRANS_DQ_BCOUNT, 1L);
+	rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
+	right = XFS_BUF_TO_BMBT_BLOCK(rbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	INT_SET(right->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
+	right->bb_level = left->bb_level; /* INT_: direct copy */
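+	/*
+	 * Give the new right block half of the left block's entries; if
+	 * the count is odd, the extra entry goes right when the insertion
+	 * point will stay in the left block, so the insert has room.
+	 */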
+	INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
+	if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
+	    cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
+		INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+	i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
+	if (level > 0) {
+		lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
+		lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
+		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
+		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+		}
+#endif
+		bcopy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		bcopy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+		xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_bmbt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT);
+	} else {
+		lrp = XFS_BMAP_REC_IADDR(left, i, cur);
+		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
+		bcopy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		keyp->br_startoff = xfs_bmbt_get_startoff(rrp);
+	}
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
+	right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
+	INT_SET(left->bb_rightsib, ARCH_CONVERT, args.fsbno);
+	INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
+	xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
+	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+	if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
+		if ((error = xfs_btree_read_bufl(args.mp, args.tp,
+				INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, &rrbp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
+		if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, args.fsbno);
+		xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+	}
+	if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
+		xfs_btree_setbuf(cur, level, rbp);
+		cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	if (level + 1 < cur->bc_nlevels) {
+		if ((error = xfs_btree_dup_cursor(cur, curp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		(*curp)->bc_ptrs[level + 1]++;
+	}
+	*bnop = args.fsbno;
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+}
+
+
+/*
+ * Update keys for the record.
+ */
+STATIC int
+xfs_bmbt_updkey(
+	xfs_btree_cur_t		*cur,
+	xfs_bmbt_key_t		*keyp,	/* on-disk format */
+	int			level)
+{
+	xfs_bmbt_block_t	*block;
+	xfs_buf_t		*bp;
+#ifdef DEBUG
+	int			error;
+#endif
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_updkey";
+#endif
+	xfs_bmbt_key_t		*kp;
+	int			ptr;
+
+	ASSERT(level >= 1);
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
+	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+		block = xfs_bmbt_get_block(cur, level, &bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		ptr = cur->bc_ptrs[level];
+		kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
+		*kp = *keyp;
+		xfs_bmbt_log_keys(cur, bp, ptr, ptr);
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	return 0;
+}
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+void
+xfs_bmdr_to_bmbt(
+	xfs_bmdr_block_t	*dblock,
+	int			dblocklen,
+	xfs_bmbt_block_t	*rblock,
+	int			rblocklen)
+{
+	int			dmxr;
+	xfs_bmbt_key_t		*fkp;
+	xfs_bmbt_ptr_t		*fpp;
+	xfs_bmbt_key_t		*tkp;
+	xfs_bmbt_ptr_t		*tpp;
+
+	INT_SET(rblock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
+	rblock->bb_level = dblock->bb_level;	/* both in on-disk format */
+	ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0);
+	rblock->bb_numrecs = dblock->bb_numrecs;/* both in on-disk format */
+	INT_SET(rblock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
+	INT_SET(rblock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
+	dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
+	fkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
+	tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
+	fpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
+	tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+	dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT);
+	bcopy(fkp, tkp, sizeof(*fkp) * dmxr);
+	bcopy(fpp, tpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int						/* error */
+xfs_bmbt_decrement(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	xfs_bmbt_block_t	*block;
+	xfs_buf_t		*bp;
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_decrement";
+#endif
+	xfs_fsblock_t		fsbno;
+	int			lev;
+	xfs_mount_t		*mp;
+	xfs_trans_t		*tp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, level);
+	ASSERT(level < cur->bc_nlevels);
+	if (level < cur->bc_nlevels - 1)
+		xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+	if (--cur->bc_ptrs[level] > 0) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	block = xfs_bmbt_get_block(cur, level, &bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
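+	/*
+	 * We ran off the start of this block.  Go up until some level can
+	 * be decremented, then come back down, pointing each level at the
+	 * last record of its block.
+	 */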
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		if (--cur->bc_ptrs[lev] > 0)
+			break;
+		if (lev < cur->bc_nlevels - 1)
+			xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+	}
+	if (lev == cur->bc_nlevels) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	tp = cur->bc_tp;
+	mp = cur->bc_mp;
+	for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
+		fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ */
+int					/* error */
+xfs_bmbt_delete(
+	xfs_btree_cur_t *cur,
+	int		async,		/* deletion can be async */
+	int		*stat)		/* success/failure */
+{
+	int		error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char	fname[] = "xfs_bmbt_delete";
+#endif
+	int		i;
+	int		level;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
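+	/*
+	 * xfs_bmbt_delrec sets *stat to 2 when a block join means the
+	 * delete must be applied at the next level up, so keep walking up
+	 * until it returns 0 or 1.
+	 */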
+	for (level = 0, i = 2; i == 2; level++) {
+		if ((error = xfs_bmbt_delrec(cur, level, async, &i))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+	}
+	if (i == 0) {
+		for (level = 1; level < cur->bc_nlevels; level++) {
+			if (cur->bc_ptrs[level] == 0) {
+				if ((error = xfs_bmbt_decrement(cur, level,
+						&i))) {
+					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+					return error;
+				}
+				break;
+			}
+		}
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = i;
+	return 0;
+}
+
+/*
+ * Convert a compressed bmap extent record to an uncompressed form.
+ * This code must be in sync with the routines xfs_bmbt_get_startoff,
+ * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
+ */
+void
+xfs_bmbt_get_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s)
+{
+	int	ext_flag;
+	xfs_exntst_t st;
+	__uint64_t	l0, l1;
+
+	l0 = INT_GET(r->l0, ARCH_CONVERT);
+	l1 = INT_GET(r->l1, ARCH_CONVERT);
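+	/*
+	 * Unpack the 128-bit record: l0 bit 63 is the extent flag, l0
+	 * bits 9-62 are startoff, l0 bits 0-8 plus l1 bits 21-63 are
+	 * startblock, and l1 bits 0-20 are blockcount.
+	 */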
+	ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
+	s->br_startoff = ((xfs_fileoff_t)l0 &
+			   XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+#if XFS_BIG_FILESYSTEMS
+	s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) |
+			   (((xfs_fsblock_t)l1) >> 21);
+#else
+#ifdef DEBUG
+	{
+		xfs_dfsbno_t	b;
+
+		b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) |
+		    (((xfs_dfsbno_t)l1) >> 21);
+		ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+		s->br_startblock = (xfs_fsblock_t)b;
+	}
+#else	/* !DEBUG */
+	s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
+#endif	/* DEBUG */
+#endif	/* XFS_BIG_FILESYSTEMS */
+	s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21));
+	/* This is xfs_extent_state() in-line */
+	if (ext_flag) {
+		ASSERT(s->br_blockcount != 0);	/* saved for DMIG */
+		st = XFS_EXT_UNWRITTEN;
+	} else
+		st = XFS_EXT_NORM;
+	s->br_state = st;
+}
+
+/*
+ * Get the block pointer for the given level of the cursor.
+ * Fill in the buffer pointer, if applicable.
+ */
+xfs_bmbt_block_t *
+xfs_bmbt_get_block(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	xfs_buf_t		**bpp)
+{
+	xfs_ifork_t		*ifp;
+	xfs_bmbt_block_t	*rval;
+
+	if (level < cur->bc_nlevels - 1) {
+		*bpp = cur->bc_bufs[level];
+		rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
+	} else {
+		*bpp = 0;
+		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+			cur->bc_private.b.whichfork);
+		rval = ifp->if_broot;
+	}
+	return rval;
+}
+
+/*
+ * Extract the blockcount field from a bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_get_blockcount(
+	xfs_bmbt_rec_t	*r)
+{
+	return (xfs_filblks_t)(INT_GET(r->l1, ARCH_CONVERT) & XFS_MASK64LO(21));
+}
+
+/*
+ * Extract the startblock field from a bmap extent record.
+ */
+xfs_fsblock_t
+xfs_bmbt_get_startblock(
+	xfs_bmbt_rec_t	*r)
+{
+#if XFS_BIG_FILESYSTEMS
+	return (((xfs_fsblock_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) |
+	       (((xfs_fsblock_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
+#else
+#ifdef DEBUG
+	xfs_dfsbno_t	b;
+
+	b = (((xfs_dfsbno_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) |
+	    (((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
+	ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+	return (xfs_fsblock_t)b;
+#else	/* !DEBUG */
+	return (xfs_fsblock_t)(((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
+#endif	/* DEBUG */
+#endif	/* XFS_BIG_FILESYSTEMS */
+}
+
+/*
+ * Extract the startoff field from a bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_get_startoff(
+	xfs_bmbt_rec_t	*r)
+{
+	return ((xfs_fileoff_t)INT_GET(r->l0, ARCH_CONVERT) &
+		 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
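+/*
+ * Extract the extent state field from a bmap extent record.
+ */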
+xfs_exntst_t
+xfs_bmbt_get_state(
+	xfs_bmbt_rec_t	*r)
+{
+	int	ext_flag;
+
+	ext_flag = (int)((INT_GET(r->l0, ARCH_CONVERT)) >> (64 - BMBT_EXNTFLAG_BITLEN));
+	return xfs_extent_state(xfs_bmbt_get_blockcount(r),
+				ext_flag);
+}
+
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int						/* error */
+xfs_bmbt_increment(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	xfs_bmbt_block_t	*block;
+	xfs_buf_t		*bp;
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_increment";
+#endif
+	xfs_fsblock_t		fsbno;
+	int			lev;
+	xfs_mount_t		*mp;
+	xfs_trans_t		*tp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, level);
+	ASSERT(level < cur->bc_nlevels);
+	if (level < cur->bc_nlevels - 1)
+		xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+	block = xfs_bmbt_get_block(cur, level, &bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
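+	/*
+	 * We ran off the end of this block.  Go up until some level can
+	 * be incremented, then come back down, pointing each level at the
+	 * first record of its block.
+	 */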
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		block = xfs_bmbt_get_block(cur, lev, &bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
+			break;
+		if (lev < cur->bc_nlevels - 1)
+			xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+	}
+	if (lev == cur->bc_nlevels) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	tp = cur->bc_tp;
+	mp = cur->bc_mp;
+	for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
+		fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		cur->bc_ptrs[lev] = 1;
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Insert the current record at the point referenced by cur.
+ */
+int					/* error */
+xfs_bmbt_insert(
+	xfs_btree_cur_t *cur,
+	int		*stat)		/* success/failure */
+{
+	int		error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char	fname[] = "xfs_bmbt_insert";
+#endif
+	int		i;
+	int		level;
+	xfs_fsblock_t	nbno;
+	xfs_btree_cur_t *ncur;
+	xfs_bmbt_rec_t	nrec;
+	xfs_btree_cur_t *pcur;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	level = 0;
+	nbno = NULLFSBLOCK;
+	xfs_bmbt_set_all(&nrec, &cur->bc_rec.b);
+	ncur = (xfs_btree_cur_t *)0;
+	pcur = cur;
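+	/*
+	 * Insert at each level going up.  A split hands back a new block
+	 * number (nbno) and a record (nrec) to insert in the parent, and
+	 * may also hand back a new cursor (ncur) to continue with.
+	 */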
+	do {
+		if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
+				&i))) {
+			if (pcur != cur)
+				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
+			cur->bc_nlevels = pcur->bc_nlevels;
+			cur->bc_private.b.allocated +=
+				pcur->bc_private.b.allocated;
+			pcur->bc_private.b.allocated = 0;
+			ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
+			       (cur->bc_private.b.ip->i_d.di_flags &
+				XFS_DIFLAG_REALTIME));
+			cur->bc_private.b.firstblock =
+				pcur->bc_private.b.firstblock;
+			ASSERT(cur->bc_private.b.flist ==
+			       pcur->bc_private.b.flist);
+			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+		}
+		if (ncur) {
+			pcur = ncur;
+			ncur = (xfs_btree_cur_t *)0;
+		}
+	} while (nbno != NULLFSBLOCK);
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = i;
+	return 0;
+error0:
+	XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+	return error;
+}
+
+/*
+ * Log fields from the btree block header.
+ */
+void
+xfs_bmbt_log_block(
+	xfs_btree_cur_t		*cur,
+	xfs_buf_t		*bp,
+	int			fields)
+{
+	int			first;
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_log_block";
+#endif
+	int			last;
+	xfs_trans_t		*tp;
+	static const short	offsets[] = {
+		offsetof(xfs_bmbt_block_t, bb_magic),
+		offsetof(xfs_bmbt_block_t, bb_level),
+		offsetof(xfs_bmbt_block_t, bb_numrecs),
+		offsetof(xfs_bmbt_block_t, bb_leftsib),
+		offsetof(xfs_bmbt_block_t, bb_rightsib),
+		sizeof(xfs_bmbt_block_t)
+	};
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
+	tp = cur->bc_tp;
+	if (bp) {
+		xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
+				  &last);
+		xfs_trans_log_buf(tp, bp, first, last);
+	} else
+		xfs_trans_log_inode(tp, cur->bc_private.b.ip,
+			XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_bmbt_log_recs(
+	xfs_btree_cur_t		*cur,
+	xfs_buf_t		*bp,
+	int			rfirst,
+	int			rlast)
+{
+	xfs_bmbt_block_t	*block;
+	int			first;
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_log_recs";
+#endif
+	int			last;
+	xfs_bmbt_rec_t		*rp;
+	xfs_trans_t		*tp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
+	ASSERT(bp);
+	tp = cur->bc_tp;
+	block = XFS_BUF_TO_BMBT_BLOCK(bp);
+	rp = XFS_BMAP_REC_DADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(tp, bp, first, last);
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+}
+
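+/*
+ * Wrappers that store the search record in the cursor and call
+ * xfs_bmbt_lookup() with the appropriate direction.
+ */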
+int					/* error */
+xfs_bmbt_lookup_eq(
+	xfs_btree_cur_t *cur,
+	xfs_fileoff_t	off,
+	xfs_fsblock_t	bno,
+	xfs_filblks_t	len,
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+int					/* error */
+xfs_bmbt_lookup_ge(
+	xfs_btree_cur_t *cur,
+	xfs_fileoff_t	off,
+	xfs_fsblock_t	bno,
+	xfs_filblks_t	len,
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+int					/* error */
+xfs_bmbt_lookup_le(
+	xfs_btree_cur_t *cur,
+	xfs_fileoff_t	off,
+	xfs_fsblock_t	bno,
+	xfs_filblks_t	len,
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_bmbt_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Give the bmap btree a new root block.  Copy the old broot contents
+ * down into a real block and make the broot point to it.
+ */
+int						/* error */
+xfs_bmbt_newroot(
+	xfs_btree_cur_t		*cur,		/* btree cursor */
+	int			*logflags,	/* logging flags for inode */
+	int			*stat)		/* return status - 0 fail */
+{
+	xfs_alloc_arg_t		args;		/* allocation arguments */
+	xfs_bmbt_block_t	*block;		/* bmap btree block */
+	xfs_buf_t		*bp;		/* buffer for block */
+	xfs_bmbt_block_t	*cblock;	/* child btree block */
+	xfs_bmbt_key_t		*ckp;		/* child key pointer */
+	xfs_bmbt_ptr_t		*cpp;		/* child ptr pointer */
+	int			error;		/* error return code */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_newroot";
+#endif
+#ifdef DEBUG
+	int			i;		/* loop counter */
+#endif
+	xfs_bmbt_key_t		*kp;		/* pointer to bmap btree key */
+	int			level;		/* btree level */
+	xfs_bmbt_ptr_t		*pp;		/* pointer to bmap block addr */
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	level = cur->bc_nlevels - 1;
+	block = xfs_bmbt_get_block(cur, level, &bp);
+	/*
+	 * Copy the root into a real block.
+	 */
+	args.mp = cur->bc_mp;
+	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+	args.tp = cur->bc_tp;
+	args.fsbno = cur->bc_private.b.firstblock;
+	args.mod = args.minleft = args.alignment = args.total = args.isfl =
+		args.userdata = args.minalignslop = 0;
+	args.minlen = args.maxlen = args.prod = 1;
+	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+	if (args.fsbno == NULLFSBLOCK) {
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		args.fsbno = INT_GET(*pp, ARCH_CONVERT);
+		args.type = XFS_ALLOCTYPE_START_BNO;
+	} else if (args.wasdel)
+		args.type = XFS_ALLOCTYPE_FIRST_AG;
+	else
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	if ((error = xfs_alloc_vextent(&args))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	if (args.fsbno == NULLFSBLOCK) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	cur->bc_private.b.ip->i_d.di_nblocks++;
+	if (XFS_IS_QUOTA_ON(args.mp) &&
+	    cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_uquotino &&
+	    cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+					  XFS_TRANS_DQ_BCOUNT, 1L);
+	bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
+	cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
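+	/*
+	 * Copy the old root into the new child block, then turn the root
+	 * into a single-entry block one level higher pointing at the child.
+	 */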
+	*cblock = *block;
+	INT_MOD(block->bb_level, ARCH_CONVERT, +1);
+	INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
+	cur->bc_nlevels++;
+	cur->bc_ptrs[level + 1] = 1;
+	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
+	ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
+	bcopy(kp, ckp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*kp));
+	cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
+#ifdef DEBUG
+	for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) {
+		if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+	}
+#endif
+	bcopy(pp, cpp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*pp));
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno,
+			level))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	INT_SET(*pp, ARCH_CONVERT, args.fsbno);
+	xfs_iroot_realloc(cur->bc_private.b.ip, 1 - INT_GET(cblock->bb_numrecs, ARCH_CONVERT),
+		cur->bc_private.b.whichfork);
+	xfs_btree_setbuf(cur, level, bp);
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
+	xfs_bmbt_log_keys(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
+	xfs_bmbt_log_ptrs(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*logflags |=
+		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+void
+xfs_bmbt_set_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s)
+{
+	int	extent_flag;
+
+	ASSERT((s->br_state == XFS_EXT_NORM) ||
+		(s->br_state == XFS_EXT_UNWRITTEN));
+	extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1;
+	ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0);
+	ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0);
+#if XFS_BIG_FILESYSTEMS
+	ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0);
+#endif	/* XFS_BIG_FILESYSTEMS */
+#if XFS_BIG_FILESYSTEMS
+	INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+		  ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
+		  ((xfs_bmbt_rec_base_t)s->br_startblock >> 43));
+	INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
+		  ((xfs_bmbt_rec_base_t)s->br_blockcount &
+		   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+#else	/* !XFS_BIG_FILESYSTEMS */
+	if (ISNULLSTARTBLOCK(s->br_startblock)) {
+		INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
+			  (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
+		INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) |
+			  ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
+			  ((xfs_bmbt_rec_base_t)s->br_blockcount &
+			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	} else {
+		INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			((xfs_bmbt_rec_base_t)s->br_startoff << 9));
+		INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
+			  ((xfs_bmbt_rec_base_t)s->br_blockcount &
+			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	}
+#endif	/* XFS_BIG_FILESYSTEMS */
+}
+
+/*
+ * Set all the fields in a bmap extent record from the arguments.
+ */
+void
+xfs_bmbt_set_allf(
+	xfs_bmbt_rec_t	*r,
+	xfs_fileoff_t	o,
+	xfs_fsblock_t	b,
+	xfs_filblks_t	c,
+	xfs_exntst_t	v)
+{
+	int	extent_flag;
+
+	ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN));
+	extent_flag = (v == XFS_EXT_NORM) ? 0 : 1;
+	ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
+	ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+#if XFS_BIG_FILESYSTEMS
+	ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+#endif	/* XFS_BIG_FILESYSTEMS */
+#if XFS_BIG_FILESYSTEMS
+	INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+		((xfs_bmbt_rec_base_t)o << 9) |
+		((xfs_bmbt_rec_base_t)b >> 43));
+	INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) |
+		  ((xfs_bmbt_rec_base_t)c &
+		   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+#else	/* !XFS_BIG_FILESYSTEMS */
+	if (ISNULLSTARTBLOCK(b)) {
+		INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			((xfs_bmbt_rec_base_t)o << 9) |
+			 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
+		INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) |
+			  ((xfs_bmbt_rec_base_t)b << 21) |
+			  ((xfs_bmbt_rec_base_t)c &
+			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	} else {
+		INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			((xfs_bmbt_rec_base_t)o << 9));
+		INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) |
+			  ((xfs_bmbt_rec_base_t)c &
+			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	}
+#endif	/* XFS_BIG_FILESYSTEMS */
+}
+
+/*
+ * Set the blockcount field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_blockcount(
+	xfs_bmbt_rec_t	*r,
+	xfs_filblks_t	v)
+{
+	ASSERT((v & XFS_MASK64HI(43)) == 0);
+	INT_SET(r->l1, ARCH_CONVERT, (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) |
+		  (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21)));
+}
+
+/*
+ * Set the startblock field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startblock(
+	xfs_bmbt_rec_t	*r,
+	xfs_fsblock_t	v)
+{
+#if XFS_BIG_FILESYSTEMS
+	ASSERT((v & XFS_MASK64HI(12)) == 0);
+#endif	/* XFS_BIG_FILESYSTEMS */
+#if XFS_BIG_FILESYSTEMS
+	INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) |
+		  (xfs_bmbt_rec_base_t)(v >> 43));
+	INT_SET(r->l1, ARCH_CONVERT, (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) |
+		  (xfs_bmbt_rec_base_t)(v << 21));
+#else	/* !XFS_BIG_FILESYSTEMS */
+	if (ISNULLSTARTBLOCK(v)) {
+		INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) | (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)));
+		INT_SET(r->l1, ARCH_CONVERT, (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) |
+			  ((xfs_bmbt_rec_base_t)v << 21) |
+			  (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	} else {
+		INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) & ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9)));
+		INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)v << 21) |
+			  (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	}
+#endif	/* XFS_BIG_FILESYSTEMS */
+}
+
+/*
+ * Set the startoff field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startoff(
+	xfs_bmbt_rec_t	*r,
+	xfs_fileoff_t	v)
+{
+	ASSERT((v & XFS_MASK64HI(9)) == 0);
+	INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) |
+		((xfs_bmbt_rec_base_t)v << 9) |
+		  (INT_GET(r->l0, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)));
+}
+
+/*
+ * Set the extent state field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_state(
+	xfs_bmbt_rec_t	*r,
+	xfs_exntst_t	v)
+{
+	ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
+	if (v == XFS_EXT_NORM)
+		INT_SET(r->l0, ARCH_CONVERT, INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN));
+	else
+		INT_SET(r->l0, ARCH_CONVERT, INT_GET(r->l0, ARCH_CONVERT) | XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN));
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_bmbt_to_bmdr(
+	xfs_bmbt_block_t	*rblock,
+	int			rblocklen,
+	xfs_bmdr_block_t	*dblock,
+	int			dblocklen)
+{
+	int			dmxr;
+	xfs_bmbt_key_t		*fkp;
+	xfs_bmbt_ptr_t		*fpp;
+	xfs_bmbt_key_t		*tkp;
+	xfs_bmbt_ptr_t		*tpp;
+
+	ASSERT(INT_GET(rblock->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC);
+	ASSERT(INT_GET(rblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO);
+	ASSERT(INT_GET(rblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO);
+	ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0);
+	dblock->bb_level = rblock->bb_level;	/* both in on-disk format */
+	dblock->bb_numrecs = rblock->bb_numrecs;/* both in on-disk format */
+	dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
+	fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
+	tkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
+	fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+	tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
+	dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT);
+	bcopy(fkp, tkp, sizeof(*fkp) * dmxr);
+	bcopy(fpp, tpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
+}
+
+/*
+ * Update the record to the passed values.
+ */
+int
+xfs_bmbt_update(
+	xfs_btree_cur_t		*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	xfs_exntst_t		state)
+{
+	xfs_bmbt_block_t	*block;
+	xfs_buf_t		*bp;
+	int			error;
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_update";
+#endif
+	xfs_bmbt_key_t		key;
+	int			ptr;
+	xfs_bmbt_rec_t		*rp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
+		(xfs_dfilblks_t)len, (int)state);
+	block = xfs_bmbt_get_block(cur, 0, &bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	ptr = cur->bc_ptrs[0];
+	rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
+	xfs_bmbt_set_allf(rp, off, bno, len, state);
+	xfs_bmbt_log_recs(cur, bp, ptr, ptr);
+	if (ptr > 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		return 0;
+	}
+	INT_SET(key.br_startoff, ARCH_CONVERT, off);
+	if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	return 0;
+}
+
+/*
+ * Check an extent list, which has just been read, for any bits set in
+ * the extent flag field.  ASSERT on debug kernels, as this condition
+ * should not occur.  Return 1 if any flags are found, otherwise 0.
+ */
+int
+xfs_check_nostate_extents(
+	xfs_bmbt_rec_t		*ep,
+	xfs_extnum_t		num)
+{
+	for (; num > 0; num--, ep++) {
+		if (((INT_GET(ep->l0, ARCH_CONVERT)) >>
+		     (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
+			ASSERT(0);
+			return 1;
+		}
+	}
+	return 0;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bmap_btree.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bmap_btree.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_bmap_btree.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_bmap_btree.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BMAP_BTREE_H__
+#define __XFS_BMAP_BTREE_H__
+
+#define XFS_BMAP_MAGIC	0x424d4150	/* 'BMAP' */
+
+struct xfs_btree_cur;
+struct xfs_btree_lblock;
+struct xfs_mount;
+struct xfs_inode;
+
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block
+{
+	__uint16_t	bb_level;	/* 0 is a leaf */
+	__uint16_t	bb_numrecs;	/* current # of data records */
+} xfs_bmdr_block_t;
+
+/*
+ * Bmap btree record and extent descriptor.
+ * For 32-bit kernels,
+ *  l0:31 is an extent flag (value 1 indicates non-normal).
+ *  l0:0-30 and l1:9-31 are startoff.
+ *  l1:0-8, l2:0-31, and l3:21-31 are startblock.
+ *  l3:0-20 are blockcount.
+ * For 64-bit kernels,
+ *  l0:63 is an extent flag (value 1 indicates non-normal).
+ *  l0:9-62 are startoff.
+ *  l0:0-8 and l1:21-63 are startblock.
+ *  l1:0-20 are blockcount.
+ */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+
+#define BMBT_TOTAL_BITLEN	128	/* 128 bits, 16 bytes */
+#define BMBT_EXNTFLAG_BITOFF	0
+#define BMBT_EXNTFLAG_BITLEN	1
+#define BMBT_STARTOFF_BITOFF	(BMBT_EXNTFLAG_BITOFF + BMBT_EXNTFLAG_BITLEN)
+#define BMBT_STARTOFF_BITLEN	54
+#define BMBT_STARTBLOCK_BITOFF	(BMBT_STARTOFF_BITOFF + BMBT_STARTOFF_BITLEN)
+#define BMBT_STARTBLOCK_BITLEN	52
+#define BMBT_BLOCKCOUNT_BITOFF	\
+	(BMBT_STARTBLOCK_BITOFF + BMBT_STARTBLOCK_BITLEN)
+#define BMBT_BLOCKCOUNT_BITLEN	(BMBT_TOTAL_BITLEN - BMBT_BLOCKCOUNT_BITOFF)
+
+#else
+
+#define BMBT_TOTAL_BITLEN	128	/* 128 bits, 16 bytes */
+#define BMBT_EXNTFLAG_BITOFF	63
+#define BMBT_EXNTFLAG_BITLEN	1
+#define BMBT_STARTOFF_BITOFF	(BMBT_EXNTFLAG_BITOFF - BMBT_STARTOFF_BITLEN)
+#define BMBT_STARTOFF_BITLEN	54
+#define BMBT_STARTBLOCK_BITOFF	85 /* 128 - 43 (other 9 is in first word) */
+#define BMBT_STARTBLOCK_BITLEN	52
+#define BMBT_BLOCKCOUNT_BITOFF	64 /* Start of second 64 bit container */
+#define BMBT_BLOCKCOUNT_BITLEN	21
+
+#endif
+
+
+#define BMBT_USE_64	1
+
+typedef struct xfs_bmbt_rec_32
+{
+	__uint32_t		l0, l1, l2, l3;
+} xfs_bmbt_rec_32_t;
+typedef struct xfs_bmbt_rec_64
+{
+	__uint64_t		l0, l1;
+} xfs_bmbt_rec_64_t;
+
+typedef __uint64_t	xfs_bmbt_rec_base_t;	/* use this for casts */
+typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t;
+
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ */
+#define STARTBLOCKVALBITS	17
+#define STARTBLOCKMASKBITS	(15 + XFS_BIG_FILESYSTEMS * 20)
+#define DSTARTBLOCKMASKBITS	(15 + 20)
+#define STARTBLOCKMASK		\
+	(((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+#define DSTARTBLOCKMASK		\
+	(((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLSTARTBLOCK)
+int isnullstartblock(xfs_fsblock_t x);
+#define ISNULLSTARTBLOCK(x)	isnullstartblock(x)
+#else
+#define ISNULLSTARTBLOCK(x)	(((x) & STARTBLOCKMASK) == STARTBLOCKMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLDSTARTBLOCK)
+int isnulldstartblock(xfs_dfsbno_t x);
+#define ISNULLDSTARTBLOCK(x)	isnulldstartblock(x)
+#else
+#define ISNULLDSTARTBLOCK(x)	(((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_NULLSTARTBLOCK)
+xfs_fsblock_t nullstartblock(int k);
+#define NULLSTARTBLOCK(k)	nullstartblock(k)
+#else
+#define NULLSTARTBLOCK(k)	\
+	((ASSERT(k < (1 << STARTBLOCKVALBITS))), (STARTBLOCKMASK | (k)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_STARTBLOCKVAL)
+xfs_filblks_t startblockval(xfs_fsblock_t x);
+#define STARTBLOCKVAL(x)	startblockval(x)
+#else
+#define STARTBLOCKVAL(x)	((xfs_filblks_t)((x) & ~STARTBLOCKMASK))
+#endif
+
+/*
+ * Possible extent formats.
+ */
+typedef enum {
+	XFS_EXTFMT_NOSTATE = 0,
+	XFS_EXTFMT_HASSTATE
+} xfs_exntfmt_t;
+
+/*
+ * Possible extent states.
+ */
+typedef enum {
+	XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+	XFS_EXT_DMAPI_OFFLINE
+} xfs_exntst_t;
+
+/*
+ * Extent state and extent format macros.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTFMT_INODE )
+xfs_exntfmt_t xfs_extfmt_inode(struct xfs_inode *ip);
+#define XFS_EXTFMT_INODE(x)	xfs_extfmt_inode(x)
+#else
+#define XFS_EXTFMT_INODE(x) \
+  (XFS_SB_VERSION_HASEXTFLGBIT(&((x)->i_mount->m_sb)) ? \
+	XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
+#endif
+#define ISUNWRITTEN(x)		((x) == XFS_EXT_UNWRITTEN)
+
+/*
+ * Incore version of above.
+ */
+typedef struct xfs_bmbt_irec
+{
+	xfs_fileoff_t	br_startoff;	/* starting file offset */
+	xfs_fsblock_t	br_startblock;	/* starting block number */
+	xfs_filblks_t	br_blockcount;	/* number of blocks */
+	xfs_exntst_t	br_state;	/* extent state */
+} xfs_bmbt_irec_t;
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key
+{
+	xfs_dfiloff_t	br_startoff;	/* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;	/* btree pointer type */
+					/* btree block header type */
+typedef struct xfs_btree_lblock xfs_bmbt_block_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BMBT_BLOCK)
+xfs_bmbt_block_t *xfs_buf_to_bmbt_block(struct xfs_buf *bp);
+#define XFS_BUF_TO_BMBT_BLOCK(bp)		xfs_buf_to_bmbt_block(bp)
+#else
+#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)(XFS_BUF_PTR(bp)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_DSIZE)
+int xfs_bmap_rblock_dsize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_RBLOCK_DSIZE(lev,cur)		xfs_bmap_rblock_dsize(lev,cur)
+#else
+#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_ISIZE)
+int xfs_bmap_rblock_isize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_RBLOCK_ISIZE(lev,cur)		xfs_bmap_rblock_isize(lev,cur)
+#else
+#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \
+	((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \
+			    (cur)->bc_private.b.whichfork)->if_broot_bytes)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_IBLOCK_SIZE)
+int xfs_bmap_iblock_size(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_IBLOCK_SIZE(lev,cur)		xfs_bmap_iblock_size(lev,cur)
+#else
+#define XFS_BMAP_IBLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
+#endif
+
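+/*
+ * In the macros below, the root block (lev == bc_nlevels - 1) lives
+ * in the inode fork rather than in a full filesystem block, so its
+ * size comes from the fork: the D (disk) forms use the on-disk fork
+ * size, the I (incore) forms the incore broot size.  All lower
+ * levels are whole blocks of 1 << bc_blocklog bytes.
+ */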
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DSIZE)
+int xfs_bmap_block_dsize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_DSIZE(lev,cur)		xfs_bmap_block_dsize(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_DSIZE(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BMAP_RBLOCK_DSIZE(lev,cur) : \
+		XFS_BMAP_IBLOCK_SIZE(lev,cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_ISIZE)
+int xfs_bmap_block_isize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_ISIZE(lev,cur)		xfs_bmap_block_isize(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_ISIZE(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BMAP_RBLOCK_ISIZE(lev,cur) : \
+		XFS_BMAP_IBLOCK_SIZE(lev,cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMAXRECS)
+int xfs_bmap_block_dmaxrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur)	xfs_bmap_block_dmaxrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
+			xfs_bmdr, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMAXRECS)
+int xfs_bmap_block_imaxrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur)	xfs_bmap_block_imaxrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \
+			xfs_bmbt, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMINRECS)
+int xfs_bmap_block_dminrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_DMINRECS(lev,cur)	xfs_bmap_block_dminrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
+			xfs_bmdr, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMINRECS)
+int xfs_bmap_block_iminrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_IMINRECS(lev,cur)	xfs_bmap_block_iminrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \
+			xfs_bmbt, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_DADDR)
+xfs_bmbt_rec_t *
+xfs_bmap_rec_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_REC_DADDR(bb,i,cur)		xfs_bmap_rec_daddr(bb,i,cur)
+#else
+#define XFS_BMAP_REC_DADDR(bb,i,cur) \
+	XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_DSIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_IADDR)
+xfs_bmbt_rec_t *
+xfs_bmap_rec_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_REC_IADDR(bb,i,cur)		xfs_bmap_rec_iaddr(bb,i,cur)
+#else
+#define XFS_BMAP_REC_IADDR(bb,i,cur) \
+	XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_ISIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_DADDR)
+xfs_bmbt_key_t *
+xfs_bmap_key_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_KEY_DADDR(bb,i,cur)		xfs_bmap_key_daddr(bb,i,cur)
+#else
+#define XFS_BMAP_KEY_DADDR(bb,i,cur) \
+	XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_DSIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_IADDR)
+xfs_bmbt_key_t *
+xfs_bmap_key_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_KEY_IADDR(bb,i,cur)		xfs_bmap_key_iaddr(bb,i,cur)
+#else
+#define XFS_BMAP_KEY_IADDR(bb,i,cur) \
+	XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_ISIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_DADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_ptr_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_PTR_DADDR(bb,i,cur)		xfs_bmap_ptr_daddr(bb,i,cur)
+#else
+#define XFS_BMAP_PTR_DADDR(bb,i,cur) \
+	XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_DSIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_IADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_ptr_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_PTR_IADDR(bb,i,cur)		xfs_bmap_ptr_iaddr(bb,i,cur)
+#else
+#define XFS_BMAP_PTR_IADDR(bb,i,cur) \
+	XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_ISIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+
+/*
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_REC_ADDR)
+xfs_bmbt_rec_t *xfs_bmap_broot_rec_addr(xfs_bmbt_block_t *bb, int i, int sz);
+#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz)	xfs_bmap_broot_rec_addr(bb,i,sz)
+#else
+#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \
+	XFS_BTREE_REC_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_KEY_ADDR)
+xfs_bmbt_key_t *xfs_bmap_broot_key_addr(xfs_bmbt_block_t *bb, int i, int sz);
+#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz)	xfs_bmap_broot_key_addr(bb,i,sz)
+#else
+#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
+	XFS_BTREE_KEY_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_PTR_ADDR)
+xfs_bmbt_ptr_t *xfs_bmap_broot_ptr_addr(xfs_bmbt_block_t *bb, int i, int sz);
+#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz)	xfs_bmap_broot_ptr_addr(bb,i,sz)
+#else
+#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
+	XFS_BTREE_PTR_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_NUMRECS)
+int xfs_bmap_broot_numrecs(xfs_bmdr_block_t *bb);
+#define XFS_BMAP_BROOT_NUMRECS(bb)		xfs_bmap_broot_numrecs(bb)
+#else
+#define XFS_BMAP_BROOT_NUMRECS(bb) (INT_GET((bb)->bb_numrecs, ARCH_CONVERT))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_MAXRECS)
+int xfs_bmap_broot_maxrecs(int sz);
+#define XFS_BMAP_BROOT_MAXRECS(sz)		xfs_bmap_broot_maxrecs(sz)
+#else
+#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE_CALC)
+int xfs_bmap_broot_space_calc(int nrecs);
+#define XFS_BMAP_BROOT_SPACE_CALC(nrecs)	xfs_bmap_broot_space_calc(nrecs)
+#else
+#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
+	((int)(sizeof(xfs_bmbt_block_t) + \
+	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE)
+int xfs_bmap_broot_space(xfs_bmdr_block_t *bb);
+#define XFS_BMAP_BROOT_SPACE(bb)		xfs_bmap_broot_space(bb)
+#else
+#define XFS_BMAP_BROOT_SPACE(bb) \
+	XFS_BMAP_BROOT_SPACE_CALC(INT_GET((bb)->bb_numrecs, ARCH_CONVERT))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMDR_SPACE_CALC)
+int xfs_bmdr_space_calc(int nrecs);
+#define XFS_BMDR_SPACE_CALC(nrecs)		xfs_bmdr_space_calc(nrecs)
+#else
+#define XFS_BMDR_SPACE_CALC(nrecs)	\
+	((int)(sizeof(xfs_bmdr_block_t) + \
+	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))))
+#endif
+
+/*
+ * Maximum number of bmap btree levels.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BM_MAXLEVELS)
+int xfs_bm_maxlevels(struct xfs_mount *mp, int w);
+#define XFS_BM_MAXLEVELS(mp,w)			xfs_bm_maxlevels(mp,w)
+#else
+#define XFS_BM_MAXLEVELS(mp,w)		((mp)->m_bm_maxlevels[w])
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_SANITY_CHECK)
+int xfs_bmap_sanity_check(struct xfs_mount *mp, xfs_bmbt_block_t *bb,
+	int level);
+#define XFS_BMAP_SANITY_CHECK(mp,bb,level)	\
+	xfs_bmap_sanity_check(mp,bb,level)
+#else
+#define XFS_BMAP_SANITY_CHECK(mp,bb,level)	\
+	(INT_GET((bb)->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC && \
+	 INT_GET((bb)->bb_level, ARCH_CONVERT) == level && \
+	 INT_GET((bb)->bb_numrecs, ARCH_CONVERT) > 0 && \
+	 INT_GET((bb)->bb_numrecs, ARCH_CONVERT) <= (mp)->m_bmap_dmxr[(level) != 0])
+#endif
+
+/*
+ * Trace buffer entry types.
+ */
+#define XFS_BMBT_KTRACE_ARGBI	1
+#define XFS_BMBT_KTRACE_ARGBII	2
+#define XFS_BMBT_KTRACE_ARGFFFI 3
+#define XFS_BMBT_KTRACE_ARGI	4
+#define XFS_BMBT_KTRACE_ARGIFK	5
+#define XFS_BMBT_KTRACE_ARGIFR	6
+#define XFS_BMBT_KTRACE_ARGIK	7
+#define XFS_BMBT_KTRACE_CUR	8
+
+#define XFS_BMBT_TRACE_SIZE	4096	/* size of global trace buffer */
+#define XFS_BMBT_KTRACE_SIZE	32	/* size of per-inode trace buffer */
+
+#if defined(XFS_ALL_TRACE)
+#define XFS_BMBT_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_BMBT_TRACE
+#endif
+
+
+/*
+ * Prototypes for xfs_bmap.c to call.
+ */
+
+void
+xfs_bmdr_to_bmbt(
+	xfs_bmdr_block_t *,
+	int,
+	xfs_bmbt_block_t *,
+	int);
+
+int
+xfs_bmbt_decrement(
+	struct xfs_btree_cur *,
+	int,
+	int *);
+
+int
+xfs_bmbt_delete(
+	struct xfs_btree_cur *,
+	int,
+	int *);
+
+void
+xfs_bmbt_get_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s);
+
+xfs_bmbt_block_t *
+xfs_bmbt_get_block(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	struct xfs_buf		**bpp);
+
+xfs_filblks_t
+xfs_bmbt_get_blockcount(
+	xfs_bmbt_rec_t	*r);
+
+xfs_fsblock_t
+xfs_bmbt_get_startblock(
+	xfs_bmbt_rec_t	*r);
+
+xfs_fileoff_t
+xfs_bmbt_get_startoff(
+	xfs_bmbt_rec_t	*r);
+
+xfs_exntst_t
+xfs_bmbt_get_state(
+	xfs_bmbt_rec_t	*r);
+
+int
+xfs_bmbt_increment(
+	struct xfs_btree_cur *,
+	int,
+	int *);
+
+int
+xfs_bmbt_insert(
+	struct xfs_btree_cur *,
+	int *);
+
+int
+xfs_bmbt_insert_many(
+	struct xfs_btree_cur *,
+	int,
+	xfs_bmbt_rec_t *,
+	int *);
+
+void
+xfs_bmbt_log_block(
+	struct xfs_btree_cur *,
+	struct xfs_buf *,
+	int);
+
+void
+xfs_bmbt_log_recs(
+	struct xfs_btree_cur *,
+	struct xfs_buf *,
+	int,
+	int);
+
+int
+xfs_bmbt_lookup_eq(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	int *);
+
+int
+xfs_bmbt_lookup_ge(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	int *);
+
+int
+xfs_bmbt_lookup_le(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	int *);
+
+/*
+ * Give the bmap btree a new root block.  Copy the old broot contents
+ * down into a real block and make the broot point to it.
+ */
+int						/* error */
+xfs_bmbt_newroot(
+	struct xfs_btree_cur	*cur,		/* btree cursor */
+	int			*logflags,	/* logging flags for inode */
+	int			*stat);		/* return status - 0 fail */
+
+void
+xfs_bmbt_set_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s);
+
+void
+xfs_bmbt_set_allf(
+	xfs_bmbt_rec_t	*r,
+	xfs_fileoff_t	o,
+	xfs_fsblock_t	b,
+	xfs_filblks_t	c,
+	xfs_exntst_t	v);
+
+void
+xfs_bmbt_set_blockcount(
+	xfs_bmbt_rec_t	*r,
+	xfs_filblks_t	v);
+
+void
+xfs_bmbt_set_startblock(
+	xfs_bmbt_rec_t	*r,
+	xfs_fsblock_t	v);
+
+void
+xfs_bmbt_set_startoff(
+	xfs_bmbt_rec_t	*r,
+	xfs_fileoff_t	v);
+
+void
+xfs_bmbt_set_state(
+	xfs_bmbt_rec_t	*r,
+	xfs_exntst_t	v);
+
+void
+xfs_bmbt_to_bmdr(
+	xfs_bmbt_block_t *,
+	int,
+	xfs_bmdr_block_t *,
+	int);
+
+int
+xfs_bmbt_update(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	xfs_exntst_t);
+
+#ifdef XFSDEBUG
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_bmbt_get_rec(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t *,
+	xfs_fsblock_t *,
+	xfs_filblks_t *,
+	xfs_exntst_t *,
+	int *);
+#endif
+
+
+/*
+ * Search an extent list for the extent which includes block
+ * bno.
+ */
+xfs_bmbt_rec_t *
+xfs_bmap_do_search_extents(
+	xfs_bmbt_rec_t *,
+	xfs_extnum_t,
+	xfs_extnum_t,
+	xfs_fileoff_t,
+	int *,
+	xfs_extnum_t *,
+	xfs_bmbt_irec_t *,
+	xfs_bmbt_irec_t *);
+
+
+#endif	/* __XFS_BMAP_BTREE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_btree.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_btree.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_btree.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_btree.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,937 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains common code for the space manager's btree implementations.
+ */
+
+#include <xfs.h>
+
+/*
+ * Cursor allocation zone.
+ */
+kmem_zone_t	*xfs_btree_cur_zone;
+
+/*
+ * Btree magic numbers, indexed by xfs_btnum_t (BNO, CNT, BMAP, INO).
+ */
+const __uint32_t xfs_magics[XFS_BTNUM_MAX] =
+{
+	XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
+};
+
+/*
+ * Prototypes for internal routines.
+ */
+
+/*
+ * Checking routine: return maxrecs for the block.
+ */
+STATIC int				/* number of records fitting in block */
+xfs_btree_maxrecs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_block_t	*block);/* generic btree block pointer */
+
+/*
+ * Internal routines.
+ */
+
+/*
+ * Checking routine: return maxrecs for the block.
+ */
+STATIC int				/* number of records fitting in block */
+xfs_btree_maxrecs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_block_t	*block) /* generic btree block pointer */
+{
+	switch (cur->bc_btnum) {
+	case XFS_BTNUM_BNO:
+	case XFS_BTNUM_CNT:
+		return (int)XFS_ALLOC_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
+	case XFS_BTNUM_BMAP:
+		return (int)XFS_BMAP_BLOCK_IMAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
+	case XFS_BTNUM_INO:
+		return (int)XFS_INOBT_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
+	default:
+		ASSERT(0);
+		return 0;
+	}
+}
+
+/*
+ * External routines.
+ */
+
+#ifdef DEBUG
+/*
+ * Debug routine: check that block header is ok.
+ */
+void
+xfs_btree_check_block(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_block_t	*block, /* generic btree block pointer */
+	int			level,	/* level of the btree block */
+	xfs_buf_t		*bp)	/* buffer containing block, if any */
+{
+	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
+		xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
+			bp);
+	else
+		xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
+			bp);
+}
+
+/*
+ * Debug routine: check that keys are in the right order.
+ */
+void
+xfs_btree_check_key(
+	xfs_btnum_t	btnum,		/* btree identifier */
+	void		*ak1,		/* pointer to left (lower) key */
+	void		*ak2)		/* pointer to right (higher) key */
+{
+	switch (btnum) {
+	case XFS_BTNUM_BNO: {
+		xfs_alloc_key_t *k1;
+		xfs_alloc_key_t *k2;
+
+		k1 = ak1;
+		k2 = ak2;
+		ASSERT(INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT));
+		break;
+	    }
+	case XFS_BTNUM_CNT: {
+		xfs_alloc_key_t *k1;
+		xfs_alloc_key_t *k2;
+
+		k1 = ak1;
+		k2 = ak2;
+		ASSERT(INT_GET(k1->ar_blockcount, ARCH_CONVERT) < INT_GET(k2->ar_blockcount, ARCH_CONVERT) ||
+		       (INT_GET(k1->ar_blockcount, ARCH_CONVERT) == INT_GET(k2->ar_blockcount, ARCH_CONVERT) &&
+			INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT)));
+		break;
+	    }
+	case XFS_BTNUM_BMAP: {
+		xfs_bmbt_key_t	*k1;
+		xfs_bmbt_key_t	*k2;
+
+		k1 = ak1;
+		k2 = ak2;
+		ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT));
+		break;
+	    }
+	case XFS_BTNUM_INO: {
+		xfs_inobt_key_t *k1;
+		xfs_inobt_key_t *k2;
+
+		k1 = ak1;
+		k2 = ak2;
+		ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT));
+		break;
+	    }
+	default:
+		ASSERT(0);
+	}
+}
+#endif	/* DEBUG */
+
+/*
+ * Checking routine: check that long form block header is ok.
+ */
+/* ARGSUSED */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_lblock_t	*block, /* btree long form block pointer */
+	int			level,	/* level of the btree block */
+	xfs_buf_t		*bp)	/* buffer for block, if any */
+{
+	int			lblock_ok; /* block passes checks */
+	xfs_mount_t		*mp;	/* file system mount point */
+
+	mp = cur->bc_mp;
+	lblock_ok =
+		INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] &&
+		INT_GET(block->bb_level, ARCH_CONVERT) == level &&
+		INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+			xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
+		!INT_ISZERO(block->bb_leftsib, ARCH_CONVERT) &&
+		(INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO ||
+		 XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_leftsib, ARCH_CONVERT))) &&
+		!INT_ISZERO(block->bb_rightsib, ARCH_CONVERT) &&
+		(INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO ||
+		 XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_rightsib, ARCH_CONVERT)));
+	if (XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+			XFS_RANDOM_BTREE_CHECK_LBLOCK)) {
+		if (bp)
+			xfs_buftrace("LBTREE ERROR", bp);
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	return 0;
+}
+
+/*
+ * Checking routine: check that (long) pointer is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_dfsbno_t	ptr,		/* btree block disk address */
+	int		level)		/* btree block level */
+{
+	xfs_mount_t	*mp;		/* file system mount point */
+
+	mp = cur->bc_mp;
+	XFS_WANT_CORRUPTED_RETURN(
+		level > 0 &&
+		ptr != NULLDFSBNO &&
+		XFS_FSB_SANITY_CHECK(mp, ptr));
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Debug routine: check that records are in the right order.
+ */
+void
+xfs_btree_check_rec(
+	xfs_btnum_t	btnum,		/* btree identifier */
+	void		*ar1,		/* pointer to left (lower) record */
+	void		*ar2)		/* pointer to right (higher) record */
+{
+	switch (btnum) {
+	case XFS_BTNUM_BNO: {
+		xfs_alloc_rec_t *r1;
+		xfs_alloc_rec_t *r2;
+
+		r1 = ar1;
+		r2 = ar2;
+		ASSERT(INT_GET(r1->ar_startblock, ARCH_CONVERT) + INT_GET(r1->ar_blockcount, ARCH_CONVERT) <=
+		       INT_GET(r2->ar_startblock, ARCH_CONVERT));
+		break;
+	    }
+	case XFS_BTNUM_CNT: {
+		xfs_alloc_rec_t *r1;
+		xfs_alloc_rec_t *r2;
+
+		r1 = ar1;
+		r2 = ar2;
+		ASSERT(INT_GET(r1->ar_blockcount, ARCH_CONVERT) < INT_GET(r2->ar_blockcount, ARCH_CONVERT) ||
+		       (INT_GET(r1->ar_blockcount, ARCH_CONVERT) == INT_GET(r2->ar_blockcount, ARCH_CONVERT) &&
+			INT_GET(r1->ar_startblock, ARCH_CONVERT) < INT_GET(r2->ar_startblock, ARCH_CONVERT)));
+		break;
+	    }
+	case XFS_BTNUM_BMAP: {
+		xfs_bmbt_rec_t	*r1;
+		xfs_bmbt_rec_t	*r2;
+
+		r1 = ar1;
+		r2 = ar2;
+		ASSERT(xfs_bmbt_get_startoff(r1) +
+		       xfs_bmbt_get_blockcount(r1) <=
+		       xfs_bmbt_get_startoff(r2));
+		break;
+	    }
+	case XFS_BTNUM_INO: {
+		xfs_inobt_rec_t *r1;
+		xfs_inobt_rec_t *r2;
+
+		r1 = ar1;
+		r2 = ar2;
+		ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <=
+		       INT_GET(r2->ir_startino, ARCH_CONVERT));
+		break;
+	    }
+	default:
+		ASSERT(0);
+	}
+}
+#endif	/* DEBUG */
+
+/*
+ * Checking routine: check that block header is ok.
+ */
+/* ARGSUSED */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_sblock_t	*block, /* btree short form block pointer */
+	int			level,	/* level of the btree block */
+	xfs_buf_t		*bp)	/* buffer containing block */
+{
+	xfs_buf_t		*agbp;	/* buffer for ag. freespace struct */
+	xfs_agf_t		*agf;	/* ag. freespace structure */
+	xfs_agblock_t		agflen; /* native ag. freespace length */
+	int			sblock_ok; /* block passes checks */
+
+	agbp = cur->bc_private.a.agbp;
+	agf = XFS_BUF_TO_AGF(agbp);
+	agflen = INT_GET(agf->agf_length, ARCH_CONVERT);
+	sblock_ok =
+		INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] &&
+		INT_GET(block->bb_level, ARCH_CONVERT) == level &&
+		INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+			xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
+		(INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK ||
+		 INT_GET(block->bb_leftsib, ARCH_CONVERT) < agflen) &&
+		!INT_ISZERO(block->bb_leftsib, ARCH_CONVERT) &&
+		(INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK ||
+		 INT_GET(block->bb_rightsib, ARCH_CONVERT) < agflen) &&
+		!INT_ISZERO(block->bb_rightsib, ARCH_CONVERT);
+	if (XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
+			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
+			XFS_RANDOM_BTREE_CHECK_SBLOCK)) {
+		if (bp)
+			xfs_buftrace("SBTREE ERROR", bp);
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"xfs_btree_check_sblock: Not OK:");
+		cmn_err(CE_NOTE,
+			"magic 0x%x level %d numrecs %d leftsib %d rightsib %d",
+			INT_GET(block->bb_magic, ARCH_CONVERT),
+			INT_GET(block->bb_level, ARCH_CONVERT),
+			INT_GET(block->bb_numrecs, ARCH_CONVERT),
+			INT_GET(block->bb_leftsib, ARCH_CONVERT),
+			INT_GET(block->bb_rightsib, ARCH_CONVERT));
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	return 0;
+}
+
+/*
+ * Checking routine: check that (short) pointer is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agblock_t	ptr,		/* btree block disk address */
+	int		level)		/* btree block level */
+{
+	xfs_buf_t	*agbp;		/* buffer for ag. freespace struct */
+	xfs_agf_t	*agf;		/* ag. freespace structure */
+
+	agbp = cur->bc_private.a.agbp;
+	agf = XFS_BUF_TO_AGF(agbp);
+	XFS_WANT_CORRUPTED_RETURN(
+		level > 0 &&
+		ptr != NULLAGBLOCK && ptr != 0 &&
+		ptr < INT_GET(agf->agf_length, ARCH_CONVERT));
+	return 0;
+}
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	int		error)		/* del because of error */
+{
+	int		i;		/* btree level */
+
+	/*
+	 * Clear the buffer pointers, and release the buffers.
+	 * If we're doing this in the face of an error, we
+	 * need to make sure to inspect all of the entries
+	 * in the bc_bufs array for buffers to be unlocked.
+	 * This is because some of the btree code works from
+	 * level n down to 0, and if we get an error along
+	 * the way we won't have initialized all the entries
+	 * down to 0.
+	 */
+	for (i = 0; i < cur->bc_nlevels; i++) {
+		if (cur->bc_bufs[i])
+			xfs_btree_setbuf(cur, i, NULL);
+		else if (!error)
+			break;
+	}
+	/*
+	 * Can't free a bmap cursor without having dealt with the
+	 * allocated indirect blocks' accounting.
+	 */
+	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
+	       cur->bc_private.b.allocated == 0);
+	/*
+	 * Free the cursor.
+	 */
+	kmem_zone_free(xfs_btree_cur_zone, cur);
+}
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int					/* error */
+xfs_btree_dup_cursor(
+	xfs_btree_cur_t *cur,		/* input cursor */
+	xfs_btree_cur_t **ncur)		/* output cursor */
+{
+	xfs_buf_t	*bp;		/* btree block's buffer pointer */
+	int		error;		/* error return value */
+	int		i;		/* level number of btree block */
+	xfs_mount_t	*mp;		/* mount structure for filesystem */
+	xfs_btree_cur_t *new;		/* new cursor value */
+	xfs_trans_t	*tp;		/* transaction pointer, can be NULL */
+
+	tp = cur->bc_tp;
+	mp = cur->bc_mp;
+	/*
+	 * Allocate a new cursor like the old one.
+	 */
+	new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp,
+		cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip,
+		cur->bc_private.b.whichfork);
+	/*
+	 * Copy the record currently in the cursor.
+	 */
+	new->bc_rec = cur->bc_rec;
+	/*
+	 * For each level in the cursor, re-get the buffer and copy the ptr value.
+	 */
+	for (i = 0; i < new->bc_nlevels; i++) {
+		new->bc_ptrs[i] = cur->bc_ptrs[i];
+		new->bc_ra[i] = cur->bc_ra[i];
+		if ((bp = cur->bc_bufs[i])) {
+			if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+				XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) {
+				xfs_btree_del_cursor(new, error);
+				*ncur = NULL;
+				return error;
+			}
+			new->bc_bufs[i] = bp;
+			ASSERT(bp);
+			ASSERT(!XFS_BUF_GETERROR(bp));
+		} else
+			new->bc_bufs[i] = NULL;
+	}
+	/*
+	 * For bmap btrees, copy the firstblock, flist, and flags values,
+	 * since init cursor doesn't get them.
+	 */
+	if (new->bc_btnum == XFS_BTNUM_BMAP) {
+		new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+		new->bc_private.b.flist = cur->bc_private.b.flist;
+		new->bc_private.b.flags = cur->bc_private.b.flags;
+	}
+	*ncur = new;
+	return 0;
+}
+
+/*
+ * Change the cursor to point to the first record in the current block
+ * at the given level.  Other levels are unaffected.
+ */
+int					/* success=1, failure=0 */
+xfs_btree_firstrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to change */
+{
+	xfs_btree_block_t	*block; /* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	/*
+	 * Get the block pointer for this level.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	/*
+	 * It's empty; there is no such record.
+	 */
+	if (INT_ISZERO(block->bb_h.bb_numrecs, ARCH_CONVERT))
+		return 0;
+	/*
+	 * Set the ptr value to 1; that's the first record/key.
+	 */
+	cur->bc_ptrs[level] = 1;
+	return 1;
+}
+
+/*
+ * Retrieve the block pointer from the cursor at the given level.
+ * This may be a bmap btree root or from a buffer.
+ */
+xfs_btree_block_t *			/* generic btree block pointer */
+xfs_btree_get_block(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree */
+	xfs_buf_t		**bpp)	/* buffer containing the block */
+{
+	xfs_btree_block_t	*block; /* return value */
+	xfs_buf_t		*bp;	/* return buffer */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	int			whichfork; /* data or attr fork */
+
+	if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
+		whichfork = cur->bc_private.b.whichfork;
+		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
+		block = (xfs_btree_block_t *)ifp->if_broot;
+		bp = NULL;
+	} else {
+		bp = cur->bc_bufs[level];
+		block = XFS_BUF_TO_BLOCK(bp);
+	}
+	ASSERT(block != NULL);
+	*bpp = bp;
+	return block;
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+xfs_buf_t *				/* buffer for fsbno */
+xfs_btree_get_bufl(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_fsblock_t	fsbno,		/* file system block number */
+	uint		lock)		/* lock flags for get_buf */
+{
+	xfs_buf_t	*bp;		/* buffer pointer (return value) */
+	xfs_daddr_t	d;		/* real disk block address */
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+	ASSERT(bp);
+	ASSERT(!XFS_BUF_GETERROR(bp));
+	return bp;
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+xfs_buf_t *				/* buffer for agno/agbno */
+xfs_btree_get_bufs(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* allocation group block number */
+	uint		lock)		/* lock flags for get_buf */
+{
+	xfs_buf_t	*bp;		/* buffer pointer (return value) */
+	xfs_daddr_t	d;		/* real disk block address */
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agbno != NULLAGBLOCK);
+	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+	ASSERT(bp);
+	ASSERT(!XFS_BUF_GETERROR(bp));
+	return bp;
+}
+
+/*
+ * Allocate a new btree cursor.
+ * The cursor is either for allocation (A) or bmap (B) or inodes (I).
+ */
+xfs_btree_cur_t *			/* new btree cursor */
+xfs_btree_init_cursor(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*agbp,		/* (A only) buffer for agf structure */
+					/* (I only) buffer for agi structure */
+	xfs_agnumber_t	agno,		/* (AI only) allocation group number */
+	xfs_btnum_t	btnum,		/* btree identifier */
+	xfs_inode_t	*ip,		/* (B only) inode owning the btree */
+	int		whichfork)	/* (B only) data or attr fork */
+{
+	xfs_agf_t	*agf;		/* (A) allocation group freespace */
+	xfs_agi_t	*agi;		/* (I) allocation group inodespace */
+	xfs_btree_cur_t *cur;		/* return value */
+	xfs_ifork_t	*ifp;		/* (I) inode fork pointer */
+	int		nlevels = 0;	/* number of levels in the btree */
+
+	ASSERT(xfs_btree_cur_zone != NULL);
+	/*
+	 * Allocate a new cursor.
+	 */
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+	/*
+	 * Deduce the number of btree levels from the arguments.
+	 */
+	switch (btnum) {
+	case XFS_BTNUM_BNO:
+	case XFS_BTNUM_CNT:
+		agf = XFS_BUF_TO_AGF(agbp);
+		nlevels = INT_GET(agf->agf_levels[btnum], ARCH_CONVERT);
+		break;
+	case XFS_BTNUM_BMAP:
+		ifp = XFS_IFORK_PTR(ip, whichfork);
+		nlevels = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1;
+		break;
+	case XFS_BTNUM_INO:
+		agi = XFS_BUF_TO_AGI(agbp);
+		nlevels = INT_GET(agi->agi_level, ARCH_CONVERT);
+		break;
+	default:
+		ASSERT(0);
+	}
+	/*
+	 * Fill in the common fields.
+	 */
+	cur->bc_tp = tp;
+	cur->bc_mp = mp;
+	cur->bc_nlevels = nlevels;
+	cur->bc_btnum = btnum;
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+	/*
+	 * Fill in private fields.
+	 */
+	switch (btnum) {
+	case XFS_BTNUM_BNO:
+	case XFS_BTNUM_CNT:
+		/*
+		 * Allocation btree fields.
+		 */
+		cur->bc_private.a.agbp = agbp;
+		cur->bc_private.a.agno = agno;
+		break;
+	case XFS_BTNUM_BMAP:
+		/*
+		 * Bmap btree fields.
+		 */
+		cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
+		cur->bc_private.b.ip = ip;
+		cur->bc_private.b.firstblock = NULLFSBLOCK;
+		cur->bc_private.b.flist = NULL;
+		cur->bc_private.b.allocated = 0;
+		cur->bc_private.b.flags = 0;
+		cur->bc_private.b.whichfork = whichfork;
+		break;
+	case XFS_BTNUM_INO:
+		/*
+		 * Inode allocation btree fields.
+		 */
+		cur->bc_private.i.agbp = agbp;
+		cur->bc_private.i.agno = agno;
+		break;
+	default:
+		ASSERT(0);
+	}
+	return cur;
+}
+
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int					/* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to check */
+{
+	xfs_btree_block_t	*block; /* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
+		return INT_GET(block->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO;
+	else
+		return INT_GET(block->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK;
+}
+
+/*
+ * Change the cursor to point to the last record in the current block
+ * at the given level.	Other levels are unaffected.
+ */
+int					/* success=1, failure=0 */
+xfs_btree_lastrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to change */
+{
+	xfs_btree_block_t	*block; /* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	/*
+	 * Get the block pointer for this level.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	/*
+	 * It's empty; there is no such record.
+	 */
+	if (INT_ISZERO(block->bb_h.bb_numrecs, ARCH_CONVERT))
+		return 0;
+	/*
+	 * Set the ptr value to numrecs; that's the last record/key.
+	 */
+	cur->bc_ptrs[level] = INT_GET(block->bb_h.bb_numrecs, ARCH_CONVERT);
+	return 1;
+}
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+	__int64_t	fields,		/* bitmask of fields */
+	const short	*offsets,	/* table of field offsets */
+	int		nbits,		/* number of bits to inspect */
+	int		*first,		/* output: first byte offset */
+	int		*last)		/* output: last byte offset */
+{
+	int		i;		/* current bit number */
+	__int64_t	imask;		/* mask for current bit number */
+
+	ASSERT(fields != 0);
+	/*
+	 * Find the lowest bit, so the first byte offset.
+	 */
+	for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+		if (imask & fields) {
+			*first = offsets[i];
+			break;
+		}
+	}
+	/*
+	 * Find the highest bit, so the last byte offset.
+	 */
+	for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+		if (imask & fields) {
+			*last = offsets[i + 1] - 1;
+			break;
+		}
+	}
+}
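+
+/*
+ * For example, with fields == (XFS_BB_LEVEL | XFS_BB_LEFTSIB) and a
+ * matching short-form offsets table, the loops above produce
+ *	*first = offsets[1]		(start of bb_level)
+ *	*last  = offsets[4] - 1		(last byte of bb_leftsib)
+ * i.e. one contiguous byte range covering both dirty fields and
+ * whatever lies between them.  Note the table must hold nbits + 1
+ * entries for the offsets[i + 1] lookup on the highest bit to work.
+ */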
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int					/* error */
+xfs_btree_read_bufl(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_fsblock_t	fsbno,		/* file system block number */
+	uint		lock,		/* lock flags for read_buf */
+	xfs_buf_t	**bpp,		/* buffer for fsbno */
+	int		refval)		/* ref count value for buffer */
+{
+	xfs_buf_t	*bp;		/* return value */
+	xfs_daddr_t	d;		/* real disk block address */
+	int		error;
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+			mp->m_bsize, lock, &bp))) {
+		return error;
+	}
+	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
+	if (bp != NULL) {
+		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
+	}
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Short-form addressing.
+ */
+int					/* error */
+xfs_btree_read_bufs(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* allocation group block number */
+	uint		lock,		/* lock flags for read_buf */
+	xfs_buf_t	**bpp,		/* buffer for agno/agbno */
+	int		refval)		/* ref count value for buffer */
+{
+	xfs_buf_t	*bp;		/* return value */
+	xfs_daddr_t	d;		/* real disk block address */
+	int		error;
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agbno != NULLAGBLOCK);
+	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+					mp->m_bsize, lock, &bp))) {
+		return error;
+	}
+	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
+	if (bp != NULL) {
+		switch (refval) {
+		case XFS_ALLOC_BTREE_REF:
+			XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
+			break;
+		case XFS_INO_BTREE_REF:
+			XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
+			break;
+		}
+	}
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufl(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_fsblock_t	fsbno,		/* file system block number */
+	xfs_extlen_t	count)		/* count of filesystem blocks */
+{
+	xfs_daddr_t	d;
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufs(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* allocation group block number */
+	xfs_extlen_t	count)		/* count of filesystem blocks */
+{
+	xfs_daddr_t	d;
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agbno != NULLAGBLOCK);
+	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+}
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+int
+xfs_btree_readahead_core(
+	xfs_btree_cur_t		*cur,		/* btree cursor */
+	int			lev,		/* level in btree */
+	int			lr)		/* left/right bits */
+{
+	xfs_alloc_block_t	*a;
+	xfs_bmbt_block_t	*b;
+	xfs_inobt_block_t	*i;
+	int			rval = 0;
+
+	ASSERT(cur->bc_bufs[lev] != NULL);
+	cur->bc_ra[lev] |= lr;
+	switch (cur->bc_btnum) {
+	case XFS_BTNUM_BNO:
+	case XFS_BTNUM_CNT:
+		a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]);
+		if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(a->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+				INT_GET(a->bb_leftsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(a->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+				INT_GET(a->bb_rightsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		break;
+	case XFS_BTNUM_BMAP:
+		b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
+		if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(b->bb_leftsib, ARCH_CONVERT) != NULLDFSBNO) {
+			xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_leftsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(b->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
+			xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_rightsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		break;
+	case XFS_BTNUM_INO:
+		i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
+		if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(i->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
+				INT_GET(i->bb_leftsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(i->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
+				INT_GET(i->bb_rightsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		break;
+	default:
+		ASSERT(0);
+	}
+	return rval;
+}
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
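+ * If a sibling pointer in the new block is null there is nothing to
+ * read ahead on that side, so the matching XFS_BTCUR_*RA bit is set
+ * up front to keep xfs_btree_readahead from trying.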
+ */
+void
+xfs_btree_setbuf(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			lev,	/* level in btree */
+	xfs_buf_t		*bp)	/* new buffer to set */
+{
+	xfs_btree_block_t	*b;	/* btree block */
+	xfs_buf_t		*obp;	/* old buffer pointer */
+
+	obp = cur->bc_bufs[lev];
+	if (obp)
+		xfs_trans_brelse(cur->bc_tp, obp);
+	cur->bc_bufs[lev] = bp;
+	cur->bc_ra[lev] = 0;
+	if (!bp)
+		return;
+	b = XFS_BUF_TO_BLOCK(bp);
+	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) {
+		if (INT_GET(b->bb_u.l.bb_leftsib, ARCH_CONVERT) == NULLDFSBNO)
+			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+		if (INT_GET(b->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO)
+			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+	} else {
+		if (INT_GET(b->bb_u.s.bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK)
+			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+		if (INT_GET(b->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK)
+			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+	}
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_btree.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_btree.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_btree.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_btree.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,587 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BTREE_H__
+#define __XFS_BTREE_H__
+
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * This nonsense is to make -wlint happy.
+ */
+#define XFS_LOOKUP_EQ	((xfs_lookup_t)XFS_LOOKUP_EQi)
+#define XFS_LOOKUP_LE	((xfs_lookup_t)XFS_LOOKUP_LEi)
+#define XFS_LOOKUP_GE	((xfs_lookup_t)XFS_LOOKUP_GEi)
+
+#define XFS_BTNUM_BNO	((xfs_btnum_t)XFS_BTNUM_BNOi)
+#define XFS_BTNUM_CNT	((xfs_btnum_t)XFS_BTNUM_CNTi)
+#define XFS_BTNUM_BMAP	((xfs_btnum_t)XFS_BTNUM_BMAPi)
+#define XFS_BTNUM_INO	((xfs_btnum_t)XFS_BTNUM_INOi)
+
+/*
+ * Short form header: space allocation btrees.
+ */
+typedef struct xfs_btree_sblock
+{
+	__uint32_t	bb_magic;	/* magic number for block type */
+	__uint16_t	bb_level;	/* 0 is a leaf */
+	__uint16_t	bb_numrecs;	/* current # of data records */
+	xfs_agblock_t	bb_leftsib;	/* left sibling block or NULLAGBLOCK */
+	xfs_agblock_t	bb_rightsib;	/* right sibling block or NULLAGBLOCK */
+} xfs_btree_sblock_t;
+
+/*
+ * Long form header: bmap btrees.
+ */
+typedef struct xfs_btree_lblock
+{
+	__uint32_t	bb_magic;	/* magic number for block type */
+	__uint16_t	bb_level;	/* 0 is a leaf */
+	__uint16_t	bb_numrecs;	/* current # of data records */
+	xfs_dfsbno_t	bb_leftsib;	/* left sibling block or NULLDFSBNO */
+	xfs_dfsbno_t	bb_rightsib;	/* right sibling block or NULLDFSBNO */
+} xfs_btree_lblock_t;
+
+/*
+ * Combined header and structure, used by common code.
+ */
+typedef struct xfs_btree_hdr
+{
+	__uint32_t	bb_magic;	/* magic number for block type */
+	__uint16_t	bb_level;	/* 0 is a leaf */
+	__uint16_t	bb_numrecs;	/* current # of data records */
+} xfs_btree_hdr_t;
+
+typedef struct xfs_btree_block
+{
+	xfs_btree_hdr_t bb_h;		/* header */
+	union		{
+		struct	{
+			xfs_agblock_t	bb_leftsib;
+			xfs_agblock_t	bb_rightsib;
+		}	s;		/* short form pointers */
+		struct	{
+			xfs_dfsbno_t	bb_leftsib;
+			xfs_dfsbno_t	bb_rightsib;
+		}	l;		/* long form pointers */
+	}		bb_u;		/* rest */
+} xfs_btree_block_t;
+
+/*
+ * For logging record fields.
+ */
+#define XFS_BB_MAGIC		0x01
+#define XFS_BB_LEVEL		0x02
+#define XFS_BB_NUMRECS		0x04
+#define XFS_BB_LEFTSIB		0x08
+#define XFS_BB_RIGHTSIB		0x10
+#define XFS_BB_NUM_BITS		5
+#define XFS_BB_ALL_BITS		((1 << XFS_BB_NUM_BITS) - 1)
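+
+/*
+ * Bit i above is meant to index entry i of a field-offset table of
+ * the kind interpreted by xfs_btree_offsets() when logging pieces of
+ * a block header.
+ */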
+
+/*
+ * Boolean to select which form of xfs_btree_block_t.bb_u to use.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BTREE_LONG_PTRS)
+int xfs_btree_long_ptrs(xfs_btnum_t btnum);
+#define XFS_BTREE_LONG_PTRS(btnum)	xfs_btree_long_ptrs(btnum)
+#else
+#define XFS_BTREE_LONG_PTRS(btnum)	((btnum) == XFS_BTNUM_BMAP)
+#endif
+
+/*
+ * Magic numbers for btree blocks.
+ */
+extern const __uint32_t xfs_magics[];
+
+/*
+ * Maximum and minimum records in a btree block.
+ * Given block size, type prefix, and leaf flag (0 or 1).
+ * The divisor below is equivalent to
+ *	lf ? sizeof(rec) : (sizeof(key) + sizeof(ptr))
+ * but is written arithmetically because the conditional form produces
+ * compiler warnings.
+ */
+#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf)	\
+	((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \
+	 (((lf) * (uint)sizeof(t ## _rec_t)) + \
+	  ((1 - (lf)) * \
+	   ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t))))))
+#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf)	\
+	(XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2)
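+
+/*
+ * For example, with a 4096-byte block, a leaf (lf == 1) holds
+ *	(4096 - sizeof(t ## _block_t)) / sizeof(t ## _rec_t)
+ * records, while a non-leaf (lf == 0) divides the same space by
+ * sizeof(t ## _key_t) + sizeof(t ## _ptr_t) instead.
+ */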
+
+/*
+ * Record, key, and pointer address calculation macros.
+ * Given block size, type prefix, block pointer, and index of requested entry
+ * (first entry numbered 1).
+ */
+#define XFS_BTREE_REC_ADDR(bsz,t,bb,i,mxr)	\
+	((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \
+	 ((i) - 1) * sizeof(t ## _rec_t)))
+#define XFS_BTREE_KEY_ADDR(bsz,t,bb,i,mxr)	\
+	((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \
+	 ((i) - 1) * sizeof(t ## _key_t)))
+#define XFS_BTREE_PTR_ADDR(bsz,t,bb,i,mxr)	\
+	((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \
+	 (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t)))
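+
+/*
+ * A non-leaf block is thus laid out as
+ *	[ header | key[1] .. key[mxr] | ptr[1] .. ptr[mxr] ]
+ * which is why XFS_BTREE_PTR_ADDR must skip mxr keys, while the
+ * record (leaf) and key forms start right after the header.  Entries
+ * number from 1; bsz is unused but kept so all three forms share one
+ * argument list.
+ */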
+
+#define XFS_BTREE_MAXLEVELS	8	/* max of all btrees */
+
+/*
+ * Btree cursor structure.
+ * This collects all information needed by the btree code in one place.
+ */
+typedef struct xfs_btree_cur
+{
+	struct xfs_trans	*bc_tp; /* transaction we're in, if any */
+	struct xfs_mount	*bc_mp; /* file system mount struct */
+	union {
+		xfs_alloc_rec_t		a;
+		xfs_bmbt_irec_t		b;
+		xfs_inobt_rec_t		i;
+	}		bc_rec;		/* current insert/search record value */
+	struct xfs_buf	*bc_bufs[XFS_BTREE_MAXLEVELS];	/* buf ptr per level */
+	int		bc_ptrs[XFS_BTREE_MAXLEVELS];	/* key/record # */
+	__uint8_t	bc_ra[XFS_BTREE_MAXLEVELS];	/* readahead bits */
+#define XFS_BTCUR_LEFTRA	1	/* left sibling has been read-ahead */
+#define XFS_BTCUR_RIGHTRA	2	/* right sibling has been read-ahead */
+	__uint8_t	bc_nlevels;	/* number of levels in the tree */
+	__uint8_t	bc_blocklog;	/* log2(blocksize) of btree blocks */
+	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
+	union {
+		struct {			/* needed for BNO, CNT */
+			struct xfs_buf	*agbp;	/* agf buffer pointer */
+			xfs_agnumber_t	agno;	/* ag number */
+		} a;
+		struct {			/* needed for BMAP */
+			struct xfs_inode *ip;	/* pointer to our inode */
+			struct xfs_bmap_free *flist;	/* list to free after */
+			xfs_fsblock_t	firstblock;	/* 1st blk allocated */
+			int		allocated;	/* count of alloced */
+			short		forksize;	/* fork's inode space */
+			char		whichfork;	/* data or attr fork */
+			char		flags;		/* flags */
+#define XFS_BTCUR_BPRV_WASDEL	1			/* was delayed */
+		} b;
+		struct {			/* needed for INO */
+			struct xfs_buf	*agbp;	/* agi buffer pointer */
+			xfs_agnumber_t	agno;	/* ag number */
+		} i;
+	}		bc_private;	/* per-btree type data */
+} xfs_btree_cur_t;
+
+#define XFS_BTREE_NOERROR	0
+#define XFS_BTREE_ERROR		1
+
+/*
+ * Convert from buffer to btree block header.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BLOCK)
+xfs_btree_block_t *xfs_buf_to_block(struct xfs_buf *bp);
+#define XFS_BUF_TO_BLOCK(bp)	xfs_buf_to_block(bp)
+#else
+#define XFS_BUF_TO_BLOCK(bp)	((xfs_btree_block_t *)(XFS_BUF_PTR(bp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_LBLOCK)
+xfs_btree_lblock_t *xfs_buf_to_lblock(struct xfs_buf *bp);
+#define XFS_BUF_TO_LBLOCK(bp)	xfs_buf_to_lblock(bp)
+#else
+#define XFS_BUF_TO_LBLOCK(bp)	((xfs_btree_lblock_t *)(XFS_BUF_PTR(bp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_SBLOCK)
+xfs_btree_sblock_t *xfs_buf_to_sblock(struct xfs_buf *bp);
+#define XFS_BUF_TO_SBLOCK(bp)	xfs_buf_to_sblock(bp)
+#else
+#define XFS_BUF_TO_SBLOCK(bp)	((xfs_btree_sblock_t *)(XFS_BUF_PTR(bp)))
+#endif
+
+#ifdef __KERNEL__
+
+#ifdef DEBUG
+/*
+ * Debug routine: check that block header is ok.
+ */
+void
+xfs_btree_check_block(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_block_t	*block, /* generic btree block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp);	/* buffer containing block, if any */
+
+/*
+ * Debug routine: check that keys are in the right order.
+ */
+void
+xfs_btree_check_key(
+	xfs_btnum_t		btnum,	/* btree identifier */
+	void			*ak1,	/* pointer to left (lower) key */
+	void			*ak2);	/* pointer to right (higher) key */
+
+/*
+ * Debug routine: check that records are in the right order.
+ */
+void
+xfs_btree_check_rec(
+	xfs_btnum_t		btnum,	/* btree identifier */
+	void			*ar1,	/* pointer to left (lower) record */
+	void			*ar2);	/* pointer to right (higher) record */
+#else
+#define xfs_btree_check_block(a,b,c,d)
+#define xfs_btree_check_key(a,b,c)
+#define xfs_btree_check_rec(a,b,c)
+#endif	/* DEBUG */
+
+/*
+ * Checking routine: check that long form block header is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_lblock_t	*block, /* btree long form block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp);	/* buffer containing block, if any */
+
+/*
+ * Checking routine: check that (long) pointer is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_dfsbno_t		ptr,	/* btree block disk address */
+	int			level); /* btree block level */
+
+/*
+ * Checking routine: check that short form block header is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_sblock_t	*block, /* btree short form block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp);	/* buffer containing block */
+
+/*
+ * Checking routine: check that (short) pointer is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_agblock_t		ptr,	/* btree block disk address */
+	int			level); /* btree block level */
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			error); /* del because of error */
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int					/* error */
+xfs_btree_dup_cursor(
+	xfs_btree_cur_t		*cur,	/* input cursor */
+	xfs_btree_cur_t		**ncur);/* output cursor */
+
+/*
+ * Change the cursor to point to the first record in the current block
+ * at the given level.	Other levels are unaffected.
+ */
+int					/* success=1, failure=0 */
+xfs_btree_firstrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level); /* level to change */
+
+/*
+ * Retrieve the block pointer from the cursor at the given level.
+ * This may be a bmap btree root or from a buffer.
+ */
+xfs_btree_block_t *			/* generic btree block pointer */
+xfs_btree_get_block(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree */
+	struct xfs_buf		**bpp); /* buffer containing the block */
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+struct xfs_buf *				/* buffer for fsbno */
+xfs_btree_get_bufl(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_fsblock_t		fsbno,	/* file system block number */
+	uint			lock);	/* lock flags for get_buf */
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+struct xfs_buf *				/* buffer for agno/agbno */
+xfs_btree_get_bufs(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	xfs_agblock_t		agbno,	/* allocation group block number */
+	uint			lock);	/* lock flags for get_buf */
+
+/*
+ * Allocate a new btree cursor.
+ * The cursor is either for allocation (A) or bmap (B) or inodes (I).
+ */
+xfs_btree_cur_t *			/* new btree cursor */
+xfs_btree_init_cursor(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	struct xfs_buf		*agbp,	/* (A only) buffer for agf structure */
+					/* (I only) buffer for agi structure */
+	xfs_agnumber_t		agno,	/* (AI only) allocation group number */
+	xfs_btnum_t		btnum,	/* btree identifier */
+	struct xfs_inode	*ip,	/* (B only) inode owning the btree */
+	int			whichfork); /* (B only) data/attr fork */
+
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int					/* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level); /* level to check */
+
+/*
+ * Change the cursor to point to the last record in the current block
+ * at the given level.	Other levels are unaffected.
+ */
+int					/* success=1, failure=0 */
+xfs_btree_lastrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level); /* level to change */
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+	__int64_t		fields, /* bitmask of fields */
+	const short		*offsets, /* table of field offsets */
+	int			nbits,	/* number of bits to inspect */
+	int			*first, /* output: first byte offset */
+	int			*last); /* output: last byte offset */
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int					/* error */
+xfs_btree_read_bufl(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_fsblock_t		fsbno,	/* file system block number */
+	uint			lock,	/* lock flags for read_buf */
+	struct xfs_buf		**bpp,	/* buffer for fsbno */
+	int			refval);/* ref count value for buffer */
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Short-form addressing.
+ */
+int					/* error */
+xfs_btree_read_bufs(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	xfs_agblock_t		agbno,	/* allocation group block number */
+	uint			lock,	/* lock flags for read_buf */
+	struct xfs_buf		**bpp,	/* buffer for agno/agbno */
+	int			refval);/* ref count value for buffer */
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+void
+xfs_btree_reada_bufl(
+	struct xfs_mount	*mp,	/* file system mount point */
+	xfs_fsblock_t		fsbno,	/* file system block number */
+	xfs_extlen_t		count); /* count of filesystem blocks */
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+void
+xfs_btree_reada_bufs(
+	struct xfs_mount	*mp,	/* file system mount point */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	xfs_agblock_t		agbno,	/* allocation group block number */
+	xfs_extlen_t		count); /* count of filesystem blocks */
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+int					/* readahead block count */
+xfs_btree_readahead_core(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			lev,	/* level in btree */
+	int			lr);	/* left/right bits */
+
+static inline int			/* readahead block count */
+xfs_btree_readahead(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			lev,	/* level in btree */
+	int			lr)	/* left/right bits */
+{
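+	/*
+	 * If every requested left/right bit is already set for this
+	 * level, the readahead has been issued before; nothing to do.
+	 */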
+	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+		return 0;
+
+	return xfs_btree_readahead_core(cur, lev, lr);
+}
+
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
+ */
+void
+xfs_btree_setbuf(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			lev,	/* level in btree */
+	struct xfs_buf		*bp);	/* new buffer to set */
+
+#endif	/* __KERNEL__ */
+
+
+/*
+ * Min and max functions for extlen, agblock, fileoff, and filblks types.
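+ * Each is built either as a real out-of-line function or as an
+ * open-coded macro, selected at compile time by the XFS_WANT_FUNCS /
+ * XFS_WANT_SPACE conditionals below; the two forms are equivalent.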
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MIN)
+xfs_extlen_t xfs_extlen_min(xfs_extlen_t a, xfs_extlen_t b);
+#define XFS_EXTLEN_MIN(a,b)	xfs_extlen_min(a,b)
+#else
+#define XFS_EXTLEN_MIN(a,b)	\
+	((xfs_extlen_t)(a) < (xfs_extlen_t)(b) ? \
+	 (xfs_extlen_t)(a) : (xfs_extlen_t)(b))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MAX)
+xfs_extlen_t xfs_extlen_max(xfs_extlen_t a, xfs_extlen_t b);
+#define XFS_EXTLEN_MAX(a,b)	xfs_extlen_max(a,b)
+#else
+#define XFS_EXTLEN_MAX(a,b)	\
+	((xfs_extlen_t)(a) > (xfs_extlen_t)(b) ? \
+	 (xfs_extlen_t)(a) : (xfs_extlen_t)(b))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MIN)
+xfs_agblock_t xfs_agblock_min(xfs_agblock_t a, xfs_agblock_t b);
+#define XFS_AGBLOCK_MIN(a,b)	xfs_agblock_min(a,b)
+#else
+#define XFS_AGBLOCK_MIN(a,b)	\
+	((xfs_agblock_t)(a) < (xfs_agblock_t)(b) ? \
+	 (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MAX)
+xfs_agblock_t xfs_agblock_max(xfs_agblock_t a, xfs_agblock_t b);
+#define XFS_AGBLOCK_MAX(a,b)	xfs_agblock_max(a,b)
+#else
+#define XFS_AGBLOCK_MAX(a,b)	\
+	((xfs_agblock_t)(a) > (xfs_agblock_t)(b) ? \
+	 (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MIN)
+xfs_fileoff_t xfs_fileoff_min(xfs_fileoff_t a, xfs_fileoff_t b);
+#define XFS_FILEOFF_MIN(a,b)	xfs_fileoff_min(a,b)
+#else
+#define XFS_FILEOFF_MIN(a,b)	\
+	((xfs_fileoff_t)(a) < (xfs_fileoff_t)(b) ? \
+	 (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MAX)
+xfs_fileoff_t xfs_fileoff_max(xfs_fileoff_t a, xfs_fileoff_t b);
+#define XFS_FILEOFF_MAX(a,b)	xfs_fileoff_max(a,b)
+#else
+#define XFS_FILEOFF_MAX(a,b)	\
+	((xfs_fileoff_t)(a) > (xfs_fileoff_t)(b) ? \
+	 (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MIN)
+xfs_filblks_t xfs_filblks_min(xfs_filblks_t a, xfs_filblks_t b);
+#define XFS_FILBLKS_MIN(a,b)	xfs_filblks_min(a,b)
+#else
+#define XFS_FILBLKS_MIN(a,b)	\
+	((xfs_filblks_t)(a) < (xfs_filblks_t)(b) ? \
+	 (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MAX)
+xfs_filblks_t xfs_filblks_max(xfs_filblks_t a, xfs_filblks_t b);
+#define XFS_FILBLKS_MAX(a,b)	xfs_filblks_max(a,b)
+#else
+#define XFS_FILBLKS_MAX(a,b)	\
+	((xfs_filblks_t)(a) > (xfs_filblks_t)(b) ? \
+	 (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_SANITY_CHECK)
+int xfs_fsb_sanity_check(struct xfs_mount *mp, xfs_fsblock_t fsb);
+#define XFS_FSB_SANITY_CHECK(mp,fsb)	xfs_fsb_sanity_check(mp,fsb)
+#else
+#define XFS_FSB_SANITY_CHECK(mp,fsb)	\
+	(XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
+	 XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
+#endif
+
+/*
+ * Macros to set EFSCORRUPTED & return/branch.
+ */
+#define XFS_WANT_CORRUPTED_GOTO(x,l)	\
+	{ \
+		int fs_is_ok = (x); \
+		ASSERT(fs_is_ok); \
+		if (!fs_is_ok) { \
+			error = XFS_ERROR(EFSCORRUPTED); \
+			goto l; \
+		} \
+	}
+
+#define XFS_WANT_CORRUPTED_RETURN(x)	\
+	{ \
+		int fs_is_ok = (x); \
+		ASSERT(fs_is_ok); \
+		if (!fs_is_ok) \
+			return XFS_ERROR(EFSCORRUPTED); \
+	}
+
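+/*
+ * Hedged usage sketch (not introduced by this patch): callers verify
+ * conditions that can only fail on a corrupt filesystem, e.g. after a
+ * btree lookup that must succeed, as the allocator does:
+ *
+ *	if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
+ *		goto error0;
+ *	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ *
+ * Note that both macros expect a local variable named "error".
+ */
+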
+#endif	/* __XFS_BTREE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_buf.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_buf.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_buf.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_buf.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BUF_H__
+#define __XFS_BUF_H__
+
+/*
+ * These are just for xfs_syncsub: it sets an internal variable,
+ * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly
+ * allocated buf_t.
+ */
+#define XFS_B_ASYNC  PBF_ASYNC
+#define XFS_B_DELWRI PBF_DELWRI
+#define XFS_B_READ   PBF_READ
+#define XFS_B_WRITE  PBF_WRITE
+#define XFS_B_STALE  PBF_STALE
+#define XFS_BUF_TRYLOCK		PBF_TRYLOCK
+#define XFS_INCORE_TRYLOCK	PBF_TRYLOCK
+#define XFS_BUF_LOCK		PBF_LOCK
+#define XFS_BUF_MAPPED		PBF_MAPPED
+
+#define BUF_BUSY	PBF_DONT_BLOCK
+
+#define XFS_BUF_BFLAGS(x)	 ((x)->pb_flags)  /* debugging routines might need this */
+#define XFS_BUF_ZEROFLAGS(x)	\
+	((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_DELWRI))
+
+#define XFS_BUF_STALE(x)	     ((x)->pb_flags |= XFS_B_STALE)
+#define XFS_BUF_UNSTALE(x)	     ((x)->pb_flags &= ~XFS_B_STALE)
+#define XFS_BUF_ISSTALE(x)	     ((x)->pb_flags & XFS_B_STALE)
+#define XFS_BUF_SUPER_STALE(x)	do {				\
+		(x)->pb_flags |= XFS_B_STALE;			\
+		xfs_buf_undelay(x);				\
+		(x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE);	\
+	} while (0)
+
+static inline void xfs_buf_undelay(page_buf_t *pb)
+{
+	if (pb->pb_flags & PBF_DELWRI) {
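+		/*
+		 * pb_list links the buffer on the delwri queue; the
+		 * check below is an open-coded !list_empty(), and a
+		 * queued buffer also holds a queue reference to drop.
+		 */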
+		if (pb->pb_list.next != &pb->pb_list) {
+			pagebuf_delwri_dequeue(pb);
+			pagebuf_rele(pb);
+		} else {
+			pb->pb_flags &= ~PBF_DELWRI;
+		}
+	}
+}
+
+#define XFS_BUF_DELAYWRITE(x)	 ((x)->pb_flags |= PBF_DELWRI)
+#define XFS_BUF_UNDELAYWRITE(x)	 xfs_buf_undelay(x)
+#define XFS_BUF_ISDELAYWRITE(x)	 ((x)->pb_flags & PBF_DELWRI)
+
+#define XFS_BUF_ERROR(x,no)	 pagebuf_ioerror(x,no)
+#define XFS_BUF_GETERROR(x)	 pagebuf_geterror(x)
+#define XFS_BUF_ISERROR(x)	 (pagebuf_geterror(x)?1:0)
+
+#define XFS_BUF_DONE(x)		 ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE))
+#define XFS_BUF_UNDONE(x)	 ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE)
+#define XFS_BUF_ISDONE(x)	 (!(PBF_NOT_DONE(x)))
+
+#define XFS_BUF_BUSY(x)		 ((x)->pb_flags |= PBF_FORCEIO)
+#define XFS_BUF_UNBUSY(x)	 ((x)->pb_flags &= ~PBF_FORCEIO)
+#define XFS_BUF_ISBUSY(x)	 (1)
+
+#define XFS_BUF_ASYNC(x)	 ((x)->pb_flags |= PBF_ASYNC)
+#define XFS_BUF_UNASYNC(x)	 ((x)->pb_flags &= ~PBF_ASYNC)
+#define XFS_BUF_ISASYNC(x)	 ((x)->pb_flags & PBF_ASYNC)
+
+#define XFS_BUF_FLUSH(x)	 ((x)->pb_flags |= PBF_FLUSH)
+#define XFS_BUF_UNFLUSH(x)	 ((x)->pb_flags &= ~PBF_FLUSH)
+#define XFS_BUF_ISFLUSH(x)	 ((x)->pb_flags & PBF_FLUSH)
+
+#define XFS_BUF_SHUT(x)		 printk("XFS_BUF_SHUT not implemented yet\n")
+#define XFS_BUF_UNSHUT(x)	 printk("XFS_BUF_UNSHUT not implemented yet\n")
+#define XFS_BUF_ISSHUT(x)	 (0)
+
+#define XFS_BUF_HOLD(x)		pagebuf_hold(x)
+#define XFS_BUF_READ(x)		((x)->pb_flags |= PBF_READ)
+#define XFS_BUF_UNREAD(x)	((x)->pb_flags &= ~PBF_READ)
+#define XFS_BUF_ISREAD(x)	((x)->pb_flags & PBF_READ)
+
+#define XFS_BUF_WRITE(x)	((x)->pb_flags |= PBF_WRITE)
+#define XFS_BUF_UNWRITE(x)	((x)->pb_flags &= ~PBF_WRITE)
+#define XFS_BUF_ISWRITE(x)	((x)->pb_flags & PBF_WRITE)
+
+#define XFS_BUF_ISUNINITIAL(x)	 ((x)->pb_flags & PBF_UNINITIAL)
+#define XFS_BUF_UNUNINITIAL(x)	 ((x)->pb_flags &= ~PBF_UNINITIAL)
+
+#define XFS_BUF_BP_ISMAPPED(bp)	 1
+
+typedef struct page_buf_s xfs_buf_t;
+#define xfs_buf page_buf_s
+
+typedef struct pb_target xfs_buftarg_t;
+#define xfs_buftarg pb_target
+
+#define XFS_BUF_IODONE_FUNC(buf)	(buf)->pb_iodone
+#define XFS_BUF_SET_IODONE_FUNC(buf, func)	\
+			(buf)->pb_iodone = (func)
+#define XFS_BUF_CLR_IODONE_FUNC(buf)		\
+			(buf)->pb_iodone = NULL
+#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func)	\
+			(buf)->pb_strat = (func)
+#define XFS_BUF_CLR_BDSTRAT_FUNC(buf)		\
+			(buf)->pb_strat = NULL
+
+#define XFS_BUF_FSPRIVATE(buf, type)		\
+			((type)(buf)->pb_fspriv)
+#define XFS_BUF_SET_FSPRIVATE(buf, value)	\
+			(buf)->pb_fspriv = (void *)(value)
+#define XFS_BUF_FSPRIVATE2(buf, type)		\
+			((type)(buf)->pb_fspriv2)
+#define XFS_BUF_SET_FSPRIVATE2(buf, value)	\
+			(buf)->pb_fspriv2 = (void *)(value)
+#define XFS_BUF_FSPRIVATE3(buf, type)		\
+			((type)(buf)->pb_fspriv3)
+#define XFS_BUF_SET_FSPRIVATE3(buf, value)	\
+			(buf)->pb_fspriv3  = (void *)(value)
+#define XFS_BUF_SET_START(buf)
+
+#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \
+			(buf)->pb_relse = (value)
+
+#define XFS_BUF_PTR(bp)		(xfs_caddr_t)((bp)->pb_addr)
+
+extern inline xfs_caddr_t xfs_buf_offset(page_buf_t *bp, off_t offset)
+{
+	if (bp->pb_flags & PBF_MAPPED)
+		return XFS_BUF_PTR(bp) + offset;
+	return (xfs_caddr_t) pagebuf_offset(bp, offset);
+}
+
+#define XFS_BUF_SET_PTR(bp, val, count)		\
+				pagebuf_associate_memory(bp, val, count)
+#define XFS_BUF_ADDR(bp)	((bp)->pb_bn)
+#define XFS_BUF_OFFSET(bp)	((bp)->pb_file_offset >> 9)
+#define XFS_BUF_SET_ADDR(bp, blk)		\
+			((bp)->pb_bn = (page_buf_daddr_t)(blk))
+#define XFS_BUF_COUNT(bp)	((bp)->pb_count_desired)
+#define XFS_BUF_SET_COUNT(bp, cnt)		\
+			((bp)->pb_count_desired = cnt)
+#define XFS_BUF_SIZE(bp)	((bp)->pb_buffer_length)
+#define XFS_BUF_SET_SIZE(bp, cnt)		\
+			((bp)->pb_buffer_length = cnt)
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
+#define XFS_BUF_SET_VTYPE(bp, type)
+#define XFS_BUF_SET_REF(bp, ref)
+
+#define XFS_BUF_ISPINNED(bp)   pagebuf_ispin(bp)
+
+#define XFS_BUF_VALUSEMA(bp)	pagebuf_lock_value(bp)
+#define XFS_BUF_CPSEMA(bp)	(pagebuf_cond_lock(bp) == 0)
+#define XFS_BUF_VSEMA(bp)	pagebuf_unlock(bp)
+#define XFS_BUF_PSEMA(bp,x)	pagebuf_lock(bp)
+#define XFS_BUF_V_IODONESEMA(bp) up(&(bp)->pb_iodonesema)
+
+/* setup the buffer target from a buftarg structure */
+#define XFS_BUF_SET_TARGET(bp, target)	\
+	(bp)->pb_target = (target)
+
+#define XFS_BUF_TARGET_DEV(bp)	((bp)->pb_target->pbr_dev)
+
+#define xfs_buf_read(target, blkno, len, flags) \
+		pagebuf_get((target), (blkno), (len), \
+			PBF_LOCK | PBF_READ | PBF_MAPPED | PBF_MAPPABLE)
+#define xfs_buf_get(target, blkno, len, flags) \
+		pagebuf_get((target), (blkno), (len), \
+			PBF_LOCK | PBF_MAPPED | PBF_MAPPABLE)
+
+#define xfs_buf_read_flags(target, blkno, len, flags) \
+		pagebuf_get((target), (blkno), (len), \
+			PBF_READ | PBF_MAPPABLE | flags)
+#define xfs_buf_get_flags(target, blkno, len, flags) \
+		pagebuf_get((target), (blkno), (len), \
+			PBF_MAPPABLE | flags)
+
+static inline int	xfs_bawrite(void *mp, page_buf_t *bp)
+{
+	extern int	xfs_bdstrat_cb(struct xfs_buf *);
+	int ret;
+
+	bp->pb_fspriv3 = mp;
+	bp->pb_strat = xfs_bdstrat_cb;
+	xfs_buf_undelay(bp);
+	if ((ret = pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC)) == 0)
+		run_task_queue(&tq_disk);
+	return ret;
+}
+
+static inline void	xfs_buf_relse(page_buf_t *bp)
+{
+	if ((bp->pb_flags & _PBF_LOCKABLE) && !bp->pb_relse)
+		pagebuf_unlock(bp);
+
+	pagebuf_rele(bp);
+}
+
+
+#define xfs_bpin(bp)		pagebuf_pin(bp)
+#define xfs_bunpin(bp)		pagebuf_unpin(bp)
+
+#ifdef PAGEBUF_TRACE
+# define PB_DEFINE_TRACES
+# include <pagebuf/page_buf_trace.h>
+# define xfs_buftrace(id, bp)	PB_TRACE(bp, PB_TRACE_REC(external), (void *)id)
+#else
+# define xfs_buftrace(id, bp)	do { } while (0)
+#endif
+
+
+#define xfs_biodone(pb)		    \
+	    pagebuf_iodone(pb)
+
+#define xfs_incore(buftarg,blkno,len,lockit) \
+	    pagebuf_find(buftarg, blkno, len, lockit)
+
+
+#define xfs_biomove(pb, off, len, data, rw) \
+	    pagebuf_iomove((pb), (off), (len), (data), \
+		((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ)
+
+#define xfs_biozero(pb, off, len) \
+	    pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO)
+
+
+static inline int	XFS_bwrite(page_buf_t *pb)
+{
+	int	sync = (pb->pb_flags & PBF_ASYNC) == 0;
+	int	error;
+
+	pb->pb_flags |= PBF_SYNC;
+
+	xfs_buf_undelay(pb);
+
+	__pagebuf_iorequest(pb);
+
+	if (sync) {
+		error = pagebuf_iowait(pb);
+		xfs_buf_relse(pb);
+	} else {
+		run_task_queue(&tq_disk);
+		error = 0;
+	}
+
+	return error;
+}
+
+
+#define XFS_bdwrite(pb)		     \
+	    pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC)
+
+static inline int xfs_bdwrite(void *mp, page_buf_t *bp)
+{
+	extern int	xfs_bdstrat_cb(struct xfs_buf *);
+
+	bp->pb_strat = xfs_bdstrat_cb;
+	bp->pb_fspriv3 = mp;
+
+	return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC);
+}
+
+#define XFS_bdstrat(bp) pagebuf_iorequest(bp)
+
+#define xfs_iowait(pb)	pagebuf_iowait(pb)
+
+
+/*
+ * Go through all incore buffers, and release buffers
+ * if they belong to the given device. This is used in
+ * filesystem error handling to preserve the consistency
+ * of its metadata.
+ */
+
+extern void XFS_bflush(xfs_buftarg_t *);
+#define xfs_binval(buftarg) XFS_bflush(buftarg)
+
+#define xfs_incore_relse(buftarg,delwri_only,wait)	\
+       pagebuf_target_clear(buftarg)
+
+
+#define xfs_baread(target, rablkno, ralen)  \
+	pagebuf_readahead((target), (rablkno), \
+			  (ralen), PBF_DONT_BLOCK)
+
+#define XFS_getrbuf(sleep,mp)	\
+	pagebuf_get_empty((mp)->m_ddev_targp)
+#define XFS_ngetrbuf(len,mp)	\
+	pagebuf_get_no_daddr(len,(mp)->m_ddev_targp)
+#define XFS_freerbuf(bp)	pagebuf_free(bp)
+#define XFS_nfreerbuf(bp)	pagebuf_free(bp)
+
+#endif	/* __XFS_BUF_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_buf_item.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_buf_item.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_buf_item.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_buf_item.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,1203 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains the implementation of the xfs_buf_log_item.
+ * It contains the item operations used to manipulate the buf log
+ * items as well as utility routines used by the buffer specific
+ * transaction routines.
+ */
+
+#include <xfs.h>
+
+
+#define ROUNDUPNBWORD(x)	(((x) + (NBWORD - 1)) & ~(NBWORD - 1))
+
+kmem_zone_t	*xfs_buf_item_zone;
+
+#ifdef XFS_TRANS_DEBUG
+/*
+ * This function uses an alternate strategy for tracking the bytes
+ * that the user requests to be logged.	 This can then be used
+ * in conjunction with the bli_orig array in the buf log item to
+ * catch bugs in our callers' code.
+ *
+ * We also double check the bits set in xfs_buf_item_log using a
+ * simple algorithm to check that every byte is accounted for.
+ */
+STATIC void
+xfs_buf_item_log_debug(
+	xfs_buf_log_item_t	*bip,
+	uint			first,
+	uint			last)
+{
+	uint	x;
+	uint	byte;
+	uint	nbytes;
+	uint	chunk_num;
+	uint	word_num;
+	uint	bit_num;
+	uint	bit_set;
+	uint	*wordp;
+
+	ASSERT(bip->bli_logged != NULL);
+	byte = first;
+	nbytes = last - first + 1;
+	bfset(bip->bli_logged, first, nbytes);
+	for (x = 0; x < nbytes; x++) {
+		chunk_num = byte >> XFS_BLI_SHIFT;
+		word_num = chunk_num >> BIT_TO_WORD_SHIFT;
+		bit_num = chunk_num & (NBWORD - 1);
+		wordp = &(bip->bli_format.blf_data_map[word_num]);
+		bit_set = *wordp & (1 << bit_num);
+		ASSERT(bit_set);
+		byte++;
+	}
+}
+
+/*
+ * This function is called when we flush something into a buffer without
+ * logging it.	This happens for things like inodes which are logged
+ * separately from the buffer.
+ */
+void
+xfs_buf_item_flush_log_debug(
+	xfs_buf_t	*bp,
+	uint		first,
+	uint		last)
+{
+	xfs_buf_log_item_t	*bip;
+	uint			nbytes;
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+	if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) {
+		return;
+	}
+
+	ASSERT(bip->bli_logged != NULL);
+	nbytes = last - first + 1;
+	bfset(bip->bli_logged, first, nbytes);
+}
+
+/*
+ * This function is called to verify that our callers have logged
+ * all the bytes that they changed.
+ *
+ * It does this by comparing the original copy of the buffer stored in
+ * the buf log item's bli_orig array to the current copy of the buffer
+ * and ensuring that all bytes which miscompare are set in the bli_logged
+ * array of the buf log item.
+ */
+STATIC void
+xfs_buf_item_log_check(
+	xfs_buf_log_item_t	*bip)
+{
+	char		*orig;
+	char		*buffer;
+	int		x;
+	xfs_buf_t	*bp;
+
+	ASSERT(bip->bli_orig != NULL);
+	ASSERT(bip->bli_logged != NULL);
+
+	bp = bip->bli_buf;
+	ASSERT(XFS_BUF_COUNT(bp) > 0);
+	ASSERT(XFS_BUF_PTR(bp) != NULL);
+	orig = bip->bli_orig;
+	buffer = XFS_BUF_PTR(bp);
+	for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
+		if (orig[x] != buffer[x] && !btst(bip->bli_logged, x))
+			cmn_err(CE_PANIC,
+	"xfs_buf_item_log_check bip %x buffer %x orig %x index %d",
+				bip, bp, orig, x);
+	}
+}
+#else
+#define		xfs_buf_item_log_debug(x,y,z)
+#define		xfs_buf_item_log_check(x)
+#endif
+
+STATIC void	xfs_buf_error_relse(xfs_buf_t *bp);
+
+/*
+ * This returns the number of log iovecs needed to log the
+ * given buf log item.
+ *
+ * It calculates this as 1 iovec for the buf log format structure
+ * and 1 for each stretch of non-contiguous chunks to be logged.
+ * Contiguous chunks are logged in a single iovec.
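+ * For example, a data map of 0b1101 (chunks 0, 2 and 3 dirty) needs
+ * three iovecs: one for the format structure, one for chunk 0, and
+ * one for the contiguous run of chunks 2-3.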
+ *
+ * If the XFS_BLI_STALE flag has been set, then log nothing.
+ */
+uint
+xfs_buf_item_size(
+	xfs_buf_log_item_t	*bip)
+{
+	uint		nvecs;
+	int		next_bit;
+	int		last_bit;
+	xfs_buf_t	*bp;
+
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		/*
+		 * The buffer is stale, so all we need to log
+		 * is the buf log format structure with the
+		 * cancel flag in it.
+		 */
+		xfs_buf_item_trace("SIZE STALE", bip);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		return 1;
+	}
+
+	bp = bip->bli_buf;
+	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
+	nvecs = 1;
+	last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+					 bip->bli_format.blf_map_size, 0);
+	ASSERT(last_bit != -1);
+	nvecs++;
+	while (last_bit != -1) {
+		/*
+		 * This takes the bit number to start looking from and
+		 * returns the next set bit from there.	 It returns -1
+		 * if there are no more bits set or the start bit is
+		 * beyond the end of the bitmap.
+		 */
+		next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+						 bip->bli_format.blf_map_size,
+						 last_bit + 1);
+		/*
+		 * If we run out of bits, leave the loop,
+		 * else if we find a new set of bits bump the number of vecs,
+		 * else keep scanning the current set of bits.
+		 */
+		if (next_bit == -1) {
+			last_bit = -1;
+		} else if (next_bit != last_bit + 1) {
+			last_bit = next_bit;
+			nvecs++;
+		} else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
+			   (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
+			    XFS_BLI_CHUNK)) {
+			last_bit = next_bit;
+			nvecs++;
+		} else {
+			last_bit++;
+		}
+	}
+
+	xfs_buf_item_trace("SIZE NORM", bip);
+	return nvecs;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given log buf item.	It fills the first entry with a buf log
+ * format structure, and the rest point to contiguous chunks
+ * within the buffer.
+ */
+void
+xfs_buf_item_format(
+	xfs_buf_log_item_t	*bip,
+	xfs_log_iovec_t		*log_vector)
+{
+	uint		base_size;
+	uint		nvecs;
+	xfs_log_iovec_t *vecp;
+	xfs_buf_t	*bp;
+	int		first_bit;
+	int		last_bit;
+	int		next_bit;
+	uint		nbits;
+	uint		buffer_offset;
+
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+	       (bip->bli_flags & XFS_BLI_STALE));
+	bp = bip->bli_buf;
+	ASSERT(XFS_BUF_BP_ISMAPPED(bp));
+	vecp = log_vector;
+
+	/*
+	 * The size of the base structure is the size of the
+	 * declared structure plus the space for the extra words
+	 * of the bitmap.  We subtract one from the map size, because
+	 * the first element of the bitmap is accounted for in the
+	 * size of the base structure.
+	 */
+	base_size =
+		(uint)(sizeof(xfs_buf_log_format_t) +
+		       ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
+	vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
+	vecp->i_len = base_size;
+	vecp++;
+	nvecs = 1;
+
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		/*
+		 * The buffer is stale, so all we need to log
+		 * is the buf log format structure with the
+		 * cancel flag in it.
+		 */
+		xfs_buf_item_trace("FORMAT STALE", bip);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		bip->bli_format.blf_size = nvecs;
+		return;
+	}
+
+	/*
+	 * Fill in an iovec for each set of contiguous chunks.
+	 */
+	first_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+					 bip->bli_format.blf_map_size, 0);
+	ASSERT(first_bit != -1);
+	last_bit = first_bit;
+	nbits = 1;
+	for (;;) {
+		/*
+		 * This takes the bit number to start looking from and
+		 * returns the next set bit from there.	 It returns -1
+		 * if there are no more bits set or the start bit is
+		 * beyond the end of the bitmap.
+		 */
+		next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+						 bip->bli_format.blf_map_size,
+						 (uint)last_bit + 1);
+		/*
+		 * If we run out of bits fill in the last iovec and get
+		 * out of the loop.
+		 * Else if we start a new set of bits then fill in the
+		 * iovec for the series we were looking at and start
+		 * counting the bits in the new one.
+		 * Else we're still in the same set of bits so just
+		 * keep counting and scanning.
+		 */
+		if (next_bit == -1) {
+			buffer_offset = first_bit * XFS_BLI_CHUNK;
+			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
+			vecp->i_len = nbits * XFS_BLI_CHUNK;
+			nvecs++;
+			break;
+		} else if (next_bit != last_bit + 1) {
+			buffer_offset = first_bit * XFS_BLI_CHUNK;
+			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
+			vecp->i_len = nbits * XFS_BLI_CHUNK;
+			nvecs++;
+			vecp++;
+			first_bit = next_bit;
+			last_bit = next_bit;
+			nbits = 1;
+		} else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
+			   (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
+			    XFS_BLI_CHUNK)) {
+			buffer_offset = first_bit * XFS_BLI_CHUNK;
+			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
+			vecp->i_len = nbits * XFS_BLI_CHUNK;
+			/*
+			 * You would think we need to bump nvecs here
+			 * too, but we do not: this count is used by
+			 * recovery, and it gets confused by the
+			 * boundary split here.
+			 *	nvecs++;
+			 */
+			vecp++;
+			first_bit = next_bit;
+			last_bit = next_bit;
+		} else {
+			last_bit++;
+			nbits++;
+		}
+	}
+	bip->bli_format.blf_size = nvecs;
+
+	/*
+	 * Check to make sure everything is consistent.
+	 */
+	xfs_buf_item_trace("FORMAT NORM", bip);
+	xfs_buf_item_log_check(bip);
+}
+
+/*
+ * This is called to pin the buffer associated with the buf log
+ * item in memory so it cannot be written out.	Simply call bpin()
+ * on the buffer to do this.
+ */
+void
+xfs_buf_item_pin(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t	*bp;
+
+	bp = bip->bli_buf;
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+	       (bip->bli_flags & XFS_BLI_STALE));
+	xfs_buf_item_trace("PIN", bip);
+	xfs_buftrace("XFS_PIN", bp);
+	xfs_bpin(bp);
+}
+
+
+/*
+ * This is called to unpin the buffer associated with the buf log
+ * item which was previously pinned with a call to xfs_buf_item_pin().
+ * Just call bunpin() on the buffer to do this.
+ *
+ * Also drop the reference to the buf item for the current transaction.
+ * If the XFS_BLI_STALE flag is set and we are the last reference,
+ * then free up the buf log item and unlock the buffer.
+ */
+void
+xfs_buf_item_unpin(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_mount_t	*mp;
+	xfs_buf_t	*bp;
+	int		freed;
+	SPLDECL(s);
+
+	bp = bip->bli_buf;
+	ASSERT(bp != NULL);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	xfs_buf_item_trace("UNPIN", bip);
+	xfs_buftrace("XFS_UNPIN", bp);
+
+	freed = atomic_dec_and_test(&bip->bli_refcount);
+	mp = bip->bli_item.li_mountp;
+	xfs_bunpin(bp);
+	if (freed && (bip->bli_flags & XFS_BLI_STALE)) {
+		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
+		ASSERT(XFS_BUF_ISSTALE(bp));
+/**
+		ASSERT(bp->b_pincount == 0);
+**/
+		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		xfs_buf_item_trace("UNPIN STALE", bip);
+		xfs_buftrace("XFS_UNPIN STALE", bp);
+		AIL_LOCK(mp,s);
+		/*
+		 * If we get called here because of an IO error, we may
+		 * or may not have the item on the AIL. xfs_trans_delete_ail()
+		 * will take care of that situation.
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
+		xfs_buf_item_relse(bp);
+		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
+		xfs_buf_relse(bp);
+	}
+
+}
+
+/*
+ * This is called from uncommit in the forced-shutdown path.
+ * We need to check whether the reference count on the log item
+ * is going to drop to zero.  If so, unpin will free the log item,
+ * so we also need to free the item's descriptor (which points to
+ * the item) in the transaction.
+ */
+void
+xfs_buf_item_unpin_remove(
+	xfs_buf_log_item_t	*bip,
+	xfs_trans_t		*tp)
+{
+	xfs_buf_t		*bp;
+	xfs_log_item_desc_t	*lidp;
+
+	bp = bip->bli_buf;
+	/*
+	 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
+	 */
+	if ((atomic_read(&bip->bli_refcount) == 1) &&
+	    (bip->bli_flags & XFS_BLI_STALE)) {
+		ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
+		xfs_buf_item_trace("UNPIN REMOVE", bip);
+		xfs_buftrace("XFS_UNPIN_REMOVE", bp);
+		/*
+		 * Yes -- clear the transaction descriptor's in-use
+		 * flag and free the chunk if required.  We can safely
+		 * do some work here first and then call
+		 * xfs_buf_item_unpin() to do the rest, because when
+		 * the test above is true we hold the buffer locked,
+		 * so no one else can bump up the refcount.
+		 */
+		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
+		xfs_trans_free_item(tp, lidp);
+		/*
+		 * Since the transaction no longer refers to the buffer,
+		 * the buffer should no longer refer to the transaction.
+		 */
+		XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+	}
+
+	xfs_buf_item_unpin(bip);
+
+	return;
+}
+
+/*
+ * This is called to attempt to lock the buffer associated with this
+ * buf log item.  Don't sleep on the buffer lock.  If we can't get
+ * the lock right away, return 0.  If we can get the lock, pull the
+ * buffer from the free list, mark it busy, and return 1.
+ */
+uint
+xfs_buf_item_trylock(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t	*bp;
+
+	bp = bip->bli_buf;
+
+	if (XFS_BUF_ISPINNED(bp)) {
+		return XFS_ITEM_PINNED;
+	}
+
+	if (!XFS_BUF_CPSEMA(bp)) {
+		return XFS_ITEM_LOCKED;
+	}
+
+	/*
+	 * Remove the buffer from the free list.  Only do this
+	 * if it's on the free list.  Private buffers like the
+	 * superblock buffer are not.
+	 */
+	XFS_BUF_HOLD(bp);
+
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	xfs_buf_item_trace("TRYLOCK SUCCESS", bip);
+	return XFS_ITEM_SUCCESS;
+}
+
+/*
+ * Release the buffer associated with the buf log item.
+ * If there is no dirty logged data associated with the
+ * buffer recorded in the buf log item, then free the
+ * buf log item and remove the reference to it in the
+ * buffer.
+ *
+ * This call ignores the recursion count.  It is only called
+ * when the buffer should REALLY be unlocked, regardless
+ * of the recursion count.
+ *
+ * If the XFS_BLI_HOLD flag is set in the buf log item, then
+ * free the log item if necessary but do not unlock the buffer.
+ * This is for support of xfs_trans_bhold(). Make sure the
+ * XFS_BLI_HOLD field is cleared if we don't free the item.
+ */
+void
+xfs_buf_item_unlock(
+	xfs_buf_log_item_t	*bip)
+{
+	int		aborted;
+	xfs_buf_t	*bp;
+	uint		hold;
+
+	bp = bip->bli_buf;
+	xfs_buftrace("XFS_UNLOCK", bp);
+
+	/*
+	 * Clear the buffer's association with this transaction.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+
+	/*
+	 * If this is a transaction abort, don't return early.
+	 * Instead, allow the brelse to happen.
+	 * Normally it would be done for stale (cancelled) buffers
+	 * at unpin time, but we'll never go through the pin/unpin
+	 * cycle if we abort inside commit.
+	 */
+	aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
+
+	/*
+	 * If the buf item is marked stale, then don't do anything.
+	 * We'll unlock the buffer and free the buf item when the
+	 * buffer is unpinned for the last time.
+	 */
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		bip->bli_flags &= ~XFS_BLI_LOGGED;
+		xfs_buf_item_trace("UNLOCK STALE", bip);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		if (!aborted)
+			return;
+	}
+
+	/*
+	 * Drop the transaction's reference to the log item if
+	 * it was not logged as part of the transaction.  Otherwise
+	 * we'll drop the reference in xfs_buf_item_unpin() when
+	 * the transaction is really through with the buffer.
+	 */
+	if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
+		atomic_dec(&bip->bli_refcount);
+	} else {
+		/*
+		 * Clear the logged flag since this is per
+		 * transaction state.
+		 */
+		bip->bli_flags &= ~XFS_BLI_LOGGED;
+	}
+
+	/*
+	 * Before possibly freeing the buf item, determine if we should
+	 * release the buffer at the end of this routine.
+	 */
+	hold = bip->bli_flags & XFS_BLI_HOLD;
+	xfs_buf_item_trace("UNLOCK", bip);
+
+	/*
+	 * If the buf item isn't tracking any data, free it.
+	 * Otherwise, if XFS_BLI_HOLD is set, clear it.
+	 */
+	if (xfs_count_bits(bip->bli_format.blf_data_map,
+			      bip->bli_format.blf_map_size, 0) == 0) {
+		xfs_buf_item_relse(bp);
+	} else if (hold) {
+		bip->bli_flags &= ~XFS_BLI_HOLD;
+	}
+
+	/*
+	 * Release the buffer if XFS_BLI_HOLD was not set.
+	 */
+	if (!hold) {
+		xfs_buf_relse(bp);
+	}
+}
+
+/*
+ * This is called to find out where the oldest active copy of the
+ * buf log item in the on disk log resides now that the last log
+ * write of it completed at the given lsn.
+ * We always re-log all the dirty data in a buffer, so usually the
+ * latest copy in the on disk log is the only one that matters.	 For
+ * those cases we simply return the given lsn.
+ *
+ * The one exception to this is for buffers full of newly allocated
+ * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
+ * flag set, indicating that only the di_next_unlinked fields from the
+ * inodes in the buffers will be replayed during recovery.  If the
+ * original newly allocated inode images have not yet been flushed
+ * when the buffer is so relogged, then we need to make sure that we
+ * keep the old images in the 'active' portion of the log.  We do this
+ * by returning the original lsn of that transaction here rather than
+ * the current one.
+ */
+xfs_lsn_t
+xfs_buf_item_committed(
+	xfs_buf_log_item_t	*bip,
+	xfs_lsn_t		lsn)
+{
+	xfs_buf_item_trace("COMMITTED", bip);
+	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+	    (bip->bli_item.li_lsn != 0)) {
+		return bip->bli_item.li_lsn;
+	}
+	return (lsn);
+}
+
+/*
+ * This is called when the transaction holding the buffer is aborted.
+ * Just behave as if the transaction had been cancelled. If we're shutting down
+ * and have aborted this transaction, we'll trap this buffer when it tries to
+ * get written out.
+ */
+void
+xfs_buf_item_abort(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t	*bp;
+
+	bp = bip->bli_buf;
+	xfs_buftrace("XFS_ABORT", bp);
+	XFS_BUF_SUPER_STALE(bp);
+	xfs_buf_item_unlock(bip);
+	return;
+}
+
+/*
+ * This is called to asynchronously write the buffer associated with this
+ * buf log item out to disk. The buffer will already have been locked by
+ * a successful call to xfs_buf_item_trylock().	 If the buffer still has
+ * B_DELWRI set, then get it going out to disk with a call to bawrite().
+ * If not, then just release the buffer.
+ */
+void
+xfs_buf_item_push(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t	*bp;
+
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	xfs_buf_item_trace("PUSH", bip);
+
+	bp = bip->bli_buf;
+
+	if (XFS_BUF_ISDELAYWRITE(bp)) {
+		xfs_bawrite(bip->bli_item.li_mountp, bp);
+	} else {
+		xfs_buf_relse(bp);
+	}
+}
+
+/* ARGSUSED */
+void
+xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+struct xfs_item_ops xfs_buf_item_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_buf_item_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
+					xfs_buf_item_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_buf_item_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_buf_item_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_buf_item_abort,
+	.iop_pushbuf	= NULL,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_buf_item_committing
+};
+
+
+/*
+ * Allocate a new buf log item to go with the given buffer.
+ * Set the buffer's b_fsprivate field to point to the new
+ * buf log item.  If there are other items attached to the
+ * buffer (see xfs_buf_attach_iodone() below), then put the
+ * buf log item at the front.
+ */
+void
+xfs_buf_item_init(
+	xfs_buf_t	*bp,
+	xfs_mount_t	*mp)
+{
+	xfs_log_item_t		*lip;
+	xfs_buf_log_item_t	*bip;
+	int			chunks;
+	int			map_size;
+
+	/*
+	 * Check to see if there is already a buf log item for
+	 * this buffer.	 If there is, it is guaranteed to be
+	 * the first.  If we do already have one, there is
+	 * nothing to do here so return.
+	 */
+	if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp)
+		XFS_BUF_SET_FSPRIVATE3(bp, mp);
+	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
+	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+		if (lip->li_type == XFS_LI_BUF) {
+			return;
+		}
+	}
+
+	/*
+	 * chunks is the number of XFS_BLI_CHUNK size pieces
+	 * the buffer can be divided into. Make sure not to
+	 * truncate any pieces.	 map_size is the size of the
+	 * bitmap needed to describe the chunks of the buffer.
+	 */
+	chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
+	map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
+
+	bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
+						    KM_SLEEP);
+	bip->bli_item.li_type = XFS_LI_BUF;
+	bip->bli_item.li_ops = &xfs_buf_item_ops;
+	bip->bli_item.li_mountp = mp;
+	bip->bli_buf = bp;
+	bip->bli_format.blf_type = XFS_LI_BUF;
+	bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
+	bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
+	bip->bli_format.blf_map_size = map_size;
+#ifdef XFS_BLI_TRACE
+	bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP);
+#endif
+
+#ifdef XFS_TRANS_DEBUG
+	/*
+	 * Allocate the arrays for tracking what needs to be logged
+	 * and what our callers request to be logged.  bli_orig
+	 * holds a copy of the original, clean buffer for comparison
+	 * against, and bli_logged keeps a 1 bit flag per byte in
+	 * the buffer to indicate which bytes the callers have asked
+	 * to have logged.
+	 */
+	bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
+	bcopy(XFS_BUF_PTR(bp), bip->bli_orig, XFS_BUF_COUNT(bp));
+	bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
+#endif
+
+	/*
+	 * Put the buf item into the list of items attached to the
+	 * buffer at the front.
+	 */
+	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+		bip->bli_item.li_bio_list =
+				XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+	}
+	XFS_BUF_SET_FSPRIVATE(bp, bip);
+}
+
+
+/*
+ * Mark bytes first through last inclusive as dirty in the buf
+ * item's bitmap.
+ */
+void
+xfs_buf_item_log(
+	xfs_buf_log_item_t	*bip,
+	uint			first,
+	uint			last)
+{
+	uint		first_bit;
+	uint		last_bit;
+	uint		bits_to_set;
+	uint		bits_set;
+	uint		word_num;
+	uint		*wordp;
+	uint		bit;
+	uint		end_bit;
+	uint		mask;
+
+	/*
+	 * Mark the item as having some dirty data for
+	 * quick reference in xfs_buf_item_dirty.
+	 */
+	bip->bli_flags |= XFS_BLI_DIRTY;
+
+	/*
+	 * Convert byte offsets to bit numbers.
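+	 * e.g. with XFS_BLI_CHUNK == 128, logging bytes 100..300
+	 * dirties chunk bits 0..2 (100 >> 7 == 0, 300 >> 7 == 2).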
+	 */
+	first_bit = first >> XFS_BLI_SHIFT;
+	last_bit = last >> XFS_BLI_SHIFT;
+
+	/*
+	 * Calculate the total number of bits to be set.
+	 */
+	bits_to_set = last_bit - first_bit + 1;
+
+	/*
+	 * Get a pointer to the first word in the bitmap
+	 * to set a bit in.
+	 */
+	word_num = first_bit >> BIT_TO_WORD_SHIFT;
+	wordp = &(bip->bli_format.blf_data_map[word_num]);
+
+	/*
+	 * Calculate the starting bit in the first word.
+	 */
+	bit = first_bit & (uint)(NBWORD - 1);
+
+	/*
+	 * First set any bits in the first word of our range.
+	 * If it starts at bit 0 of the word, it will be
+	 * set below rather than here.	That is what the variable
+	 * bit tells us. The variable bits_set tracks the number
+	 * of bits that have been set so far.  End_bit is the number
+	 * of the last bit to be set in this word plus one.
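+	 * For example, bit == 3 with bits_to_set == 4 gives end_bit
+	 * == 7 and mask == 0x78, i.e. bits 3..6 of this word.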
+	 */
+	if (bit) {
+		end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
+		mask = ((1 << (end_bit - bit)) - 1) << bit;
+		*wordp |= mask;
+		wordp++;
+		bits_set = end_bit - bit;
+	} else {
+		bits_set = 0;
+	}
+
+	/*
+	 * Now set bits a whole word at a time that are between
+	 * first_bit and last_bit.
+	 */
+	while ((bits_to_set - bits_set) >= NBWORD) {
+		*wordp |= 0xffffffff;
+		bits_set += NBWORD;
+		wordp++;
+	}
+
+	/*
+	 * Finally, set any bits left to be set in one last partial word.
+	 */
+	end_bit = bits_to_set - bits_set;
+	if (end_bit) {
+		mask = (1 << end_bit) - 1;
+		*wordp |= mask;
+	}
+
+	xfs_buf_item_log_debug(bip, first, last);
+}
+
+
+/*
+ * Return 1 if the buffer has some data that has been logged (at any
+ * point, not just the current transaction) and 0 if not.
+ */
+uint
+xfs_buf_item_dirty(
+	xfs_buf_log_item_t	*bip)
+{
+	return (bip->bli_flags & XFS_BLI_DIRTY);
+}
+
+/*
+ * This is called when the buf log item is no longer needed.  It should
+ * free the buf log item associated with the given buffer and clear
+ * the buffer's pointer to the buf log item.  If there are no more
+ * items in the list, clear the b_iodone field of the buffer (see
+ * xfs_buf_attach_iodone() below).
+ */
+void
+xfs_buf_item_relse(
+	xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	xfs_buftrace("XFS_RELSE", bp);
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+	XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
+	if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
+	    (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
+/**
+		ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0);
+**/
+		XFS_BUF_CLR_IODONE_FUNC(bp);
+	}
+
+#ifdef XFS_TRANS_DEBUG
+	kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
+	bip->bli_orig = NULL;
+	kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
+	bip->bli_logged = NULL;
+#endif /* XFS_TRANS_DEBUG */
+
+#ifdef XFS_BLI_TRACE
+	ktrace_free(bip->bli_trace);
+#endif
+	kmem_zone_free(xfs_buf_item_zone, bip);
+}
+
+
+/*
+ * Add the given log item with its callback to the list of callbacks
+ * to be called when the buffer's I/O completes.  If it is not set
+ * already, set the buffer's b_iodone() routine to be
+ * xfs_buf_iodone_callbacks() and link the log item into the list of
+ * items rooted at b_fsprivate.	 Items are always added as the second
+ * entry in the list if there is a first, because the buf item code
+ * assumes that the buf log item is first.
+ */
+void
+xfs_buf_attach_iodone(
+	xfs_buf_t	*bp,
+	void		(*cb)(xfs_buf_t *, xfs_log_item_t *),
+	xfs_log_item_t	*lip)
+{
+	xfs_log_item_t	*head_lip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+
+	lip->li_cb = cb;
+	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+		head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+		lip->li_bio_list = head_lip->li_bio_list;
+		head_lip->li_bio_list = lip;
+	} else {
+		XFS_BUF_SET_FSPRIVATE(bp, lip);
+	}
+
+	ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) ||
+	       (XFS_BUF_IODONE_FUNC(bp) == NULL));
+	XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
+}
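+
+/*
+ * The resulting b_fsprivate list is ordered (sketch):
+ *
+ *	buf log item (if any) -> newest attached item -> ... -> oldest
+ *
+ * because new items are always linked in just behind the head.
+ */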
+
+STATIC void
+xfs_buf_do_callbacks(
+	xfs_buf_t	*bp,
+	xfs_log_item_t	*lip)
+{
+	xfs_log_item_t	*nlip;
+
+	while (lip != NULL) {
+		nlip = lip->li_bio_list;
+		ASSERT(lip->li_cb != NULL);
+		/*
+		 * Clear the next pointer so we don't have any
+		 * confusion if the item is added to another buf.
+		 * Don't touch the log item after calling its
+		 * callback, because it could have freed itself.
+		 */
+		lip->li_bio_list = NULL;
+		lip->li_cb(bp, lip);
+		lip = nlip;
+	}
+}
+
+/*
+ * This is the iodone() function for buffers which have had callbacks
+ * attached to them by xfs_buf_attach_iodone().	 It should remove each
+ * log item from the buffer's list and call the callback of each in turn.
+ * When done, the buffer's fsprivate field is set to NULL and the buffer
+ * is unlocked with a call to iodone().
+ */
+void
+xfs_buf_iodone_callbacks(
+	xfs_buf_t	*bp)
+{
+	xfs_log_item_t	*lip;
+	static time_t	lasttime;
+	static dev_t	lastdev;
+	xfs_mount_t	*mp;
+
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+
+	if (XFS_BUF_GETERROR(bp) != 0) {
+		/*
+		 * If we've already decided to shutdown the filesystem
+		 * because of IO errors, there's no point in giving this
+		 * a retry.
+		 */
+		mp = lip->li_mountp;
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			ASSERT(XFS_BUF_TARGET_DEV(bp) == mp->m_dev);
+			XFS_BUF_SUPER_STALE(bp);
+			xfs_buftrace("BUF_IODONE_CB", bp);
+			xfs_buf_do_callbacks(bp, lip);
+			XFS_BUF_SET_FSPRIVATE(bp, NULL);
+			XFS_BUF_CLR_IODONE_FUNC(bp);
+
+			/*
+			 * The XFS_SHUT flag gets set when we go
+			 * through the entire buffer cache and
+			 * deliberately start throwing away delayed
+			 * write buffers.  Since there's no biowait
+			 * done on those, we should just brelse them.
+			 */
+			if (XFS_BUF_ISSHUT(bp)) {
+				XFS_BUF_UNSHUT(bp);
+				xfs_buf_relse(bp);
+			} else {
+				xfs_biodone(bp);
+			}
+
+			return;
+		}
+
+		if ((XFS_BUF_TARGET_DEV(bp) != lastdev) ||
+		    ((lbolt - lasttime) > 500)) {
+			prdev("XFS write error in file system meta-data "
+			      "block 0x%Lx in %s",
+			      XFS_BUF_TARGET_DEV(bp),
+			      XFS_BUF_ADDR(bp), mp->m_fsname);
+			lasttime = lbolt;
+		}
+		lastdev = XFS_BUF_TARGET_DEV(bp);
+
+		if (XFS_BUF_ISASYNC(bp)) {
+			/*
+			 * If the write was asynchronous then no one
+			 * will be looking for the error.  Clear the
+			 * error state and queue the buffer for another
+			 * delayed write.
+			 *
+			 * XXXsup This is OK, so long as we catch these
+			 * before we start the umount; we don't want these
+			 * DELWRI metadata bufs to be hanging around.
+			 */
+			XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
+
+			if (!(XFS_BUF_ISSTALE(bp))) {
+				XFS_BUF_DELAYWRITE(bp);
+				XFS_BUF_DONE(bp);
+				XFS_BUF_SET_START(bp);
+			}
+			ASSERT(XFS_BUF_IODONE_FUNC(bp));
+			xfs_buftrace("BUF_IODONE ASYNC", bp);
+			xfs_buf_relse(bp);
+		} else {
+			/*
+			 * If the write of the buffer was not asynchronous,
+			 * then we want to make sure to return the error
+			 * to the caller of bwrite().  Because of this we
+			 * cannot clear the B_ERROR state at this point.
+			 * Instead we install a callback function that
+			 * will be called when the buffer is released, and
+			 * that routine will clear the error state and
+			 * set the buffer to be written out again after
+			 * some delay.
+			 */
+			/*
+			 * We may actually overwrite an existing b_relse
+			 * function here, but we're going to be shutting
+			 * down anyway.
+			 */
+			XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
+			XFS_BUF_DONE(bp);
+			XFS_BUF_V_IODONESEMA(bp);
+		}
+		return;
+	}
+#ifdef XFSERRORDEBUG
+	xfs_buftrace("XFS BUFCB NOERR", bp);
+#endif
+	xfs_buf_do_callbacks(bp, lip);
+	XFS_BUF_SET_FSPRIVATE(bp, NULL);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	xfs_biodone(bp);
+}
+
+/*
+ * This is a callback routine attached to a buffer which gets an error
+ * when being written out synchronously.
+ */
+STATIC void
+xfs_buf_error_relse(
+	xfs_buf_t	*bp)
+{
+	xfs_log_item_t	*lip;
+	xfs_mount_t	*mp;
+
+	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+	mp = (xfs_mount_t *)lip->li_mountp;
+	ASSERT(XFS_BUF_TARGET_DEV(bp) == mp->m_dev);
+
+	XFS_BUF_STALE(bp);
+	XFS_BUF_DONE(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_ERROR(bp,0);
+	xfs_buftrace("BUF_ERROR_RELSE", bp);
+	if (! XFS_FORCED_SHUTDOWN(mp))
+		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+	/*
+	 * We have to unpin the pinned buffers so do the
+	 * callbacks.
+	 */
+	xfs_buf_do_callbacks(bp, lip);
+	XFS_BUF_SET_FSPRIVATE(bp, NULL);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
+	xfs_buf_relse(bp);
+}
+
+
+/*
+ * This is the iodone() function for buffers which have been
+ * logged.  It is called when they are eventually flushed out.
+ * It should remove the buf item from the AIL, and free the buf item.
+ * It is called by xfs_buf_iodone_callbacks() above which will take
+ * care of cleaning up the buffer itself.
+ */
+/* ARGSUSED */
+void
+xfs_buf_iodone(
+	xfs_buf_t		*bp,
+	xfs_buf_log_item_t	*bip)
+{
+	struct xfs_mount	*mp;
+	SPLDECL(s);
+
+	ASSERT(bip->bli_buf == bp);
+
+	mp = bip->bli_item.li_mountp;
+
+	/*
+	 * If we are forcibly shutting down, this may well be
+	 * off the AIL already. That's because we simulate the
+	 * log-committed callbacks to unpin these buffers. Or we may
+	 * never have put this item on the AIL because the transaction
+	 * was forcibly aborted. xfs_trans_delete_ail() takes care of
+	 * these cases.
+	 *
+	 * Either way, AIL is useless if we're forcing a shutdown.
+	 */
+	AIL_LOCK(mp,s);
+	/*
+	 * xfs_trans_delete_ail() drops the AIL lock.
+	 */
+	xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
+
+#ifdef XFS_TRANS_DEBUG
+	kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
+	bip->bli_orig = NULL;
+	kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
+	bip->bli_logged = NULL;
+#endif /* XFS_TRANS_DEBUG */
+
+#ifdef XFS_BLI_TRACE
+	ktrace_free(bip->bli_trace);
+#endif
+	kmem_zone_free(xfs_buf_item_zone, bip);
+}
+
+#if defined(XFS_BLI_TRACE)
+void
+xfs_buf_item_trace(
+	char			*id,
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t		*bp;
+	ASSERT(bip->bli_trace != NULL);
+
+	bp = bip->bli_buf;
+	ktrace_enter(bip->bli_trace,
+		     (void *)id,
+		     (void *)bip->bli_buf,
+		     (void *)((unsigned long)bip->bli_flags),
+		     (void *)((unsigned long)bip->bli_recur),
+		     (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
+		     (void *)XFS_BUF_ADDR(bp),
+		     (void *)((unsigned long)XFS_BUF_COUNT(bp)),
+		     (void *)((unsigned long)(0xFFFFFFFF & (XFS_BUF_BFLAGS(bp) >> 32))),
+		     (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_BFLAGS(bp))),
+		     XFS_BUF_FSPRIVATE(bp, void *),
+		     XFS_BUF_FSPRIVATE2(bp, void *),
+		     (void *)((unsigned long)atomic_read(&bp->pb_pin_count)),
+		     (void *)XFS_BUF_IODONE_FUNC(bp),
+		     (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
+		     (void *)bip->bli_item.li_desc,
+		     (void *)((unsigned long)bip->bli_item.li_flags));
+}
+#endif /* XFS_BLI_TRACE */
+
+
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_buf_item.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_buf_item.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_buf_item.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_buf_item.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BUF_ITEM_H__
+#define __XFS_BUF_ITEM_H__
+
+/*
+ * This is the structure used to lay out a buf log item in the
+ * log.	 The data map describes which 128 byte chunks of the buffer
+ * have been logged.  This structure works only on buffers that
+ * reside within the first terabyte of the filesystem.  These buffers
+ * are generated only by pre-6.2 systems and are known as XFS_LI_6_1_BUF.
+ */
+typedef struct xfs_buf_log_format_v1 {
+	unsigned short	blf_type;	/* buf log item type indicator */
+	unsigned short	blf_size;	/* size of this item */
+	__int32_t	blf_blkno;	/* starting blkno of this buf */
+	ushort		blf_flags;	/* misc state */
+	ushort		blf_len;	/* number of blocks in this buf */
+	unsigned int	blf_map_size;	/* size of data bitmap in words */
+	unsigned int	blf_data_map[1];/* variable size bitmap of */
+					/*   regions of buffer in this item */
+} xfs_buf_log_format_v1_t;
+
+/*
+ * This is a form of the above structure with a 64 bit blkno field.
+ * For 6.2 and beyond, this is XFS_LI_BUF.  We use this to log everything.
+ */
+typedef struct xfs_buf_log_format_t {
+	unsigned short	blf_type;	/* buf log item type indicator */
+	unsigned short	blf_size;	/* size of this item */
+	ushort		blf_flags;	/* misc state */
+	ushort		blf_len;	/* number of blocks in this buf */
+	__int64_t	blf_blkno;	/* starting blkno of this buf */
+	unsigned int	blf_map_size;	/* size of data bitmap in words */
+	unsigned int	blf_data_map[1];/* variable size bitmap of */
+					/*   regions of buffer in this item */
+} xfs_buf_log_format_t;
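+
+/*
+ * blf_data_map is allocated with blf_map_size words: e.g. an 8K
+ * buffer in 128-byte chunks needs 64 map bits, i.e. two 32-bit
+ * words -- one more than the single word declared above.
+ */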
+
+/*
+ * This flag indicates that the buffer contains on disk inodes
+ * and requires special recovery handling.
+ */
+#define XFS_BLI_INODE_BUF	0x1
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define XFS_BLI_CANCEL		0x2
+/*
+ * This flag indicates that the buffer contains on disk
+ * user or group dquots and may require special recovery handling.
+ */
+#define XFS_BLI_UDQUOT_BUF	0x4
+/* #define XFS_BLI_PDQUOT_BUF	0x8 */
+#define XFS_BLI_GDQUOT_BUF	0x10
+
+#define XFS_BLI_CHUNK		128
+#define XFS_BLI_SHIFT		7
+#define BIT_TO_WORD_SHIFT	5
+#define NBWORD			(NBBY * sizeof(unsigned int))
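+
+/*
+ * XFS_BLI_SHIFT is log2(XFS_BLI_CHUNK) and BIT_TO_WORD_SHIFT is
+ * log2(NBWORD): with 32-bit words, chunk n lives in map word
+ * (n >> 5), bit (n & 31).
+ */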
+
+/*
+ * buf log item flags
+ */
+#define XFS_BLI_HOLD		0x01
+#define XFS_BLI_DIRTY		0x02
+#define XFS_BLI_STALE		0x04
+#define XFS_BLI_LOGGED		0x08
+#define XFS_BLI_INODE_ALLOC_BUF 0x10
+
+
+#ifdef __KERNEL__
+
+struct xfs_buf;
+struct ktrace;
+struct xfs_mount;
+
+/*
+ * This is the in core log item structure used to track information
+ * needed to log buffers.  It tracks the buffer lock recursion count
+ * and which 128 byte chunks of the buffer are dirty.
+ */
+typedef struct xfs_buf_log_item {
+	xfs_log_item_t		bli_item;	/* common item structure */
+	struct xfs_buf		*bli_buf;	/* real buffer pointer */
+	unsigned int		bli_flags;	/* misc flags */
+	unsigned int		bli_recur;	/* lock recursion count */
+	atomic_t		bli_refcount;	/* cnt of tp refs */
+#ifdef DEBUG
+	struct ktrace		*bli_trace;	/* event trace buf */
+#endif
+#ifdef XFS_TRANS_DEBUG
+	char			*bli_orig;	/* original buffer copy */
+	char			*bli_logged;	/* bytes logged (bitmap) */
+#endif
+	xfs_buf_log_format_t	bli_format;	/* in-log header */
+} xfs_buf_log_item_t;
+
+/*
+ * This structure is used during recovery to record the buf log
+ * items which have been canceled and should not be replayed.
+ */
+typedef struct xfs_buf_cancel {
+	xfs_daddr_t		bc_blkno;
+	uint			bc_len;
+	int			bc_refcount;
+	struct xfs_buf_cancel	*bc_next;
+} xfs_buf_cancel_t;
+
+#define XFS_BLI_TRACE_SIZE	32
+
+
+#if defined(XFS_ALL_TRACE)
+#define XFS_BLI_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_BLI_TRACE
+#endif
+
+#if defined(XFS_BLI_TRACE)
+void	xfs_buf_item_trace(char *, xfs_buf_log_item_t *);
+#else
+#define xfs_buf_item_trace(id, bip)
+#endif
+
+void	xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+void	xfs_buf_item_relse(struct xfs_buf *);
+void	xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
+uint	xfs_buf_item_dirty(xfs_buf_log_item_t *);
+void	xfs_buf_attach_iodone(struct xfs_buf *,
+			      void(*)(struct xfs_buf *, xfs_log_item_t *),
+			      xfs_log_item_t *);
+void	xfs_buf_iodone_callbacks(struct xfs_buf *);
+void	xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *);
+
+#ifdef XFS_TRANS_DEBUG
+void
+xfs_buf_item_flush_log_debug(
+	struct xfs_buf *bp,
+	uint	first,
+	uint	last);
+#else
+#define xfs_buf_item_flush_log_debug(bp, first, last)
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_BUF_ITEM_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_cap.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_cap.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_cap.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_cap.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+STATIC int xfs_cap_allow_set(vnode_t *);
+
+
+/*
+ * Test for existence of capability attribute as efficiently as possible.
+ */
+int
+xfs_cap_vhascap(
+	vnode_t		*vp)
+{
+	int		error;
+	int		len = sizeof(xfs_cap_set_t);
+	int		flags = ATTR_KERNOVAL|ATTR_ROOT;
+
+	VOP_ATTR_GET(vp, SGI_CAP_LINUX, NULL, &len, flags, sys_cred, error);
+	return (error == 0);
+}
+
+/*
+ * Convert from extended attribute representation to in-memory for XFS.
+ */
+STATIC int
+posix_cap_xattr_to_xfs(
+	posix_cap_xattr		*src,
+	size_t			size,
+	xfs_cap_set_t		*dest)
+{
+	if (!src || !dest)
+		return EINVAL;
+
+	if (size < sizeof(posix_cap_xattr))
+		return EINVAL;
+
+	if (src->c_version != cpu_to_le32(POSIX_CAP_XATTR_VERSION))
+		return EINVAL;
+	if (src->c_abiversion != cpu_to_le32(_LINUX_CAPABILITY_VERSION))
+		return EINVAL;
+
+	ASSERT(sizeof(dest->cap_effective) == sizeof(src->c_effective));
+
+	dest->cap_effective	= src->c_effective;
+	dest->cap_permitted	= src->c_permitted;
+	dest->cap_inheritable	= src->c_inheritable;
+
+	return 0;
+}
+
+/*
+ * Convert from in-memory XFS to extended attribute representation.
+ */
+STATIC int
+posix_cap_xfs_to_xattr(
+	xfs_cap_set_t		*src,
+	posix_cap_xattr		*xattr_cap,
+	size_t			size)
+{
+	size_t			new_size = posix_cap_xattr_size();
+
+	if (size < new_size)
+		return -ERANGE;
+
+	ASSERT(sizeof(xattr_cap->c_effective) == sizeof(src->cap_effective));
+
+	xattr_cap->c_version	= cpu_to_le32(POSIX_CAP_XATTR_VERSION);
+	xattr_cap->c_abiversion = cpu_to_le32(_LINUX_CAPABILITY_VERSION);
+	xattr_cap->c_effective	= src->cap_effective;
+	xattr_cap->c_permitted	= src->cap_permitted;
+	xattr_cap->c_inheritable = src->cap_inheritable;
+
+	return new_size;
+}
+
+int
+xfs_cap_vget(
+	vnode_t		*vp,
+	void		*cap,
+	size_t		size)
+{
+	int		error;
+	int		len = sizeof(xfs_cap_set_t);
+	int		flags = ATTR_ROOT;
+	xfs_cap_set_t	xfs_cap = { 0 };
+	posix_cap_xattr *xattr_cap = cap;
+
+	VN_HOLD(vp);
+	if ((error = _MAC_VACCESS(vp, NULL, VREAD)))
+		goto out;
+
+	if (!size)
+		flags |= ATTR_KERNOVAL;
+	VOP_ATTR_GET(vp, SGI_CAP_LINUX, (char *)&xfs_cap,
+			&len, flags, sys_cred, error);
+	if (error)
+		goto out;
+	ASSERT(len == sizeof(xfs_cap_set_t));
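+
+	/*
+	 * Editor's note: a zero size means the caller is only asking how
+	 * large a buffer to allocate, so report the xattr size; otherwise
+	 * convert the capability set into the caller's buffer.
+	 */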
+
+	error = (size)? -posix_cap_xfs_to_xattr(&xfs_cap, xattr_cap, size) :
+			-posix_cap_xattr_size();
+out:
+	VN_RELE(vp);
+	return -error;
+}
+
+int
+xfs_cap_vremove(
+	vnode_t		*vp)
+{
+	int		error;
+
+	VN_HOLD(vp);
+	error = xfs_cap_allow_set(vp);
+	if (!error) {
+		VOP_ATTR_REMOVE(vp, SGI_CAP_LINUX, ATTR_ROOT, sys_cred, error);
+		if (error == ENOATTR)
+			error = 0;	/* 'scool */
+	}
+	VN_RELE(vp);
+	return -error;
+}
+
+int
+xfs_cap_vset(
+	vnode_t			*vp,
+	void			*cap,
+	size_t			size)
+{
+	posix_cap_xattr		*xattr_cap = cap;
+	xfs_cap_set_t		xfs_cap;
+	int			error;
+
+	if (!cap)
+		return -EINVAL;
+
+	error = posix_cap_xattr_to_xfs(xattr_cap, size, &xfs_cap);
+	if (error)
+		return -error;
+
+	VN_HOLD(vp);
+	error = xfs_cap_allow_set(vp);
+	if (error)
+		goto out;
+
+	VOP_ATTR_SET(vp, SGI_CAP_LINUX, (char *)&xfs_cap,
+			sizeof(xfs_cap_set_t), ATTR_ROOT, sys_cred, error);
+out:
+	VN_RELE(vp);
+	return -error;
+}
+
+STATIC int
+xfs_cap_allow_set(
+	vnode_t		*vp)
+{
+	vattr_t		va;
+	int		error;
+
+	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
+		return EROFS;
+	if ((error = _MAC_VACCESS(vp, NULL, VWRITE)))
+		return error;
+	va.va_mask = AT_UID;
+	VOP_GETATTR(vp, &va, 0, NULL, error);
+	if (error)
+		return error;
+	if (va.va_uid != current->fsuid && !capable(CAP_FOWNER))
+		return EPERM;
+	return error;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_cap.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_cap.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_cap.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_cap.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_CAP_H__
+#define __XFS_CAP_H__
+
+/*
+ * Capabilities
+ */
+typedef __uint64_t xfs_cap_value_t;
+
+typedef struct xfs_cap_set {
+	xfs_cap_value_t cap_effective;	/* use in capability checks */
+	xfs_cap_value_t cap_permitted;	/* combined with file attrs */
+	xfs_cap_value_t cap_inheritable;/* pass through exec */
+} xfs_cap_set_t;
+
+/* On-disk XFS extended attribute names */
+#define SGI_CAP_FILE	"SGI_CAP_FILE"
+#define SGI_CAP_FILE_SIZE	(sizeof(SGI_CAP_FILE)-1)
+#define SGI_CAP_LINUX	"SGI_CAP_LINUX"
+#define SGI_CAP_LINUX_SIZE	(sizeof(SGI_CAP_LINUX)-1)
+
+/*
+ * For Linux, we take the bitfields directly from capability.h
+ * and no longer attempt to keep this attribute on-disk compatible
+ * with IRIX.  Since this attribute is only set on executables,
+ * it just doesn't make much sense to try.  We do use a different
+ * named attribute though, to avoid confusion.
+ */
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_FS_POSIX_CAP
+
+#include <linux/posix_cap_xattr.h>
+
+struct vnode;
+
+extern int xfs_cap_vhascap(struct vnode *);
+extern int xfs_cap_vset(struct vnode *, void *, size_t);
+extern int xfs_cap_vget(struct vnode *, void *, size_t);
+extern int xfs_cap_vremove(struct vnode *vp);
+
+#define _CAP_EXISTS		xfs_cap_vhascap
+
+#else
+#define xfs_cap_vset(v,p,sz)	(-EOPNOTSUPP)
+#define xfs_cap_vget(v,p,sz)	(-EOPNOTSUPP)
+#define xfs_cap_vremove(v)	(-EOPNOTSUPP)
+#define _CAP_EXISTS		(NULL)
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_CAP_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_clnt.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_clnt.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_clnt.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_clnt.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_CLNT_H__
+#define __XFS_CLNT_H__
+
+/*
+ * XFS arguments structure, constructed from the arguments we
+ * are passed via the mount system call.
+ *
+ * NOTE: The mount system call is handled differently between
+ * Linux and IRIX.  In IRIX we worked with a binary data
+ * structure coming in across the syscall interface from user
+ * space (the mount userspace knows about each filesystem type
+ * and the set of valid options for it, and converts the user's
+ * argument string into a binary structure _before_ making the
+ * system call), with all the ABI issues that this implies.
+ *
+ * In Linux, we are passed a comma-separated set of options,
+ * i.e. a null-terminated string of characters.  Userspace mount
+ * code does not have any knowledge of mount options expected by
+ * each filesystem type and so each filesystem parses its mount
+ * options in kernel space.
+ *
+ * For the Linux port, we kept this structure pretty much intact
+ * and use it internally (because the existing code groks it).
+ */
+struct xfs_mount_args {
+	int	flags;		/* flags -> see XFSMNT_... macros below */
+	int	logbufs;	/* Number of log buffers, -1 to default */
+	int	logbufsize;	/* Size of log buffers, -1 to default */
+	char	fsname[MAXNAMELEN];	/* data device name */
+	char	rtname[MAXNAMELEN];	/* realtime device filename */
+	char	logname[MAXNAMELEN];	/* journal device filename */
+	char	mtpt[MAXNAMELEN];	/* filesystem mount point */
+	int	sunit;		/* stripe unit (BBs) */
+	int	swidth;		/* stripe width (BBs), multiple of sunit */
+	uchar_t iosizelog;	/* log2 of the preferred I/O size */
+
+	/*  The remainder is for CXFS support.	*/
+	char	**servlist;	/* Table of hosts which may be servers */
+	int	*servlistlen;	/* Table of hostname lengths. */
+	int	slcount;	/* Count of hosts which may be servers. */
+	int	stimeout;	/* Server timeout in milliseconds */
+	int	ctimeout;	/* Client timeout in milliseconds */
+	char	*server;	/* Designated server hostname (for remount). */
+	int	servlen;	/* Length of server hostname (for remount). */
+	int	servcell;	/* Server cell (internal testing only) */
+};
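+
+/*
+ * Editor's sketch (the option names here are illustrative, not
+ * authoritative; the Linux-side option parser defines the real
+ * set): a mount invocation such as
+ *
+ *	mount -t xfs -o logbufs=8,noalign,osyncisosync /dev/sda1 /mnt
+ *
+ * would be parsed in kernel space into roughly
+ *
+ *	args.logbufs  = 8;
+ *	args.flags   |= XFSMNT_NOALIGN | XFSMNT_OSYNCISOSYNC;
+ *
+ * with the remaining fields left at their defaults.
+ */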
+
+/*
+ * XFS mount option flags
+ */
+#define XFSMNT_CHKLOG		0x00000001	/* check log */
+#define XFSMNT_WSYNC		0x00000002	/* safe mode nfs mount
+						 * compatible */
+#define XFSMNT_INO64		0x00000004	/* move inode numbers up
+						 * past 2^32 */
+#define XFSMNT_UQUOTA		0x00000008	/* user quota accounting */
+#define XFSMNT_PQUOTA		0x00000010	/* IRIX prj quota accounting */
+#define XFSMNT_UQUOTAENF	0x00000020	/* user quota limit
+						 * enforcement */
+#define XFSMNT_PQUOTAENF	0x00000040	/* IRIX project quota limit
+						 * enforcement */
+#define XFSMNT_QUOTAMAYBE	0x00000080	/* don't turn off if SB
+						 * has quotas on */
+#define XFSMNT_NOATIME		0x00000100	/* don't modify access
+						 * times on reads */
+#define XFSMNT_NOALIGN		0x00000200	/* don't allocate at
+						 * stripe boundaries */
+#define XFSMNT_RETERR		0x00000400	/* return error to user */
+#define XFSMNT_NORECOVERY	0x00000800	/* no recovery, implies
+						 * read-only mount */
+#define XFSMNT_SHARED		0x00001000	/* shared XFS mount */
+#define XFSMNT_IOSIZE		0x00002000	/* optimize for I/O size */
+#define XFSMNT_OSYNCISOSYNC	0x00004000	/* o_sync is REALLY o_sync */
+						/* (osyncisdsync is now default) */
+#define XFSMNT_CLNTONLY		0x00008000	/* cxfs mount as client only */
+#define XFSMNT_UNSHARED		0x00010000	/* cxfs filesystem mounted
+						 * unshared */
+#define XFSMNT_CHGCLNTONLY	0x00020000	/* changing client only flag */
+						/* (for remount only) */
+#define XFSMNT_SERVCELL		0x00040000	/* setting server cell */
+						/* (allowed on remount) */
+#define XFSMNT_MAKESERVER	0x00080000	/* become the server (remount */
+						/* only) */
+#define XFSMNT_NOTSERVER	0x00100000	/* give up being the server */
+						/* (remount only) */
+#define XFSMNT_DMAPI		0x00200000	/* enable dmapi/xdsm */
+#define XFSMNT_GQUOTA		0x00400000	/* group quota accounting */
+#define XFSMNT_GQUOTAENF	0x00800000	/* group quota limit
+						 * enforcement */
+#define XFSMNT_NOUUID		0x01000000	/* Ignore fs uuid */
+#define XFSMNT_32BITINODES	0x02000000	/* restrict inodes to 32
+						 * bits of address space */
+#define XFSMNT_IRIXSGID		0x04000000	/* Irix-style sgid inheritance */
+#define XFSMNT_NOLOGFLUSH	0x08000000	/* Don't flush for log blocks */
+
+/* Did we get any args for CXFS to consume? */
+#define XFSARGS_FOR_CXFSARR(ap)		\
+	((ap)->servlist || (ap)->slcount >= 0 || \
+	 (ap)->stimeout >= 0 || (ap)->ctimeout >= 0 || \
+	 (ap)->flags & (XFSMNT_CLNTONLY | XFSMNT_UNSHARED))
+
+#endif	/* __XFS_CLNT_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_da_btree.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_da_btree.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_da_btree.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_da_btree.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,2579 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+#if defined(XFSDEBUG) && defined(CONFIG_KDB)
+#undef xfs_buftrace
+#define xfs_buftrace(A,B) \
+	do { \
+		printk("    xfs_buftrace : %s (0x%p)\n", A, B); \
+		BUG(); \
+	} while (0)
+#endif
+
+/*
+ * xfs_da_btree.c
+ *
+ * Routines to implement directories as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_da_root_split(xfs_da_state_t *state,
+					    xfs_da_state_blk_t *existing_root,
+					    xfs_da_state_blk_t *new_child);
+STATIC int xfs_da_node_split(xfs_da_state_t *state,
+					    xfs_da_state_blk_t *existing_blk,
+					    xfs_da_state_blk_t *split_blk,
+					    xfs_da_state_blk_t *blk_to_add,
+					    int treelevel,
+					    int *result);
+STATIC void xfs_da_node_rebalance(xfs_da_state_t *state,
+					 xfs_da_state_blk_t *node_blk_1,
+					 xfs_da_state_blk_t *node_blk_2);
+STATIC void xfs_da_node_add(xfs_da_state_t *state,
+				   xfs_da_state_blk_t *old_node_blk,
+				   xfs_da_state_blk_t *new_node_blk);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+STATIC int xfs_da_root_join(xfs_da_state_t *state,
+					   xfs_da_state_blk_t *root_blk);
+STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da_node_remove(xfs_da_state_t *state,
+					      xfs_da_state_blk_t *drop_blk);
+STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
+					 xfs_da_state_blk_t *src_node_blk,
+					 xfs_da_state_blk_t *dst_node_blk);
+
+/*
+ * Utility routines.
+ */
+STATIC uint	xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count);
+STATIC int	xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp);
+STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra);
+
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of an intermediate node.
+ */
+int
+xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
+				 xfs_dabuf_t **bpp, int whichfork)
+{
+	xfs_da_intnode_t *node;
+	xfs_dabuf_t *bp;
+	int error;
+	xfs_trans_t *tp;
+
+	tp = args->trans;
+	error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	node = bp->data;
+	INT_ZERO(node->hdr.info.forw, ARCH_CONVERT);
+	INT_ZERO(node->hdr.info.back, ARCH_CONVERT);
+	INT_SET(node->hdr.info.magic, ARCH_CONVERT, XFS_DA_NODE_MAGIC);
+	INT_ZERO(node->hdr.info.pad, ARCH_CONVERT);
+	INT_ZERO(node->hdr.count, ARCH_CONVERT);
+	INT_SET(node->hdr.level, ARCH_CONVERT, level);
+
+	xfs_da_log_buf(tp, bp,
+		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+
+	*bpp = bp;
+	return(0);
+}
+
+/*
+ * Split a leaf node, rebalance, then possibly split
+ * intermediate nodes, rebalance, etc.
+ */
+int							/* error */
+xfs_da_split(xfs_da_state_t *state)
+{
+	xfs_da_state_blk_t *oldblk, *newblk, *addblk;
+	xfs_da_intnode_t *node;
+	xfs_dabuf_t *bp;
+	int max, action, error, i;
+
+	/*
+	 * Walk back up the tree splitting/inserting/adjusting as necessary.
+	 * If we need to insert and there isn't room, split the node, then
+	 * decide which fragment to insert the new block from below into.
+	 * Note that we may split the root this way, but we need more fixup.
+	 */
+	max = state->path.active - 1;
+	ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
+	ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
+	       state->path.blk[max].magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+
+	addblk = &state->path.blk[max];		/* initial dummy value */
+	for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
+		oldblk = &state->path.blk[i];
+		newblk = &state->altpath.blk[i];
+
+		/*
+		 * If a leaf node then
+		 *     Allocate a new leaf node, then rebalance across them.
+		 * else if an intermediate node then
+		 *     We split on the last layer, must we split the node?
+		 */
+		switch (oldblk->magic) {
+		case XFS_ATTR_LEAF_MAGIC:
+#ifndef __KERNEL__
+			return(ENOTTY);
+#else
+			error = xfs_attr_leaf_split(state, oldblk, newblk);
+			if ((error != 0) && (error != ENOSPC)) {
+				return(error);	/* GROT: attr is inconsistent */
+			}
+			if (!error) {
+				addblk = newblk;
+				break;
+			}
+			/*
+			 * Entry wouldn't fit, split the leaf again.
+			 */
+			state->extravalid = 1;
+			if (state->inleaf) {
+				state->extraafter = 0;	/* before newblk */
+				error = xfs_attr_leaf_split(state, oldblk,
+							    &state->extrablk);
+			} else {
+				state->extraafter = 1;	/* after newblk */
+				error = xfs_attr_leaf_split(state, newblk,
+							    &state->extrablk);
+			}
+			if (error)
+				return(error);	/* GROT: attr inconsistent */
+			addblk = newblk;
+			break;
+#endif
+		case XFS_DIR_LEAF_MAGIC:
+			ASSERT(XFS_DIR_IS_V1(state->mp));
+			error = xfs_dir_leaf_split(state, oldblk, newblk);
+			if ((error != 0) && (error != ENOSPC)) {
+				return(error);	/* GROT: dir is inconsistent */
+			}
+			if (!error) {
+				addblk = newblk;
+				break;
+			}
+			/*
+			 * Entry wouldn't fit, split the leaf again.
+			 */
+			state->extravalid = 1;
+			if (state->inleaf) {
+				state->extraafter = 0;	/* before newblk */
+				error = xfs_dir_leaf_split(state, oldblk,
+							   &state->extrablk);
+				if (error)
+					return(error);	/* GROT: dir incon. */
+				addblk = newblk;
+			} else {
+				state->extraafter = 1;	/* after newblk */
+				error = xfs_dir_leaf_split(state, newblk,
+							   &state->extrablk);
+				if (error)
+					return(error);	/* GROT: dir incon. */
+				addblk = newblk;
+			}
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+			ASSERT(XFS_DIR_IS_V2(state->mp));
+			error = xfs_dir2_leafn_split(state, oldblk, newblk);
+			if (error)
+				return error;
+			addblk = newblk;
+			break;
+		case XFS_DA_NODE_MAGIC:
+			error = xfs_da_node_split(state, oldblk, newblk, addblk,
+							 max - i, &action);
+			xfs_da_buf_done(addblk->bp);
+			addblk->bp = NULL;
+			if (error)
+				return(error);	/* GROT: dir is inconsistent */
+			/*
+			 * Record the newly split block for the next time thru?
+			 */
+			if (action)
+				addblk = newblk;
+			else
+				addblk = NULL;
+			break;
+		}
+
+		/*
+		 * Update the btree to show the new hashval for this child.
+		 */
+		xfs_da_fixhashpath(state, &state->path);
+		/*
+		 * If we won't need this block again, it's getting dropped
+		 * from the active path by the loop control, so we need
+		 * to mark it done now.
+		 */
+		if (i > 0 || !addblk)
+			xfs_da_buf_done(oldblk->bp);
+	}
+	if (!addblk)
+		return(0);
+
+	/*
+	 * Split the root node.
+	 */
+	ASSERT(state->path.active == 0);
+	oldblk = &state->path.blk[0];
+	error = xfs_da_root_split(state, oldblk, addblk);
+	if (error) {
+		xfs_da_buf_done(oldblk->bp);
+		xfs_da_buf_done(addblk->bp);
+		addblk->bp = NULL;
+		return(error);	/* GROT: dir is inconsistent */
+	}
+
+	/*
+	 * Update pointers to the node which used to be block 0 and
+	 * just got bumped because of the addition of a new root node.
+	 * There might be three blocks involved if a double split occurred,
+	 * and the original block 0 could be at any position in the list.
+	 */
+
+	node = oldblk->bp->data;
+	if (!INT_ISZERO(node->hdr.info.forw, ARCH_CONVERT)) {
+		if (INT_GET(node->hdr.info.forw, ARCH_CONVERT) == addblk->blkno) {
+			bp = addblk->bp;
+		} else {
+			ASSERT(state->extravalid);
+			bp = state->extrablk.bp;
+		}
+		node = bp->data;
+		INT_SET(node->hdr.info.back, ARCH_CONVERT, oldblk->blkno);
+		xfs_da_log_buf(state->args->trans, bp,
+		    XFS_DA_LOGRANGE(node, &node->hdr.info,
+		    sizeof(node->hdr.info)));
+	}
+	node = oldblk->bp->data;
+	if (INT_GET(node->hdr.info.back, ARCH_CONVERT)) {
+		if (INT_GET(node->hdr.info.back, ARCH_CONVERT) == addblk->blkno) {
+			bp = addblk->bp;
+		} else {
+			ASSERT(state->extravalid);
+			bp = state->extrablk.bp;
+		}
+		node = bp->data;
+		INT_SET(node->hdr.info.forw, ARCH_CONVERT, oldblk->blkno);
+		xfs_da_log_buf(state->args->trans, bp,
+		    XFS_DA_LOGRANGE(node, &node->hdr.info,
+		    sizeof(node->hdr.info)));
+	}
+	xfs_da_buf_done(oldblk->bp);
+	xfs_da_buf_done(addblk->bp);
+	addblk->bp = NULL;
+	return(0);
+}
+
+/*
+ * Split the root.  We have to create a new root and point to the two
+ * parts (the split old root) that we just created.  Copy block zero to
+ * the EOF, extending the inode in the process.
+ */
+STATIC int						/* error */
+xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+				 xfs_da_state_blk_t *blk2)
+{
+	xfs_da_intnode_t *node, *oldroot;
+	xfs_da_args_t *args;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+	int error, size;
+	xfs_inode_t *dp;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+	xfs_dir2_leaf_t *leaf;
+
+	/*
+	 * Copy the existing (incorrect) block from the root node position
+	 * to a free space somewhere.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error)
+		return(error);
+	dp = args->dp;
+	tp = args->trans;
+	mp = state->mp;
+	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	node = bp->data;
+	oldroot = blk1->bp->data;
+	if (INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+		size = (int)((char *)&oldroot->btree[INT_GET(oldroot->hdr.count, ARCH_CONVERT)] -
+			     (char *)oldroot);
+	} else {
+		ASSERT(XFS_DIR_IS_V2(mp));
+		ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+		leaf = (xfs_dir2_leaf_t *)oldroot;
+		size = (int)((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] -
+			     (char *)leaf);
+	}
+	bcopy(oldroot, node, size);
+	xfs_da_log_buf(tp, bp, 0, size - 1);
+	xfs_da_buf_done(blk1->bp);
+	blk1->bp = bp;
+	blk1->blkno = blkno;
+
+	/*
+	 * Set up the new root node.
+	 */
+	error = xfs_da_node_create(args,
+		args->whichfork == XFS_DATA_FORK &&
+		XFS_DIR_IS_V2(mp) ? mp->m_dirleafblk : 0,
+		INT_GET(node->hdr.level, ARCH_CONVERT) + 1, &bp, args->whichfork);
+	if (error)
+		return(error);
+	node = bp->data;
+	INT_SET(node->btree[0].hashval, ARCH_CONVERT, blk1->hashval);
+	INT_SET(node->btree[0].before, ARCH_CONVERT, blk1->blkno);
+	INT_SET(node->btree[1].hashval, ARCH_CONVERT, blk2->hashval);
+	INT_SET(node->btree[1].before, ARCH_CONVERT, blk2->blkno);
+	INT_SET(node->hdr.count, ARCH_CONVERT, 2);
+	if (XFS_DIR_IS_V2(mp)) {
+		ASSERT(blk1->blkno >= mp->m_dirleafblk &&
+		       blk1->blkno < mp->m_dirfreeblk);
+		ASSERT(blk2->blkno >= mp->m_dirleafblk &&
+		       blk2->blkno < mp->m_dirfreeblk);
+	}
+	/* Header is already logged by xfs_da_node_create */
+	xfs_da_log_buf(tp, bp,
+		XFS_DA_LOGRANGE(node, node->btree,
+			sizeof(xfs_da_node_entry_t) * 2));
+	xfs_da_buf_done(bp);
+
+	return(0);
+}
+
+/*
+ * Split the node, rebalance, then add the new entry.
+ */
+STATIC int						/* error */
+xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+				 xfs_da_state_blk_t *newblk,
+				 xfs_da_state_blk_t *addblk,
+				 int treelevel, int *result)
+{
+	xfs_da_intnode_t *node;
+	xfs_dablk_t blkno;
+	int newcount, error;
+	int useextra;
+
+	node = oldblk->bp->data;
+	ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+
+	/*
+	 * With V2 the extra block is data or freespace.
+	 */
+	useextra = state->extravalid && XFS_DIR_IS_V1(state->mp);
+	newcount = 1 + useextra;
+	/*
+	 * Do we have to split the node?
+	 */
+	if ((INT_GET(node->hdr.count, ARCH_CONVERT) + newcount) > XFS_DA_NODE_ENTRIES(state->mp)) {
+		/*
+		 * Allocate a new node, add to the doubly linked chain of
+		 * nodes, then move some of our excess entries into it.
+		 */
+		error = xfs_da_grow_inode(state->args, &blkno);
+		if (error)
+			return(error);	/* GROT: dir is inconsistent */
+
+		error = xfs_da_node_create(state->args, blkno, treelevel,
+					   &newblk->bp, state->args->whichfork);
+		if (error)
+			return(error);	/* GROT: dir is inconsistent */
+		newblk->blkno = blkno;
+		newblk->magic = XFS_DA_NODE_MAGIC;
+		xfs_da_node_rebalance(state, oldblk, newblk);
+		error = xfs_da_blk_link(state, oldblk, newblk);
+		if (error)
+			return(error);
+		*result = 1;
+	} else {
+		*result = 0;
+	}
+
+	/*
+	 * Insert the new entry(s) into the correct block
+	 * (updating last hashval in the process).
+	 *
+	 * xfs_da_node_add() inserts BEFORE the given index,
+	 * and as a result of using node_lookup_int() we always
+	 * point to a valid entry (not after one), but a split
+	 * operation always results in a new block whose hashvals
+	 * FOLLOW the current block.
+	 *
+	 * If we had a double-split op below us, then add the extra block too.
+	 */
+	node = oldblk->bp->data;
+	if (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)) {
+		oldblk->index++;
+		xfs_da_node_add(state, oldblk, addblk);
+		if (useextra) {
+			if (state->extraafter)
+				oldblk->index++;
+			xfs_da_node_add(state, oldblk, &state->extrablk);
+			state->extravalid = 0;
+		}
+	} else {
+		newblk->index++;
+		xfs_da_node_add(state, newblk, addblk);
+		if (useextra) {
+			if (state->extraafter)
+				newblk->index++;
+			xfs_da_node_add(state, newblk, &state->extrablk);
+			state->extravalid = 0;
+		}
+	}
+
+	return(0);
+}
+
+/*
+ * Balance the btree elements between two intermediate nodes,
+ * usually one full and one empty.
+ *
+ * NOTE: if blk2 is empty, then it will get the upper half of blk1.
+ */
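+/*
+ * Editor's worked example: if node1 holds 100 entries and node2 is
+ * empty, count below is (100 - 0) / 2 = 50, so the upper 50 entries
+ * of node1 move into node2 and both nodes end up with 50 entries.
+ */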
+STATIC void
+xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+				     xfs_da_state_blk_t *blk2)
+{
+	xfs_da_intnode_t *node1, *node2, *tmpnode;
+	xfs_da_node_entry_t *btree_s, *btree_d;
+	int count, tmp;
+	xfs_trans_t *tp;
+
+	node1 = blk1->bp->data;
+	node2 = blk2->bp->data;
+	/*
+	 * Figure out how many entries need to move, and in which direction.
+	 * Swap the nodes around if that makes it simpler.
+	 */
+	if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) &&
+	    ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) ||
+	     (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
+	      INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
+		tmpnode = node1;
+		node1 = node2;
+		node2 = tmpnode;
+	}
+	ASSERT(INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	ASSERT(INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	count = (INT_GET(node1->hdr.count, ARCH_CONVERT) - INT_GET(node2->hdr.count, ARCH_CONVERT)) / 2;
+	if (count == 0)
+		return;
+	tp = state->args->trans;
+	/*
+	 * Two cases: high-to-low and low-to-high.
+	 */
+	if (count > 0) {
+		/*
+		 * Move elements in node2 up to make a hole.
+		 */
+		if ((tmp = INT_GET(node2->hdr.count, ARCH_CONVERT)) > 0) {
+			tmp *= (uint)sizeof(xfs_da_node_entry_t);
+			btree_s = &node2->btree[0];
+			btree_d = &node2->btree[count];
+			ovbcopy(btree_s, btree_d, tmp);
+		}
+
+		/*
+		 * Move the req'd B-tree elements from high in node1 to
+		 * low in node2.
+		 */
+		INT_MOD(node2->hdr.count, ARCH_CONVERT, count);
+		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT) - count];
+		btree_d = &node2->btree[0];
+		bcopy(btree_s, btree_d, tmp);
+		INT_MOD(node1->hdr.count, ARCH_CONVERT, -(count));
+
+	} else {
+		/*
+		 * Move the req'd B-tree elements from low in node2 to
+		 * high in node1.
+		 */
+		count = -count;
+		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &node2->btree[0];
+		btree_d = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT)];
+		bcopy(btree_s, btree_d, tmp);
+		INT_MOD(node1->hdr.count, ARCH_CONVERT, count);
+		xfs_da_log_buf(tp, blk1->bp,
+			XFS_DA_LOGRANGE(node1, btree_d, tmp));
+
+		/*
+		 * Move elements in node2 down to fill the hole.
+		 */
+		tmp  = INT_GET(node2->hdr.count, ARCH_CONVERT) - count;
+		tmp *= (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &node2->btree[count];
+		btree_d = &node2->btree[0];
+		ovbcopy(btree_s, btree_d, tmp);
+		INT_MOD(node2->hdr.count, ARCH_CONVERT, -(count));
+	}
+
+	/*
+	 * Log header of node 1 and all current bits of node 2.
+	 */
+	xfs_da_log_buf(tp, blk1->bp,
+		XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
+	xfs_da_log_buf(tp, blk2->bp,
+		XFS_DA_LOGRANGE(node2, &node2->hdr,
+			sizeof(node2->hdr) +
+			sizeof(node2->btree[0]) * INT_GET(node2->hdr.count, ARCH_CONVERT)));
+
+	/*
+	 * Record the last hashval from each block for upward propagation.
+	 * (note: don't use the swapped node pointers)
+	 */
+	node1 = blk1->bp->data;
+	node2 = blk2->bp->data;
+	blk1->hashval = INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+	blk2->hashval = INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+
+	/*
+	 * Adjust the expected index for insertion.
+	 */
+	if (blk1->index >= INT_GET(node1->hdr.count, ARCH_CONVERT)) {
+		blk2->index = blk1->index - INT_GET(node1->hdr.count, ARCH_CONVERT);
+		blk1->index = INT_GET(node1->hdr.count, ARCH_CONVERT) + 1;	/* make it invalid */
+	}
+}
+
+/*
+ * Add a new entry to an intermediate node.
+ */
+STATIC void
+xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+			       xfs_da_state_blk_t *newblk)
+{
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	int tmp;
+	xfs_mount_t *mp;
+
+	node = oldblk->bp->data;
+	mp = state->mp;
+	ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	ASSERT((oldblk->index >= 0) && (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)));
+	ASSERT(newblk->blkno != 0);
+	if (state->args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+		ASSERT(newblk->blkno >= mp->m_dirleafblk &&
+		       newblk->blkno < mp->m_dirfreeblk);
+
+	/*
+	 * We may need to make some room before we insert the new node.
+	 */
+	tmp = 0;
+	btree = &node->btree[ oldblk->index ];
+	if (oldblk->index < INT_GET(node->hdr.count, ARCH_CONVERT)) {
+		tmp = (INT_GET(node->hdr.count, ARCH_CONVERT) - oldblk->index) * (uint)sizeof(*btree);
+		ovbcopy(btree, btree + 1, tmp);
+	}
+	INT_SET(btree->hashval, ARCH_CONVERT, newblk->hashval);
+	INT_SET(btree->before, ARCH_CONVERT, newblk->blkno);
+	xfs_da_log_buf(state->args->trans, oldblk->bp,
+		XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
+	INT_MOD(node->hdr.count, ARCH_CONVERT, +1);
+	xfs_da_log_buf(state->args->trans, oldblk->bp,
+		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+
+	/*
+	 * Copy the last hash value from the oldblk to propagate upwards.
+	 */
+	oldblk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Deallocate an empty leaf node, remove it from its parent,
+ * possibly deallocating that block, etc...
+ */
+int
+xfs_da_join(xfs_da_state_t *state)
+{
+	xfs_da_state_blk_t *drop_blk, *save_blk;
+	int action, error;
+
+	action = 0;
+	drop_blk = &state->path.blk[ state->path.active-1 ];
+	save_blk = &state->altpath.blk[ state->path.active-1 ];
+	ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
+	ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
+	       drop_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+
+	/*
+	 * Walk back up the tree joining/deallocating as necessary.
+	 * When we stop dropping blocks, break out.
+	 */
+	for (  ; state->path.active >= 2; drop_blk--, save_blk--,
+		 state->path.active--) {
+		/*
+		 * See if we can combine the block with a neighbor.
+		 *   (action == 0) => no options, just leave
+		 *   (action == 1) => coalesce, then unlink
+		 *   (action == 2) => block empty, unlink it
+		 */
+		switch (drop_blk->magic) {
+		case XFS_ATTR_LEAF_MAGIC:
+#ifndef __KERNEL__
+			error = ENOTTY;
+#else
+			error = xfs_attr_leaf_toosmall(state, &action);
+#endif
+			if (error)
+				return(error);
+			if (action == 0)
+				return(0);
+#ifdef __KERNEL__
+			xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
+#endif
+			break;
+		case XFS_DIR_LEAF_MAGIC:
+			ASSERT(XFS_DIR_IS_V1(state->mp));
+			error = xfs_dir_leaf_toosmall(state, &action);
+			if (error)
+				return(error);
+			if (action == 0)
+				return(0);
+			xfs_dir_leaf_unbalance(state, drop_blk, save_blk);
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+			ASSERT(XFS_DIR_IS_V2(state->mp));
+			error = xfs_dir2_leafn_toosmall(state, &action);
+			if (error)
+				return error;
+			if (action == 0)
+				return 0;
+			xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
+			break;
+		case XFS_DA_NODE_MAGIC:
+			/*
+			 * Remove the offending node, fixup hashvals,
+			 * check for a toosmall neighbor.
+			 */
+			xfs_da_node_remove(state, drop_blk);
+			xfs_da_fixhashpath(state, &state->path);
+			error = xfs_da_node_toosmall(state, &action);
+			if (error)
+				return(error);
+			if (action == 0)
+				return 0;
+			xfs_da_node_unbalance(state, drop_blk, save_blk);
+			break;
+		}
+		xfs_da_fixhashpath(state, &state->altpath);
+		error = xfs_da_blk_unlink(state, drop_blk, save_blk);
+		xfs_da_state_kill_altpath(state);
+		if (error)
+			return(error);
+		error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
+							 drop_blk->bp);
+		drop_blk->bp = NULL;
+		if (error)
+			return(error);
+	}
+	/*
+	 * We joined all the way to the top.  If it turns out that
+	 * we only have one entry in the root, make the child block
+	 * the new root.
+	 */
+	xfs_da_node_remove(state, drop_blk);
+	xfs_da_fixhashpath(state, &state->path);
+	error = xfs_da_root_join(state, &state->path.blk[0]);
+	return(error);
+}
+
+/*
+ * We have only one entry in the root.	Copy the only remaining child of
+ * the old root to block 0 as the new root node.
+ */
+STATIC int
+xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
+{
+	xfs_da_intnode_t *oldroot;
+	/* REFERENCED */
+	xfs_da_blkinfo_t *blkinfo;
+	xfs_da_args_t *args;
+	xfs_dablk_t child;
+	xfs_dabuf_t *bp;
+	int error;
+
+	args = state->args;
+	ASSERT(args != NULL);
+	ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+	oldroot = root_blk->bp->data;
+	ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	ASSERT(INT_ISZERO(oldroot->hdr.info.forw, ARCH_CONVERT));
+	ASSERT(INT_ISZERO(oldroot->hdr.info.back, ARCH_CONVERT));
+
+	/*
+	 * If the root has more than one child, then don't do anything.
+	 */
+	if (INT_GET(oldroot->hdr.count, ARCH_CONVERT) > 1)
+		return(0);
+
+	/*
+	 * Read in the (only) child block, then copy those bytes into
+	 * the root block's buffer and free the original child block.
+	 */
+	child = INT_GET(oldroot->btree[ 0 ].before, ARCH_CONVERT);
+	ASSERT(child != 0);
+	error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
+					     args->whichfork);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	blkinfo = bp->data;
+	if (INT_GET(oldroot->hdr.level, ARCH_CONVERT) == 1) {
+		ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		       INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
+	} else {
+		ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	}
+	ASSERT(INT_ISZERO(blkinfo->forw, ARCH_CONVERT));
+	ASSERT(INT_ISZERO(blkinfo->back, ARCH_CONVERT));
+	bcopy(bp->data, root_blk->bp->data, state->blocksize);
+	xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
+	error = xfs_da_shrink_inode(args, child, bp);
+	return(error);
+}
+
+/*
+ * Check a node block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
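+/*
+ * Editor's example (assuming 64 entries per node): a node holding 40
+ * entries is over 50% full and is left alone; a node holding 10
+ * entries may coalesce with a sibling holding 30, since 10 + 30 fits
+ * within 64 - 64/4 = 48 entries, leaving 25% free space to spare.
+ */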
+STATIC int
+xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
+{
+	xfs_da_intnode_t *node;
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *info;
+	int count, forward, error, retval, i;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	info = blk->bp->data;
+	ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	node = (xfs_da_intnode_t *)info;
+	count = INT_GET(node->hdr.count, ARCH_CONVERT);
+	if (count > (XFS_DA_NODE_ENTRIES(state->mp) >> 1)) {
+		*action = 0;	/* blk over 50%, don't try to join */
+		return(0);	/* blk over 50%, don't try to join */
+	}
+
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = (!INT_ISZERO(info->forw, ARCH_CONVERT));
+		bcopy(&state->path, &state->altpath, sizeof(state->path));
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error)
+			return(error);
+		if (retval) {
+			*action = 0;
+		} else {
+			*action = 2;
+		}
+		return(0);
+	}
+
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink a directory over time.
+	 */
+	/* start with smaller blk num */
+	forward = (INT_GET(info->forw, ARCH_CONVERT)
+				< INT_GET(info->back, ARCH_CONVERT));
+	for (i = 0; i < 2; forward = !forward, i++) {
+		if (forward)
+			blkno = INT_GET(info->forw, ARCH_CONVERT);
+		else
+			blkno = INT_GET(info->back, ARCH_CONVERT);
+		if (blkno == 0)
+			continue;
+		error = xfs_da_read_buf(state->args->trans, state->args->dp,
+					blkno, -1, &bp, state->args->whichfork);
+		if (error)
+			return(error);
+		ASSERT(bp != NULL);
+
+		node = (xfs_da_intnode_t *)info;
+		count  = XFS_DA_NODE_ENTRIES(state->mp);
+		count -= XFS_DA_NODE_ENTRIES(state->mp) >> 2;
+		count -= INT_GET(node->hdr.count, ARCH_CONVERT);
+		node = bp->data;
+		ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+		count -= INT_GET(node->hdr.count, ARCH_CONVERT);
+		xfs_da_brelse(state->args->trans, bp);
+		if (count >= 0)
+			break;	/* fits with at least 25% to spare */
+	}
+	if (i >= 2) {
+		*action = 0;
+		return(0);
+	}
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	bcopy(&state->path, &state->altpath, sizeof(state->path));
+	if (blkno < blk->blkno) {
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error) {
+			return(error);
+		}
+		if (retval) {
+			*action = 0;
+			return(0);
+		}
+	} else {
+		error = xfs_da_path_shift(state, &state->path, forward,
+						 0, &retval);
+		if (error) {
+			return(error);
+		}
+		if (retval) {
+			*action = 0;
+			return(0);
+		}
+	}
+	*action = 1;
+	return(0);
+}
+
+/*
+ * Walk back up the tree adjusting hash values as necessary;
+ * when we stop making changes, return.
+ */
+void
+xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
+{
+	xfs_da_state_blk_t *blk;
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	xfs_dahash_t lasthash=0;
+	int level, count;
+
+	level = path->active-1;
+	blk = &path->blk[ level ];
+	switch (blk->magic) {
+#ifdef __KERNEL__
+	case XFS_ATTR_LEAF_MAGIC:
+		lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+#endif
+	case XFS_DIR_LEAF_MAGIC:
+		ASSERT(XFS_DIR_IS_V1(state->mp));
+		lasthash = xfs_dir_leaf_lasthash(blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	case XFS_DIR2_LEAFN_MAGIC:
+		ASSERT(XFS_DIR_IS_V2(state->mp));
+		lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	case XFS_DA_NODE_MAGIC:
+		lasthash = xfs_da_node_lasthash(blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	}
+	for (blk--, level--; level >= 0; blk--, level--) {
+		node = blk->bp->data;
+		ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+		btree = &node->btree[ blk->index ];
+		if (INT_GET(btree->hashval, ARCH_CONVERT) == lasthash)
+			break;
+		blk->hashval = lasthash;
+		INT_SET(btree->hashval, ARCH_CONVERT, lasthash);
+		xfs_da_log_buf(state->args->trans, blk->bp,
+				  XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
+
+		lasthash = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+	}
+}
+
+/*
+ * Remove an entry from an intermediate node.
+ */
+STATIC void
+xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
+{
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	int tmp;
+
+	node = drop_blk->bp->data;
+	ASSERT(drop_blk->index < INT_GET(node->hdr.count, ARCH_CONVERT));
+	ASSERT(drop_blk->index >= 0);
+
+	/*
+	 * Copy over the offending entry, or just zero it out.
+	 */
+	btree = &node->btree[drop_blk->index];
+	if (drop_blk->index < (INT_GET(node->hdr.count, ARCH_CONVERT)-1)) {
+		tmp  = INT_GET(node->hdr.count, ARCH_CONVERT) - drop_blk->index - 1;
+		tmp *= (uint)sizeof(xfs_da_node_entry_t);
+		ovbcopy(btree + 1, btree, tmp);
+		xfs_da_log_buf(state->args->trans, drop_blk->bp,
+		    XFS_DA_LOGRANGE(node, btree, tmp));
+		btree = &node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ];
+	}
+	bzero((char *)btree, sizeof(xfs_da_node_entry_t));
+	xfs_da_log_buf(state->args->trans, drop_blk->bp,
+	    XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
+	INT_MOD(node->hdr.count, ARCH_CONVERT, -1);
+	xfs_da_log_buf(state->args->trans, drop_blk->bp,
+	    XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+
+	/*
+	 * Copy the last hash value from the block to propagate upwards.
+	 */
+	btree--;
+	drop_blk->hashval = INT_GET(btree->hashval, ARCH_CONVERT);
+}
+
+/*
+ * Unbalance the btree elements between two intermediate nodes,
+ * moving all Btree elements from one node into the other.
+ */
+STATIC void
+xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+				     xfs_da_state_blk_t *save_blk)
+{
+	xfs_da_intnode_t *drop_node, *save_node;
+	xfs_da_node_entry_t *btree;
+	int tmp;
+	xfs_trans_t *tp;
+
+	drop_node = drop_blk->bp->data;
+	save_node = save_blk->bp->data;
+	ASSERT(INT_GET(drop_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	ASSERT(INT_GET(save_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	tp = state->args->trans;
+
+	/*
+	 * If the dying block has lower hashvals, then move all the
+	 * elements in the remaining block up to make a hole.
+	 */
+	if ((INT_GET(drop_node->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(save_node->btree[ 0 ].hashval, ARCH_CONVERT)) ||
+	    (INT_GET(drop_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
+	     INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))
+	{
+		btree = &save_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT) ];
+		tmp = INT_GET(save_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t);
+		ovbcopy(&save_node->btree[0], btree, tmp);
+		btree = &save_node->btree[0];
+		xfs_da_log_buf(tp, save_blk->bp,
+			XFS_DA_LOGRANGE(save_node, btree,
+				(INT_GET(save_node->hdr.count, ARCH_CONVERT) + INT_GET(drop_node->hdr.count, ARCH_CONVERT)) *
+				sizeof(xfs_da_node_entry_t)));
+	} else {
+		btree = &save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT) ];
+		xfs_da_log_buf(tp, save_blk->bp,
+			XFS_DA_LOGRANGE(save_node, btree,
+				INT_GET(drop_node->hdr.count, ARCH_CONVERT) *
+				sizeof(xfs_da_node_entry_t)));
+	}
+
+	/*
+	 * Move all the B-tree elements from drop_blk to save_blk.
+	 */
+	tmp = INT_GET(drop_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t);
+	bcopy(&drop_node->btree[0], btree, tmp);
+	INT_MOD(save_node->hdr.count, ARCH_CONVERT, INT_GET(drop_node->hdr.count, ARCH_CONVERT));
+
+	xfs_da_log_buf(tp, save_blk->bp,
+		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
+			sizeof(save_node->hdr)));
+
+	/*
+	 * Save the last hashval in the remaining block for upward propagation.
+	 */
+	save_blk->hashval = INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Walk down the Btree looking for a particular filename, filling
+ * in the state structure as we go.
+ *
+ * We will set the state structure to point to each of the elements
+ * in each of the nodes where either the hashval is or should be.
+ *
+ * We support duplicate hashval's so for each entry in the current
+ * node that could contain the desired hashval, descend.  This is a
+ * pruned depth-first tree search.
+ */
+int							/* error */
+xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
+{
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *curr;
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	xfs_dablk_t blkno;
+	int probe, span, max, error, retval;
+	xfs_dahash_t hashval;
+	xfs_da_args_t *args;
+
+	args = state->args;
+	/*
+	 * Descend thru the B-tree searching each level for the right
+	 * node to use, until the right hashval is found.
+	 */
+	if (args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(state->mp))
+		blkno = state->mp->m_dirleafblk;
+	else
+		blkno = 0;
+	for (blk = &state->path.blk[0], state->path.active = 1;
+			 state->path.active <= XFS_DA_NODE_MAXDEPTH;
+			 blk++, state->path.active++) {
+		/*
+		 * Read the next node down in the tree.
+		 */
+		blk->blkno = blkno;
+		error = xfs_da_read_buf(state->args->trans, state->args->dp,
+					blkno, -1, &blk->bp,
+					state->args->whichfork);
+		if (error) {
+			blk->blkno = 0;
+			state->path.active--;
+			return(error);
+		}
+		ASSERT(blk->bp != NULL);
+		curr = blk->bp->data;
+		ASSERT(INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC ||
+		       INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		       INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
+
+		/*
+		 * Search an intermediate node for a match.
+		 */
+		blk->magic = INT_GET(curr->magic, ARCH_CONVERT);
+		if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+			node = blk->bp->data;
+			blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+
+			/*
+			 * Binary search.  (note: small blocks will skip loop)
+			 */
+			max = INT_GET(node->hdr.count, ARCH_CONVERT);
+			probe = span = max / 2;
+			hashval = state->args->hashval;
+			for (btree = &node->btree[probe]; span > 4;
+				   btree = &node->btree[probe]) {
+				span /= 2;
+				if (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)
+					probe += span;
+				else if (INT_GET(btree->hashval, ARCH_CONVERT) > hashval)
+					probe -= span;
+				else
+					break;
+			}
+			ASSERT((probe >= 0) && (probe < max));
+			ASSERT((span <= 4) || (INT_GET(btree->hashval, ARCH_CONVERT) == hashval));
+
+			/*
+			 * Since we may have duplicate hashval's, find the first
+			 * matching hashval in the node.
+			 */
+			while ((probe > 0) && (INT_GET(btree->hashval, ARCH_CONVERT) >= hashval)) {
+				btree--;
+				probe--;
+			}
+			while ((probe < max) && (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)) {
+				btree++;
+				probe++;
+			}
+
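+			/*
+			 * Editor's example: with hashvals { 10, 20, 20, 20,
+			 * 30 } and a search hashval of 20, the binary search
+			 * may stop on any of the three 20s; the two scans
+			 * above leave probe at the first 20 (index 1), so
+			 * entries with duplicate hashvals are never skipped.
+			 */
+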
+			/*
+			 * Pick the right block to descend on.
+			 */
+			if (probe == max) {
+				blk->index = max-1;
+				blkno = INT_GET(node->btree[ max-1 ].before, ARCH_CONVERT);
+			} else {
+				blk->index = probe;
+				blkno = INT_GET(btree->before, ARCH_CONVERT);
+			}
+		}
+#ifdef __KERNEL__
+		else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
+			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+			break;
+		}
+#endif
+		else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
+			blk->hashval = xfs_dir_leaf_lasthash(blk->bp, NULL);
+			break;
+		}
+		else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
+			blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
+			break;
+		}
+	}
+
+	/*
+	 * A leaf block that ends in the hashval that we are interested in
+	 * (final hashval == search hashval) means that the next block may
+	 * contain more entries with the same hashval, shift upward to the
+	 * next leaf and keep searching.
+	 */
+	for (;;) {
+		if (blk->magic == XFS_DIR_LEAF_MAGIC) {
+			ASSERT(XFS_DIR_IS_V1(state->mp));
+			retval = xfs_dir_leaf_lookup_int(blk->bp, state->args,
+								  &blk->index);
+		} else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
+			ASSERT(XFS_DIR_IS_V2(state->mp));
+			retval = xfs_dir2_leafn_lookup_int(blk->bp, state->args,
+							&blk->index, state);
+		}
+#ifdef __KERNEL__
+		else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+			retval = xfs_attr_leaf_lookup_int(blk->bp, state->args);
+			blk->index = state->args->index;
+			state->args->blkno = blk->blkno;
+		}
+#endif
+		if (((retval == ENOENT) || (retval == ENOATTR)) &&
+		    (blk->hashval == state->args->hashval)) {
+			error = xfs_da_path_shift(state, &state->path, 1, 1,
+							 &retval);
+			if (error)
+				return(error);
+			if (retval == 0) {
+				continue;
+			}
+#ifdef __KERNEL__
+			else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+				/* path_shift() gives ENOENT */
+				retval = XFS_ERROR(ENOATTR);
+			}
+#endif
+		}
+		break;
+	}
+	*result = retval;
+	return(0);
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Link a new block into a doubly linked list of blocks (of whatever type).
+ */
+int							/* error */
+xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+			       xfs_da_state_blk_t *new_blk)
+{
+	xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
+	xfs_da_args_t *args;
+	int before=0, error;
+	xfs_dabuf_t *bp;
+
+	/*
+	 * Set up environment.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	old_info = old_blk->bp->data;
+	new_info = new_blk->bp->data;
+	ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
+	       old_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+	       old_blk->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(old_blk->magic == INT_GET(old_info->magic, ARCH_CONVERT));
+	ASSERT(new_blk->magic == INT_GET(new_info->magic, ARCH_CONVERT));
+	ASSERT(old_blk->magic == new_blk->magic);
+
+	switch (old_blk->magic) {
+#ifdef __KERNEL__
+	case XFS_ATTR_LEAF_MAGIC:
+		before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
+		break;
+#endif
+	case XFS_DIR_LEAF_MAGIC:
+		ASSERT(XFS_DIR_IS_V1(state->mp));
+		before = xfs_dir_leaf_order(old_blk->bp, new_blk->bp);
+		break;
+	case XFS_DIR2_LEAFN_MAGIC:
+		ASSERT(XFS_DIR_IS_V2(state->mp));
+		before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
+		break;
+	case XFS_DA_NODE_MAGIC:
+		before = xfs_da_node_order(old_blk->bp, new_blk->bp);
+		break;
+	}
+
+	/*
+	 * Link blocks in appropriate order.
+	 */
+	if (before) {
+		/*
+		 * Link new block in before existing block.
+		 */
+		INT_SET(new_info->forw, ARCH_CONVERT, old_blk->blkno);
+		new_info->back = old_info->back; /* INT_: direct copy */
+		if (INT_GET(old_info->back, ARCH_CONVERT)) {
+			error = xfs_da_read_buf(args->trans, args->dp,
+						INT_GET(old_info->back,
+							ARCH_CONVERT), -1, &bp,
+						args->whichfork);
+			if (error)
+				return(error);
+			ASSERT(bp != NULL);
+			tmp_info = bp->data;
+			ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(old_info->magic, ARCH_CONVERT));
+			ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == old_blk->blkno);
+			INT_SET(tmp_info->forw, ARCH_CONVERT, new_blk->blkno);
+			xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+			xfs_da_buf_done(bp);
+		}
+		INT_SET(old_info->back, ARCH_CONVERT, new_blk->blkno);
+	} else {
+		/*
+		 * Link new block in after existing block.
+		 */
+		new_info->forw = old_info->forw; /* INT_: direct copy */
+		INT_SET(new_info->back, ARCH_CONVERT, old_blk->blkno);
+		if (INT_GET(old_info->forw, ARCH_CONVERT)) {
+			error = xfs_da_read_buf(args->trans, args->dp,
+						INT_GET(old_info->forw, ARCH_CONVERT), -1, &bp,
+						args->whichfork);
+			if (error)
+				return(error);
+			ASSERT(bp != NULL);
+			tmp_info = bp->data;
+			ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT)
+				    == INT_GET(old_info->magic, ARCH_CONVERT));
+			ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT)
+				    == old_blk->blkno);
+			INT_SET(tmp_info->back, ARCH_CONVERT, new_blk->blkno);
+			xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+			xfs_da_buf_done(bp);
+		}
+		INT_SET(old_info->forw, ARCH_CONVERT, new_blk->blkno);
+	}
+
+	xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
+	xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
+	return(0);
+}
+
+/*
+ * Compare two intermediate nodes for "order".
+ */
+STATIC int
+xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp)
+{
+	xfs_da_intnode_t *node1, *node2;
+
+	node1 = node1_bp->data;
+	node2 = node2_bp->data;
+	ASSERT((INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) &&
+	       (INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC));
+	if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) &&
+	    ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) <
+	      INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) ||
+	     (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
+	      INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
+		return(1);
+	}
+	return(0);
+}
+
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ */
+STATIC uint
+xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count)
+{
+	xfs_da_intnode_t *node;
+
+	node = bp->data;
+	ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	if (count)
+		*count = INT_GET(node->hdr.count, ARCH_CONVERT);
+	if (INT_ISZERO(node->hdr.count, ARCH_CONVERT))
+		return(0);
+	return(INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
+}
+
+/*
+ * Unlink a block from a doubly linked list of blocks.
+ */
+int							/* error */
+xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+				 xfs_da_state_blk_t *save_blk)
+{
+	xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
+	xfs_da_args_t *args;
+	xfs_dabuf_t *bp;
+	int error;
+
+	/*
+	 * Set up environment.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	save_info = save_blk->bp->data;
+	drop_info = drop_blk->bp->data;
+	ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
+	       save_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+	       save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(save_blk->magic == INT_GET(save_info->magic, ARCH_CONVERT));
+	ASSERT(drop_blk->magic == INT_GET(drop_info->magic, ARCH_CONVERT));
+	ASSERT(save_blk->magic == drop_blk->magic);
+	ASSERT((INT_GET(save_info->forw, ARCH_CONVERT) == drop_blk->blkno) ||
+	       (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno));
+	ASSERT((INT_GET(drop_info->forw, ARCH_CONVERT) == save_blk->blkno) ||
+	       (INT_GET(drop_info->back, ARCH_CONVERT) == save_blk->blkno));
+
+	/*
+	 * Unlink the leaf block from the doubly linked chain of leaves.
+	 */
+	if (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno) {
+		save_info->back = drop_info->back; /* INT_: direct copy */
+		if (INT_GET(drop_info->back, ARCH_CONVERT)) {
+			error = xfs_da_read_buf(args->trans, args->dp,
+						INT_GET(drop_info->back,
+							ARCH_CONVERT), -1, &bp,
+						args->whichfork);
+			if (error)
+				return(error);
+			ASSERT(bp != NULL);
+			tmp_info = bp->data;
+			ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(save_info->magic, ARCH_CONVERT));
+			ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == drop_blk->blkno);
+			INT_SET(tmp_info->forw, ARCH_CONVERT, save_blk->blkno);
+			xfs_da_log_buf(args->trans, bp, 0,
+						    sizeof(*tmp_info) - 1);
+			xfs_da_buf_done(bp);
+		}
+	} else {
+		save_info->forw = drop_info->forw; /* INT_: direct copy */
+		if (INT_GET(drop_info->forw, ARCH_CONVERT)) {
+			error = xfs_da_read_buf(args->trans, args->dp,
+						INT_GET(drop_info->forw, ARCH_CONVERT), -1, &bp,
+						args->whichfork);
+			if (error)
+				return(error);
+			ASSERT(bp != NULL);
+			tmp_info = bp->data;
+			ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT)
+				    == INT_GET(save_info->magic, ARCH_CONVERT));
+			ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT)
+				    == drop_blk->blkno);
+			INT_SET(tmp_info->back, ARCH_CONVERT, save_blk->blkno);
+			xfs_da_log_buf(args->trans, bp, 0,
+						    sizeof(*tmp_info) - 1);
+			xfs_da_buf_done(bp);
+		}
+	}
+
+	xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
+	return(0);
+}
+
+/*
+ * Move a path "forward" or "!forward" one block at the current level.
+ *
+ * This routine will adjust a "path" to point to the next block
+ * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
+ * Btree, including updating pointers to the intermediate nodes between
+ * the new bottom and the root.
+ */
+int							/* error */
+xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+				 int forward, int release, int *result)
+{
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *info;
+	xfs_da_intnode_t *node;
+	xfs_da_args_t *args;
+	xfs_dablk_t blkno=0;
+	int level, error;
+
+	/*
+	 * Roll up the Btree looking for the first block where our
+	 * current index is not at the edge of the block.  Note that
+	 * we skip the bottom layer because we want the sibling block.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	ASSERT(path != NULL);
+	ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	level = (path->active-1) - 1;	/* skip bottom layer in path */
+	for (blk = &path->blk[level]; level >= 0; blk--, level--) {
+		ASSERT(blk->bp != NULL);
+		node = blk->bp->data;
+		ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+		if (forward && (blk->index < INT_GET(node->hdr.count, ARCH_CONVERT)-1)) {
+			blk->index++;
+			blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
+			break;
+		} else if (!forward && (blk->index > 0)) {
+			blk->index--;
+			blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
+			break;
+		}
+	}
+	if (level < 0) {
+		*result = XFS_ERROR(ENOENT);	/* we're out of our tree */
+		ASSERT(args->oknoent);
+		return(0);
+	}
+
+	/*
+	 * Roll down the edge of the subtree until we reach the
+	 * same depth we were at originally.
+	 */
+	for (blk++, level++; level < path->active; blk++, level++) {
+		/*
+		 * Release the old block.
+		 * (if it's dirty, trans won't actually let go)
+		 */
+		if (release)
+			xfs_da_brelse(args->trans, blk->bp);
+
+		/*
+		 * Read the next child block.
+		 */
+		blk->blkno = blkno;
+		error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
+						     &blk->bp, args->whichfork);
+		if (error)
+			return(error);
+		ASSERT(blk->bp != NULL);
+		info = blk->bp->data;
+		ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC ||
+		       INT_GET(info->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		       INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
+		blk->magic = INT_GET(info->magic, ARCH_CONVERT);
+		if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+			node = (xfs_da_intnode_t *)info;
+			blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+			if (forward)
+				blk->index = 0;
+			else
+				blk->index = INT_GET(node->hdr.count, ARCH_CONVERT)-1;
+			blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
+		} else {
+			ASSERT(level == path->active-1);
+			blk->index = 0;
+			switch(blk->magic) {
+#ifdef __KERNEL__
+			case XFS_ATTR_LEAF_MAGIC:
+				blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
+								      NULL);
+				break;
+#endif
+			case XFS_DIR_LEAF_MAGIC:
+				ASSERT(XFS_DIR_IS_V1(state->mp));
+				blk->hashval = xfs_dir_leaf_lasthash(blk->bp,
+								     NULL);
+				break;
+			case XFS_DIR2_LEAFN_MAGIC:
+				ASSERT(XFS_DIR_IS_V2(state->mp));
+				blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
+								       NULL);
+				break;
+			default:
+				ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
+				       blk->magic ==
+				       XFS_DIRX_LEAF_MAGIC(state->mp));
+				break;
+			}
+		}
+	}
+	*result = 0;
+	return(0);
+}
+
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Implement a simple hash on a character string.
+ * Rotate the hash value by 7 bits, then XOR each character in.
+ * This is implemented with some source-level loop unrolling.
+ */
+xfs_dahash_t
+xfs_da_hashname(uchar_t *name, int namelen)
+{
+	xfs_dahash_t hash;
+
+#define ROTL(x,y)	(((x) << (y)) | ((x) >> (32 - (y))))
+#ifdef SLOWVERSION
+	/*
+	 * This is the old one-byte-at-a-time version.
+	 */
+	for (hash = 0; namelen > 0; namelen--) {
+		hash = *name++ ^ ROTL(hash, 7);
+	}
+	return(hash);
+#else
+	/*
+	 * Do four characters at a time as long as we can.
+	 */
+	for (hash = 0; namelen >= 4; namelen -= 4, name += 4) {
+		hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
+		       (name[3] << 0) ^ ROTL(hash, 7 * 4);
+	}
+	/*
+	 * Now do the rest of the characters.
+	 */
+	switch (namelen) {
+	case 3:
+		return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
+		       ROTL(hash, 7 * 3);
+	case 2:
+		return (name[0] << 7) ^ (name[1] << 0) ^ ROTL(hash, 7 * 2);
+	case 1:
+		return (name[0] << 0) ^ ROTL(hash, 7 * 1);
+	case 0:
+		return hash;
+	}
+	/* NOTREACHED */
+#endif
+#undef ROTL
+	return 0; /* keep gcc happy */
+}
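
As a sanity check on the unrolling: ROTL distributes over XOR, and a byte
shifted left by at most 21 bits never wraps past bit 31, so the
four-at-a-time form computes exactly the same value as the byte-at-a-time
reference.  A minimal standalone userspace sketch of that equivalence
(hypothetical test code, not part of this patch):

	#include <assert.h>
	#include <stdint.h>
	#include <string.h>

	#define ROTL(x, y)	(((x) << (y)) | ((x) >> (32 - (y))))

	/* Reference form: rotate the hash by 7 bits, XOR each byte in. */
	static uint32_t hash_slow(const unsigned char *name, int namelen)
	{
		uint32_t hash = 0;

		while (namelen-- > 0)
			hash = *name++ ^ ROTL(hash, 7);
		return hash;
	}

	/* Unrolled form, mirroring the fast path of xfs_da_hashname(). */
	static uint32_t hash_fast(const unsigned char *name, int namelen)
	{
		uint32_t hash;

		for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
			hash = (name[0] << 21) ^ (name[1] << 14) ^
			       (name[2] << 7) ^ (name[3] << 0) ^
			       ROTL(hash, 7 * 4);
		switch (namelen) {
		case 3:
			return (name[0] << 14) ^ (name[1] << 7) ^
			       (name[2] << 0) ^ ROTL(hash, 7 * 3);
		case 2:
			return (name[0] << 7) ^ (name[1] << 0) ^
			       ROTL(hash, 7 * 2);
		case 1:
			return (name[0] << 0) ^ ROTL(hash, 7 * 1);
		default:
			return hash;
		}
	}

	int main(void)
	{
		const char *names[] = { "", "a", "ab", "abc", "lost+found" };
		int i;

		for (i = 0; i < 5; i++) {
			const unsigned char *s =
				(const unsigned char *)names[i];
			int len = (int)strlen(names[i]);

			assert(hash_slow(s, len) == hash_fast(s, len));
		}
		return 0;
	}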
+
+/*
+ * Add a block to the btree ahead of the file.
+ * Return the new block number to the caller.
+ */
+int
+xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
+{
+	xfs_fileoff_t bno, b;
+	xfs_bmbt_irec_t map;
+	xfs_bmbt_irec_t *mapp;
+	xfs_inode_t *dp;
+	int nmap, error, w, count, c, got, i, mapi;
+	xfs_fsize_t size;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	w = args->whichfork;
+	tp = args->trans;
+	/*
+	 * For new directories adjust the file offset and block count.
+	 */
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) {
+		bno = mp->m_dirleafblk;
+		count = mp->m_dirblkfsbs;
+	} else {
+		bno = 0;
+		count = 1;
+	}
+	/*
+	 * Find a spot in the file space to put the new block.
+	 */
+	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) {
+		return error;
+	}
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+		ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk);
+	/*
+	 * Try mapping it in one filesystem block.
+	 */
+	nmap = 1;
+	ASSERT(args->firstblock != NULL);
+	if ((error = xfs_bmapi(tp, dp, bno, count,
+			XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
+			XFS_BMAPI_CONTIG,
+			args->firstblock, args->total, &map, &nmap,
+			args->flist))) {
+		return error;
+	}
+	ASSERT(nmap <= 1);
+	if (nmap == 1) {
+		mapp = &map;
+		mapi = 1;
+	}
+	/*
+	 * If we didn't get it and the block might work if fragmented,
+	 * try without the CONTIG flag.	 Loop until we get it all.
+	 */
+	else if (nmap == 0 && count > 1) {
+		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+		for (b = bno, mapi = 0; b < bno + count; ) {
+			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
+			c = (int)(bno + count - b);
+			if ((error = xfs_bmapi(tp, dp, b, c,
+					XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
+					XFS_BMAPI_METADATA,
+					args->firstblock, args->total,
+					&mapp[mapi], &nmap, args->flist))) {
+				kmem_free(mapp, sizeof(*mapp) * count);
+				return error;
+			}
+			if (nmap < 1)
+				break;
+			mapi += nmap;
+			b = mapp[mapi - 1].br_startoff +
+			    mapp[mapi - 1].br_blockcount;
+		}
+	} else {
+		mapi = 0;
+		mapp = NULL;
+	}
+	/*
+	 * Count the blocks we got, make sure it matches the total.
+	 */
+	for (i = 0, got = 0; i < mapi; i++)
+		got += mapp[i].br_blockcount;
+	if (got != count || mapp[0].br_startoff != bno ||
+	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+	    bno + count) {
+		if (mapp != &map)
+			kmem_free(mapp, sizeof(*mapp) * count);
+		return XFS_ERROR(ENOSPC);
+	}
+	if (mapp != &map)
+		kmem_free(mapp, sizeof(*mapp) * count);
+	*new_blkno = (xfs_dablk_t)bno;
+	/*
+	 * For version 1 directories, adjust the file size if it changed.
+	 */
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
+		ASSERT(mapi == 1);
+		if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
+			return error;
+		size = XFS_FSB_TO_B(mp, bno);
+		if (size != dp->i_d.di_size) {
+			dp->i_d.di_size = size;
+			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Ick.	 We need to always be able to remove a btree block, even
+ * if there's no space reservation because the filesystem is full.
+ * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
+ * It swaps the target block with the last block in the file.  The
+ * last block in the file can always be removed, since removing it
+ * cannot cause a bmap btree split.
+ */
+STATIC int
+xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
+		      xfs_dabuf_t **dead_bufp)
+{
+	xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
+	xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf;
+	xfs_fileoff_t lastoff;
+	xfs_inode_t *ip;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+	int error, w, entno, level, dead_level;
+	xfs_da_blkinfo_t *dead_info, *sib_info;
+	xfs_da_intnode_t *par_node, *dead_node;
+	xfs_dir_leafblock_t *dead_leaf;
+	xfs_dir2_leaf_t *dead_leaf2;
+	xfs_dahash_t dead_hash;
+
+	dead_buf = *dead_bufp;
+	dead_blkno = *dead_blknop;
+	tp = args->trans;
+	ip = args->dp;
+	w = args->whichfork;
+	ASSERT(w == XFS_DATA_FORK);
+	mp = ip->i_mount;
+	if (XFS_DIR_IS_V2(mp)) {
+		lastoff = mp->m_dirfreeblk;
+		error = xfs_bmap_last_before(tp, ip, &lastoff, w);
+	} else
+		error = xfs_bmap_last_offset(tp, ip, &lastoff, w);
+	if (error)
+		return error;
+	if (lastoff == 0)
+		return XFS_ERROR(EFSCORRUPTED);
+	/*
+	 * Read the last block in the btree space.
+	 */
+	last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
+	if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
+		return error;
+	/*
+	 * Copy the last block into the dead buffer and log it.
+	 */
+	bcopy(last_buf->data, dead_buf->data, mp->m_dirblksize);
+	xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
+	dead_info = dead_buf->data;
+	/*
+	 * Get values from the moved block.
+	 */
+	if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
+		ASSERT(XFS_DIR_IS_V1(mp));
+		dead_leaf = (xfs_dir_leafblock_t *)dead_info;
+		dead_level = 0;
+		dead_hash =
+			INT_GET(dead_leaf->entries[INT_GET(dead_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	} else if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
+		ASSERT(XFS_DIR_IS_V2(mp));
+		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+		dead_level = 0;
+		dead_hash = INT_GET(dead_leaf2->ents[INT_GET(dead_leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	} else {
+		ASSERT(INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+		dead_node = (xfs_da_intnode_t *)dead_info;
+		dead_level = INT_GET(dead_node->hdr.level, ARCH_CONVERT);
+		dead_hash = INT_GET(dead_node->btree[INT_GET(dead_node->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	}
+	sib_buf = par_buf = NULL;
+	/*
+	 * If the moved block has a left sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = INT_GET(dead_info->back, ARCH_CONVERT))) {
+		if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+			goto done;
+		sib_info = sib_buf->data;
+		if (INT_GET(sib_info->forw, ARCH_CONVERT) != last_blkno ||
+		    INT_GET(sib_info->magic, ARCH_CONVERT) != INT_GET(dead_info->magic, ARCH_CONVERT)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		INT_SET(sib_info->forw, ARCH_CONVERT, dead_blkno);
+		xfs_da_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+					sizeof(sib_info->forw)));
+		xfs_da_buf_done(sib_buf);
+		sib_buf = NULL;
+	}
+	/*
+	 * If the moved block has a right sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = INT_GET(dead_info->forw, ARCH_CONVERT))) {
+		if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+			goto done;
+		sib_info = sib_buf->data;
+		if (   INT_GET(sib_info->back, ARCH_CONVERT) != last_blkno
+		    || INT_GET(sib_info->magic, ARCH_CONVERT)
+				!= INT_GET(dead_info->magic, ARCH_CONVERT)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		INT_SET(sib_info->back, ARCH_CONVERT, dead_blkno);
+		xfs_da_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->back,
+					sizeof(sib_info->back)));
+		xfs_da_buf_done(sib_buf);
+		sib_buf = NULL;
+	}
+	par_blkno = XFS_DIR_IS_V1(mp) ? 0 : mp->m_dirleafblk;
+	level = -1;
+	/*
+	 * Walk down the tree looking for the parent of the moved block.
+	 */
+	for (;;) {
+		if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+			goto done;
+		par_node = par_buf->data;
+		if (INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC ||
+		    (level >= 0 && level != INT_GET(par_node->hdr.level, ARCH_CONVERT) + 1)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		level = INT_GET(par_node->hdr.level, ARCH_CONVERT);
+		for (entno = 0;
+		     entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
+		     INT_GET(par_node->btree[entno].hashval, ARCH_CONVERT) < dead_hash;
+		     entno++)
+			continue;
+		if (entno == INT_GET(par_node->hdr.count, ARCH_CONVERT)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		par_blkno = INT_GET(par_node->btree[entno].before, ARCH_CONVERT);
+		if (level == dead_level + 1)
+			break;
+		xfs_da_brelse(tp, par_buf);
+		par_buf = NULL;
+	}
+	/*
+	 * We're in the right parent block.
+	 * Look for the right entry.
+	 */
+	for (;;) {
+		for (;
+		     entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
+		     INT_GET(par_node->btree[entno].before, ARCH_CONVERT) != last_blkno;
+		     entno++)
+			continue;
+		if (entno < INT_GET(par_node->hdr.count, ARCH_CONVERT))
+			break;
+		par_blkno = INT_GET(par_node->hdr.info.forw, ARCH_CONVERT);
+		xfs_da_brelse(tp, par_buf);
+		par_buf = NULL;
+		if (par_blkno == 0) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+			goto done;
+		par_node = par_buf->data;
+		if (INT_GET(par_node->hdr.level, ARCH_CONVERT) != level ||
+		    INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		entno = 0;
+	}
+	/*
+	 * Update the parent entry pointing to the moved block.
+	 */
+	INT_SET(par_node->btree[entno].before, ARCH_CONVERT, dead_blkno);
+	xfs_da_log_buf(tp, par_buf,
+		XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
+				sizeof(par_node->btree[entno].before)));
+	xfs_da_buf_done(par_buf);
+	xfs_da_buf_done(dead_buf);
+	*dead_blknop = last_blkno;
+	*dead_bufp = last_buf;
+	return 0;
+done:
+	if (par_buf)
+		xfs_da_brelse(tp, par_buf);
+	if (sib_buf)
+		xfs_da_brelse(tp, sib_buf);
+	xfs_da_brelse(tp, last_buf);
+	return error;
+}
+
+/*
+ * Remove a btree block from a directory or attribute.
+ */
+int
+xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+		    xfs_dabuf_t *dead_buf)
+{
+	xfs_inode_t *dp;
+	int done, error, w, count;
+	xfs_fileoff_t bno;
+	xfs_fsize_t size;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+
+	dp = args->dp;
+	w = args->whichfork;
+	tp = args->trans;
+	mp = dp->i_mount;
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+		count = mp->m_dirblkfsbs;
+	else
+		count = 1;
+	for (;;) {
+		/*
+		 * Remove extents.  If we get ENOSPC for a dir we have to move
+		 * the last block to the place we want to kill.
+		 */
+		if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
+				XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
+				0, args->firstblock, args->flist,
+				&done)) == ENOSPC) {
+			if (w != XFS_DATA_FORK)
+				goto done;
+			if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
+					&dead_buf)))
+				goto done;
+		} else if (error)
+			goto done;
+		else
+			break;
+	}
+	ASSERT(done);
+	xfs_da_binval(tp, dead_buf);
+	/*
+	 * Adjust the directory size for version 1.
+	 */
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
+		if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
+			return error;
+		size = XFS_FSB_TO_B(dp->i_mount, bno);
+		if (size != dp->i_d.di_size) {
+			dp->i_d.di_size = size;
+			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+		}
+	}
+	return 0;
+done:
+	xfs_da_binval(tp, dead_buf);
+	return error;
+}
+
+/*
+ * See if the mapping(s) for this btree block are valid, i.e.
+ * don't contain holes, are logically contiguous, and cover the whole range.
+ */
+STATIC int
+xfs_da_map_covers_blocks(
+	int		nmap,
+	xfs_bmbt_irec_t *mapp,
+	xfs_dablk_t	bno,
+	int		count)
+{
+	int		i;
+	xfs_fileoff_t	off;
+
+	for (i = 0, off = bno; i < nmap; i++) {
+		if (mapp[i].br_startblock == HOLESTARTBLOCK ||
+		    mapp[i].br_startblock == DELAYSTARTBLOCK) {
+			return 0;
+		}
+		if (off != mapp[i].br_startoff) {
+			return 0;
+		}
+		off += mapp[i].br_blockcount;
+	}
+	return off == bno + count;
+}
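
A worked example of the rule this implements: for bno == 0 and count == 8,
a two-entry mapping of { startoff 0, 3 blocks } followed by { startoff 3,
5 blocks } passes, since the extents abut and cover the whole range; a hole
or delayed-allocation extent, a gap between startoffs, or a short tail each
make the function return 0.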
+
+/*
+ * Make a dabuf.
+ * Used for get_buf (caller 0), read_buf (caller 1), read_bufr (apparently
+ * caller 2, compiled only outside the kernel), and reada_buf (caller 3);
+ * see the switch on "caller" below.
+ */
+STATIC int
+xfs_da_do_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	xfs_daddr_t	*mappedbnop,
+	xfs_dabuf_t	**bpp,
+	int		whichfork,
+	int		caller,
+	inst_t		*ra)
+{
+	xfs_buf_t	*bp = 0;
+	xfs_buf_t	**bplist;
+	int		error=0;
+	int		i;
+	xfs_bmbt_irec_t map;
+	xfs_bmbt_irec_t *mapp;
+	xfs_daddr_t	mappedbno;
+	xfs_mount_t	*mp;
+	int		nbplist=0;
+	int		nfsb;
+	int		nmap;
+	xfs_dabuf_t	*rbp;
+
+	mp = dp->i_mount;
+	if (whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+		nfsb = mp->m_dirblkfsbs;
+	else
+		nfsb = 1;
+	mappedbno = *mappedbnop;
+	/*
+	 * Caller doesn't have a mapping.  -2 means don't complain
+	 * if we land in a hole.
+	 */
+	if (mappedbno == -1 || mappedbno == -2) {
+		/*
+		 * Optimize the one-block case.
+		 */
+		if (nfsb == 1) {
+			xfs_fsblock_t	fsb;
+
+			if ((error =
+			    xfs_bmapi_single(trans, dp, whichfork, &fsb,
+				    (xfs_fileoff_t)bno))) {
+				return error;
+			}
+			mapp = &map;
+			if (fsb == NULLFSBLOCK) {
+				nmap = 0;
+			} else {
+				map.br_startblock = fsb;
+				map.br_startoff = (xfs_fileoff_t)bno;
+				map.br_blockcount = 1;
+				nmap = 1;
+			}
+		} else {
+			mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
+			nmap = nfsb;
+			if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
+					nfsb,
+					XFS_BMAPI_METADATA |
+						XFS_BMAPI_AFLAG(whichfork),
+					NULL, 0, mapp, &nmap, NULL)))
+				goto exit0;
+		}
+	} else {
+		map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
+		map.br_startoff = (xfs_fileoff_t)bno;
+		map.br_blockcount = nfsb;
+		mapp = &map;
+		nmap = 1;
+	}
+	if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) {
+		error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
+		goto exit0;
+	}
+	if (caller != 3 && nmap > 1) {
+		bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP);
+		nbplist = 0;
+	} else
+		bplist = NULL;
+	/*
+	 * Turn the mapping(s) into buffer(s).
+	 */
+	for (i = 0; i < nmap; i++) {
+		int	nmapped;
+
+		mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock);
+		if (i == 0)
+			*mappedbnop = mappedbno;
+		nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount);
+		switch (caller) {
+		case 0:
+			bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
+				mappedbno, nmapped, 0);
+			error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO);
+			break;
+		case 1:
+#ifndef __KERNEL__
+		case 2:
+#endif
+			bp = NULL;
+			error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp,
+				mappedbno, nmapped, 0, &bp);
+			break;
+#ifdef __KERNEL__
+		case 3:
+			xfs_baread(mp->m_ddev_targp, mappedbno, nmapped);
+			error = 0;
+			bp = NULL;
+			break;
+#endif
+		}
+		if (error) {
+			if (bp)
+				xfs_trans_brelse(trans, bp);
+			goto exit1;
+		}
+		if (!bp)
+			continue;
+		if (caller == 1) {
+			if (whichfork == XFS_ATTR_FORK) {
+				XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE,
+						XFS_ATTR_BTREE_REF);
+			} else {
+				XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
+						XFS_DIR_BTREE_REF);
+			}
+		}
+		if (bplist) {
+			bplist[nbplist++] = bp;
+		}
+	}
+	/*
+	 * Build a dabuf structure.
+	 */
+	if (bplist) {
+		rbp = xfs_da_buf_make(nbplist, bplist, ra);
+	} else if (bp)
+		rbp = xfs_da_buf_make(1, &bp, ra);
+	else
+		rbp = NULL;
+	/*
+	 * For read_buf, check the magic number.
+	 */
+	if (caller == 1) {
+		xfs_dir2_data_t		*data;
+		xfs_dir2_free_t		*free;
+		xfs_da_blkinfo_t	*info;
+		uint			magic, magic1;
+
+		info = rbp->data;
+		data = rbp->data;
+		free = rbp->data;
+		magic = INT_GET(info->magic, ARCH_CONVERT);
+		magic1 = INT_GET(data->hdr.magic, ARCH_CONVERT);
+		if (XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
+				   (magic != XFS_DIR_LEAF_MAGIC) &&
+				   (magic != XFS_ATTR_LEAF_MAGIC) &&
+				   (magic != XFS_DIR2_LEAF1_MAGIC) &&
+				   (magic != XFS_DIR2_LEAFN_MAGIC) &&
+				   (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
+				   (magic1 != XFS_DIR2_DATA_MAGIC) &&
+				   (INT_GET(free->hdr.magic, ARCH_CONVERT) != XFS_DIR2_FREE_MAGIC),
+				mp, XFS_ERRTAG_DA_READ_BUF,
+				XFS_RANDOM_DA_READ_BUF)) {
+			xfs_buftrace("DA READ ERROR", rbp->bps[0]);
+			error = XFS_ERROR(EFSCORRUPTED);
+			xfs_da_brelse(trans, rbp);
+			nbplist = 0;
+			goto exit1;
+		}
+	}
+	if (bplist) {
+		kmem_free(bplist, sizeof(*bplist) * nmap);
+	}
+	if (mapp != &map) {
+		kmem_free(mapp, sizeof(*mapp) * nfsb);
+	}
+	if (bpp)
+		*bpp = rbp;
+	return 0;
+exit1:
+	if (bplist) {
+		for (i = 0; i < nbplist; i++)
+			xfs_trans_brelse(trans, bplist[i]);
+		kmem_free(bplist, sizeof(*bplist) * nmap);
+	}
+exit0:
+	if (mapp != &map)
+		kmem_free(mapp, sizeof(*mapp) * nfsb);
+	if (bpp)
+		*bpp = NULL;
+	return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block.
+ */
+int
+xfs_da_get_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	xfs_daddr_t	mappedbno,
+	xfs_dabuf_t	**bpp,
+	int		whichfork)
+{
+	return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0,
+						 (inst_t *)__return_address);
+}
+
+/*
+ * Get a buffer for the dir/attr block, fill in the contents.
+ */
+int
+xfs_da_read_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	xfs_daddr_t	mappedbno,
+	xfs_dabuf_t	**bpp,
+	int		whichfork)
+{
+	return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1,
+		(inst_t *)__return_address);
+}
+
+/*
+ * Readahead the dir/attr block.
+ */
+xfs_daddr_t
+xfs_da_reada_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	int		whichfork)
+{
+	xfs_daddr_t		rval;
+
+	rval = -1;
+	if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3,
+			(inst_t *)__return_address))
+		return -1;
+	else
+		return rval;
+}
+
+/*
+ * Calculate the number of bits needed to hold i different values.
+ */
+uint
+xfs_da_log2_roundup(uint i)
+{
+	uint rval;
+
+	for (rval = 0; rval < NBBY * sizeof(i); rval++) {
+		if ((1 << rval) >= i)
+			break;
+	}
+	return(rval);
+}
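
A few worked values of the loop above: xfs_da_log2_roundup(1) == 0,
xfs_da_log2_roundup(2) == 1, xfs_da_log2_roundup(3) == 2, and
xfs_da_log2_roundup(513) == 10.  In other words, it returns ceil(log2(i)),
the number of bits needed to number i values from 0 through i - 1.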
+
+kmem_zone_t *xfs_da_state_zone;	/* anchor for state struct zone */
+kmem_zone_t *xfs_dabuf_zone;		/* dabuf zone */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+	return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+	int	i;
+
+	for (i = 0; i < state->altpath.active; i++) {
+		if (state->altpath.blk[i].bp) {
+			if (state->altpath.blk[i].bp != state->path.blk[i].bp)
+				xfs_da_buf_done(state->altpath.blk[i].bp);
+			state->altpath.blk[i].bp = NULL;
+		}
+	}
+	state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+	int	i;
+
+	xfs_da_state_kill_altpath(state);
+	for (i = 0; i < state->path.active; i++) {
+		if (state->path.blk[i].bp)
+			xfs_da_buf_done(state->path.blk[i].bp);
+	}
+	if (state->extravalid && state->extrablk.bp)
+		xfs_da_buf_done(state->extrablk.bp);
+#ifdef DEBUG
+	bzero((char *)state, sizeof(*state));
+#endif /* DEBUG */
+	kmem_zone_free(xfs_da_state_zone, state);
+}
+
+#ifdef XFS_DABUF_DEBUG
+xfs_dabuf_t	*xfs_dabuf_global_list;
+lock_t		xfs_dabuf_global_lock;
+#endif
+
+/*
+ * Create a dabuf.
+ */
+/* ARGSUSED */
+STATIC xfs_dabuf_t *
+xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
+{
+	xfs_buf_t	*bp;
+	xfs_dabuf_t	*dabuf;
+	int		i;
+	int		off;
+
+	if (nbuf == 1)
+		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
+	else
+		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
+	dabuf->dirty = 0;
+#ifdef XFS_DABUF_DEBUG
+	dabuf->ra = ra;
+	dabuf->dev = XFS_BUF_TARGET_DEV(bps[0]);
+	dabuf->blkno = XFS_BUF_ADDR(bps[0]);
+#endif
+	if (nbuf == 1) {
+		dabuf->nbuf = 1;
+		bp = bps[0];
+		dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
+		dabuf->data = XFS_BUF_PTR(bp);
+		dabuf->bps[0] = bp;
+	} else {
+		dabuf->nbuf = nbuf;
+		for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) {
+			dabuf->bps[i] = bp = bps[i];
+			dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp));
+		}
+		dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
+		for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
+			bp = bps[i];
+			bcopy(XFS_BUF_PTR(bp), (char *)dabuf->data + off,
+				XFS_BUF_COUNT(bp));
+		}
+	}
+#ifdef XFS_DABUF_DEBUG
+	{
+		SPLDECL(s);
+		xfs_dabuf_t	*p;
+
+		s = mutex_spinlock(&xfs_dabuf_global_lock);
+		for (p = xfs_dabuf_global_list; p; p = p->next) {
+			ASSERT(p->blkno != dabuf->blkno ||
+			       p->dev != dabuf->dev);
+		}
+		dabuf->prev = NULL;
+		if (xfs_dabuf_global_list)
+			xfs_dabuf_global_list->prev = dabuf;
+		dabuf->next = xfs_dabuf_global_list;
+		xfs_dabuf_global_list = dabuf;
+		mutex_spinunlock(&xfs_dabuf_global_lock, s);
+	}
+#endif
+	return dabuf;
+}
+
+/*
+ * Un-dirty a dabuf.
+ */
+STATIC void
+xfs_da_buf_clean(xfs_dabuf_t *dabuf)
+{
+	xfs_buf_t	*bp;
+	int		i;
+	int		off;
+
+	if (dabuf->dirty) {
+		ASSERT(dabuf->nbuf > 1);
+		dabuf->dirty = 0;
+		for (i = off = 0; i < dabuf->nbuf;
+				i++, off += XFS_BUF_COUNT(bp)) {
+			bp = dabuf->bps[i];
+			bcopy((char *)dabuf->data + off, XFS_BUF_PTR(bp),
+				XFS_BUF_COUNT(bp));
+		}
+	}
+}
+
+/*
+ * Release a dabuf.
+ */
+void
+xfs_da_buf_done(xfs_dabuf_t *dabuf)
+{
+	ASSERT(dabuf);
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	if (dabuf->dirty)
+		xfs_da_buf_clean(dabuf);
+	if (dabuf->nbuf > 1)
+		kmem_free(dabuf->data, BBTOB(dabuf->bbcount));
+#ifdef XFS_DABUF_DEBUG
+	{
+		SPLDECL(s);
+
+		s = mutex_spinlock(&xfs_dabuf_global_lock);
+		if (dabuf->prev)
+			dabuf->prev->next = dabuf->next;
+		else
+			xfs_dabuf_global_list = dabuf->next;
+		if (dabuf->next)
+			dabuf->next->prev = dabuf->prev;
+		mutex_spinunlock(&xfs_dabuf_global_lock, s);
+	}
+	bzero(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf));
+#endif
+	if (dabuf->nbuf == 1)
+		kmem_zone_free(xfs_dabuf_zone, dabuf);
+	else
+		kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf));
+}
+
+/*
+ * Log transaction from a dabuf.
+ */
+void
+xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
+{
+	xfs_buf_t	*bp;
+	uint		f;
+	int		i;
+	uint		l;
+	int		off;
+
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	if (dabuf->nbuf == 1) {
+		ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0]));
+		xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
+		return;
+	}
+	dabuf->dirty = 1;
+	ASSERT(first <= last);
+	for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) {
+		bp = dabuf->bps[i];
+		f = off;
+		l = f + XFS_BUF_COUNT(bp) - 1;
+		if (f < first)
+			f = first;
+		if (l > last)
+			l = last;
+		if (f <= l)
+			xfs_trans_log_buf(tp, bp, f - off, l - off);
+		/*
+		 * B_DONE is set by xfs_trans_log_buf.
+		 * If we don't set it on a new buffer (get, not read),
+		 * then if we never put anything in the buffer it won't
+		 * be set, and at commit it is released into the cache,
+		 * after which a read will fail.
+		 */
+		else if (!(XFS_BUF_ISDONE(bp)))
+			XFS_BUF_DONE(bp);
+	}
+	ASSERT(last < off);
+}
+
+/*
+ * Release dabuf from a transaction.
+ * Have to free up the dabuf before the buffers are released,
+ * since the synchronization on the dabuf is really the lock on the buffer.
+ */
+void
+xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
+{
+	xfs_buf_t	*bp;
+	xfs_buf_t	**bplist;
+	int		i;
+	int		nbuf;
+
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	if ((nbuf = dabuf->nbuf) == 1) {
+		bplist = &bp;
+		bp = dabuf->bps[0];
+	} else {
+		bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
+		bcopy(dabuf->bps, bplist, nbuf * sizeof(*bplist));
+	}
+	xfs_da_buf_done(dabuf);
+	for (i = 0; i < nbuf; i++)
+		xfs_trans_brelse(tp, bplist[i]);
+	if (bplist != &bp)
+		kmem_free(bplist, nbuf * sizeof(*bplist));
+}
+
+/*
+ * Invalidate dabuf from a transaction.
+ */
+void
+xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
+{
+	xfs_buf_t	*bp;
+	xfs_buf_t	**bplist;
+	int		i;
+	int		nbuf;
+
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	if ((nbuf = dabuf->nbuf) == 1) {
+		bplist = &bp;
+		bp = dabuf->bps[0];
+	} else {
+		bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
+		bcopy(dabuf->bps, bplist, nbuf * sizeof(*bplist));
+	}
+	xfs_da_buf_done(dabuf);
+	for (i = 0; i < nbuf; i++)
+		xfs_trans_binval(tp, bplist[i]);
+	if (bplist != &bp)
+		kmem_free(bplist, nbuf * sizeof(*bplist));
+}
+
+/*
+ * Get the first daddr from a dabuf.
+ */
+xfs_daddr_t
+xfs_da_blkno(xfs_dabuf_t *dabuf)
+{
+	ASSERT(dabuf->nbuf);
+	ASSERT(dabuf->data);
+	return XFS_BUF_ADDR(dabuf->bps[0]);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_da_btree.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_da_btree.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_da_btree.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_da_btree.h	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DA_BTREE_H__
+#define __XFS_DA_BTREE_H__
+
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct zone;
+
+/*========================================================================
+ * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
+ *========================================================================*/
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * It is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC	0xfebe	/* magic number: non-leaf blocks */
+#define XFS_DIR_LEAF_MAGIC	0xfeeb	/* magic number: directory leaf blks */
+#define XFS_ATTR_LEAF_MAGIC	0xfbee	/* magic number: attribute leaf blks */
+#define XFS_DIR2_LEAF1_MAGIC	0xd2f1	/* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC	0xd2ff	/* magic number: v2 dirlf multi blks */
+
+#define XFS_DIRX_LEAF_MAGIC(mp) \
+	(XFS_DIR_IS_V1(mp) ? XFS_DIR_LEAF_MAGIC : XFS_DIR2_LEAFN_MAGIC)
+
+typedef struct xfs_da_blkinfo {
+	xfs_dablk_t forw;			/* following block in list */
+	xfs_dablk_t back;			/* previous block in list */
+	__uint16_t magic;			/* validity check on block */
+	__uint16_t pad;				/* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all matches in the block, not just the first match found.
+ */
+#define XFS_DA_NODE_MAXDEPTH	5	/* max depth of Btree */
+
+typedef struct xfs_da_intnode {
+	struct xfs_da_node_hdr {	/* constant-structure header block */
+		xfs_da_blkinfo_t info;	/* block type, links, etc. */
+		__uint16_t count;	/* count of active entries */
+		__uint16_t level;	/* level above leaves (leaf == 0) */
+	} hdr;
+	struct xfs_da_node_entry {
+		xfs_dahash_t hashval;	/* hash value for this descendant */
+		xfs_dablk_t before;	/* Btree block before this key */
+	} btree[1];			/* variable sized array of keys */
+} xfs_da_intnode_t;
+typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
+typedef struct xfs_da_node_entry xfs_da_node_entry_t;
+
+#define XFS_DA_NODE_ENTSIZE_BYNAME	/* space a name uses */ \
+	(sizeof(xfs_da_node_entry_t))
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_NODE_ENTRIES)
+int xfs_da_node_entries(struct xfs_mount *mp);
+#define XFS_DA_NODE_ENTRIES(mp)		xfs_da_node_entries(mp)
+#else
+#define XFS_DA_NODE_ENTRIES(mp)		((mp)->m_da_node_ents)
+#endif
+
+#define XFS_DA_MAXHASH	((xfs_dahash_t)-1) /* largest valid hash value */
+
+/*
+ * Macros used by directory code to interface to the filesystem.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBSIZE)
+int xfs_lbsize(struct xfs_mount *mp);
+#define XFS_LBSIZE(mp)			xfs_lbsize(mp)
+#else
+#define XFS_LBSIZE(mp)	((mp)->m_sb.sb_blocksize)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBLOG)
+int xfs_lblog(struct xfs_mount *mp);
+#define XFS_LBLOG(mp)			xfs_lblog(mp)
+#else
+#define XFS_LBLOG(mp)	((mp)->m_sb.sb_blocklog)
+#endif
+
+/*
+ * Macros used by directory code to interface to the kernel
+ */
+
+/*
+ * Macros used to manipulate directory off_t's
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_BNOENTRY)
+__uint32_t xfs_da_make_bnoentry(struct xfs_mount *mp, xfs_dablk_t bno,
+				int entry);
+#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry)	\
+	xfs_da_make_bnoentry(mp,bno,entry)
+#else
+#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
+	(((bno) << (mp)->m_dircook_elog) | (entry))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_COOKIE)
+xfs_off_t xfs_da_make_cookie(struct xfs_mount *mp, xfs_dablk_t bno, int entry,
+				xfs_dahash_t hash);
+#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash)	\
+	xfs_da_make_cookie(mp,bno,entry,hash)
+#else
+#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
+	(((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_HASH)
+xfs_dahash_t xfs_da_cookie_hash(struct xfs_mount *mp, xfs_off_t cookie);
+#define XFS_DA_COOKIE_HASH(mp,cookie)		xfs_da_cookie_hash(mp,cookie)
+#else
+#define XFS_DA_COOKIE_HASH(mp,cookie)	((xfs_dahash_t)(cookie))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_BNO)
+xfs_dablk_t xfs_da_cookie_bno(struct xfs_mount *mp, xfs_off_t cookie);
+#define XFS_DA_COOKIE_BNO(mp,cookie)		xfs_da_cookie_bno(mp,cookie)
+#else
+#define XFS_DA_COOKIE_BNO(mp,cookie) \
+	(((xfs_off_t)(cookie) >> 31) == -1LL ? \
+		(xfs_dablk_t)0 : \
+		(xfs_dablk_t)((xfs_off_t)(cookie) >> ((mp)->m_dircook_elog + 32)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_ENTRY)
+int xfs_da_cookie_entry(struct xfs_mount *mp, xfs_off_t cookie);
+#define XFS_DA_COOKIE_ENTRY(mp,cookie)		xfs_da_cookie_entry(mp,cookie)
+#else
+#define XFS_DA_COOKIE_ENTRY(mp,cookie) \
+	(((xfs_off_t)(cookie) >> 31) == -1LL ? \
+		(xfs_dablk_t)0 : \
+		(xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
+			      ((1 << (mp)->m_dircook_elog) - 1)))
+#endif
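
To make the cookie layout concrete, here is a standalone sketch of the
packing these macros perform (hypothetical userspace test code, not part
of this patch; DIRCOOK_ELOG stands in for the mount's m_dircook_elog
field, and the end-of-directory special case that XFS_DA_COOKIE_BNO and
XFS_DA_COOKIE_ENTRY detect via the (cookie >> 31) == -1LL test is ignored):

	#include <assert.h>
	#include <stdint.h>

	#define DIRCOOK_ELOG	5	/* assumed stand-in for m_dircook_elog */

	/* XFS_DA_MAKE_BNOENTRY: block number above the entry index. */
	static uint32_t make_bnoentry(uint32_t bno, int entry)
	{
		return (bno << DIRCOOK_ELOG) | entry;
	}

	/* XFS_DA_MAKE_COOKIE: bno/entry in the high 32 bits, hash in the low. */
	static uint64_t make_cookie(uint32_t bno, int entry, uint32_t hash)
	{
		return ((uint64_t)make_bnoentry(bno, entry) << 32) | hash;
	}

	int main(void)
	{
		uint64_t cookie = make_cookie(7, 3, 0xdeadbeef);

		/* XFS_DA_COOKIE_HASH: the low 32 bits. */
		assert((uint32_t)cookie == 0xdeadbeef);
		/* XFS_DA_COOKIE_BNO: shift past the hash word and entry bits. */
		assert((uint32_t)(cookie >> (DIRCOOK_ELOG + 32)) == 7);
		/* XFS_DA_COOKIE_ENTRY: low DIRCOOK_ELOG bits of the high word. */
		assert((int)((cookie >> 32) & ((1 << DIRCOOK_ELOG) - 1)) == 3);
		return 0;
	}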
+
+
+/*========================================================================
+ * Btree searching and modification structure definitions.
+ *========================================================================*/
+
+/*
+ * Structure to ease passing around component names.
+ */
+typedef struct xfs_da_args {
+	uchar_t		*name;		/* string (may not be NUL-terminated) */
+	int		namelen;	/* length of string (may lack a NUL) */
+	uchar_t		*value;		/* set of bytes (may contain NULs) */
+	int		valuelen;	/* length of value */
+	int		flags;		/* argument flags (eg: ATTR_NOCREATE) */
+	xfs_dahash_t	hashval;	/* hash value of name */
+	xfs_ino_t	inumber;	/* input/output inode number */
+	struct xfs_inode *dp;		/* directory inode to manipulate */
+	xfs_fsblock_t	*firstblock;	/* ptr to firstblock for bmap calls */
+	struct xfs_bmap_free *flist;	/* ptr to freelist for bmap_finish */
+	struct xfs_trans *trans;	/* current trans (changes over time) */
+	xfs_extlen_t	total;		/* total blocks needed, for 1st bmap */
+	int		whichfork;	/* data or attribute fork */
+	xfs_dablk_t	blkno;		/* blkno of attr leaf of interest */
+	int		index;		/* index of attr of interest in blk */
+	xfs_dablk_t	rmtblkno;	/* remote attr value starting blkno */
+	int		rmtblkcnt;	/* remote attr value block count */
+	int		rename;		/* T/F: this is an atomic rename op */
+	xfs_dablk_t	blkno2;		/* blkno of 2nd attr leaf of interest */
+	int		index2;		/* index of 2nd attr in blk */
+	xfs_dablk_t	rmtblkno2;	/* remote attr value starting blkno */
+	int		rmtblkcnt2;	/* remote attr value block count */
+	int		justcheck;	/* check for ok with no space */
+	int		addname;	/* T/F: this is an add operation */
+	int		oknoent;	/* T/F: ok to return ENOENT, else die */
+} xfs_da_args_t;
+
+/*
+ * Structure to describe buffer(s) for a block.
+ * This is needed in the directory version 2 format case, when
+ * multiple non-contiguous fsblocks might be needed to cover one
+ * logical directory block.
+ * If the buffer count is 1 then the data pointer points to the
+ * same place as the b_addr field for the buffer, else to kmem_alloced memory.
+ */
+typedef struct xfs_dabuf {
+	int		nbuf;		/* number of buffer pointers present */
+	short		dirty;		/* data needs to be copied back */
+	short		bbcount;	/* how large is data in bbs */
+	void		*data;		/* pointer for buffers' data */
+#ifdef XFS_DABUF_DEBUG
+	inst_t		*ra;		/* return address of caller to make */
+	struct xfs_dabuf *next;		/* next in global chain */
+	struct xfs_dabuf *prev;		/* previous in global chain */
+	dev_t		dev;		/* device for buffer */
+	xfs_daddr_t	blkno;		/* daddr first in bps[0] */
+#endif
+	struct xfs_buf	*bps[1];	/* actually nbuf of these */
+} xfs_dabuf_t;
+#define XFS_DA_BUF_SIZE(n)	\
+	(sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1))
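
XFS_DA_BUF_SIZE(n) accounts for the one xfs_buf pointer already declared
inside the struct: for n buffers it allocates sizeof(xfs_dabuf_t) plus
n - 1 extra pointers, the classic pre-C99 trailing-array idiom that
xfs_da_buf_make() in xfs_da_btree.c relies on.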
+
+#ifdef XFS_DABUF_DEBUG
+extern xfs_dabuf_t	*xfs_dabuf_global_list;
+#endif
+
+/*
+ * Storage for holding state during Btree searches and split/join ops.
+ *
+ * Only need space for 5 intermediate nodes.  With a minimum of 62-way
+ * fanout to the Btree, we can support over 900 million directory blocks,
+ * which is slightly more than enough.
+ */
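
The arithmetic behind that claim, assuming 4-byte xfs_dablk_t and
xfs_dahash_t: a 512-byte minimum filesystem block holds the 16-byte node
header (the 12-byte xfs_da_blkinfo_t plus count and level) and
(512 - 16) / 8 = 62 of the 8-byte btree entries, and five levels of
62-way fanout address 62^5 = 916,132,832 leaf blocks.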
+typedef struct xfs_da_state_blk {
+	xfs_dabuf_t	*bp;		/* buffer containing block */
+	xfs_dablk_t	blkno;		/* filesystem blkno of buffer */
+	xfs_daddr_t	disk_blkno;	/* on-disk blkno (in BBs) of buffer */
+	int		index;		/* relevant index into block */
+	xfs_dahash_t	hashval;	/* last hash value in block */
+	int		magic;		/* blk's magic number, ie: blk type */
+} xfs_da_state_blk_t;
+
+typedef struct xfs_da_state_path {
+	int			active;		/* number of active levels */
+	xfs_da_state_blk_t	blk[XFS_DA_NODE_MAXDEPTH];
+} xfs_da_state_path_t;
+
+typedef struct xfs_da_state {
+	xfs_da_args_t		*args;		/* filename arguments */
+	struct xfs_mount	*mp;		/* filesystem mount point */
+	int			blocksize;	/* logical block size */
+	int			inleaf;		/* insert into 1->leaf, 0->split-leaf */
+	xfs_da_state_path_t	path;		/* search/split paths */
+	xfs_da_state_path_t	altpath;	/* alternate path for join */
+	int			extravalid;	/* T/F: extrablk is in use */
+	int			extraafter;	/* T/F: extrablk is after new */
+	xfs_da_state_blk_t	extrablk;	/* for double-splits on leafs */
+						/* for dirv2 extrablk is data */
+} xfs_da_state_t;
+
+/*
+ * Utility macros to aid in logging changed structure fields.
+ */
+#define XFS_DA_LOGOFF(BASE, ADDR)	((char *)(ADDR) - (char *)(BASE))
+#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE)	\
+		(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
+		(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
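
The pair expands to the two comma-separated (first, last) byte offsets
that xfs_da_log_buf() takes.  For example,
XFS_DA_LOGRANGE(sib_info, &sib_info->forw, sizeof(sib_info->forw)),
with forw the first 4-byte field of xfs_da_blkinfo_t, expands to 0, 3;
this is exactly the form used to log the sibling pointer updates in
xfs_da_swap_lastblock() earlier in the patch.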
+
+
+#ifdef __KERNEL__
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+int	xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
+					 xfs_dabuf_t **bpp, int whichfork);
+int	xfs_da_split(xfs_da_state_t *state);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int	xfs_da_join(xfs_da_state_t *state);
+void	xfs_da_fixhashpath(xfs_da_state_t *state,
+					  xfs_da_state_path_t *path_to_fix);
+
+/*
+ * Routines used for finding things in the Btree.
+ */
+int	xfs_da_node_lookup_int(xfs_da_state_t *state, int *result);
+int	xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+					 int forward, int release, int *result);
+/*
+ * Routines used for linking and unlinking blocks.
+ */
+int	xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+					 xfs_da_state_blk_t *save_blk);
+int	xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+				       xfs_da_state_blk_t *new_blk);
+
+/*
+ * Utility routines.
+ */
+int	xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
+int	xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			      xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			      xfs_dabuf_t **bp, int whichfork);
+int	xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			       xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			       xfs_dabuf_t **bpp, int whichfork);
+xfs_daddr_t	xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			xfs_dablk_t bno, int whichfork);
+int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+					  xfs_dabuf_t *dead_buf);
+
+xfs_dahash_t xfs_da_hashname(uchar_t *name_string, int name_length);
+uint xfs_da_log2_roundup(uint i);
+xfs_da_state_t *xfs_da_state_alloc(void);
+void xfs_da_state_free(xfs_da_state_t *state);
+void xfs_da_state_kill_altpath(xfs_da_state_t *state);
+
+void xfs_da_buf_done(xfs_dabuf_t *dabuf);
+void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first,
+			   uint last);
+void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
+void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
+xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
+
+extern struct kmem_zone *xfs_da_state_zone;
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_DA_BTREE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dfrag.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dfrag.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dfrag.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dfrag.c	2002-10-25 16:51:21.000000000 -0400
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_dfrag.h>
+
+/*
+ * Syssgi interface for swapext
+ */
+int
+xfs_swapext(
+	xfs_swapext_t	*sxp)
+{
+	xfs_swapext_t	sx;
+	xfs_inode_t	*ip=NULL, *tip=NULL, *ips[2];
+	xfs_trans_t	*tp;
+	xfs_mount_t	*mp;
+	xfs_bstat_t	*sbp;
+	struct file	*fp = NULL, *tfp = NULL;
+	vnode_t		*vp, *tvp;
+	bhv_desc_t	*bdp, *tbdp;
+	vn_bhv_head_t	*bhp, *tbhp;
+	uint		lock_flags=0;
+	int		ilf_fields, tilf_fields;
+	int		error = 0;
+	xfs_ifork_t	tempif, *ifp, *tifp;
+	__uint64_t	tmp;
+	int		aforkblks = 0;
+	int		taforkblks = 0;
+	int		locked = 0;
+
+	if (copy_from_user(&sx, sxp, sizeof(sx)))
+		return XFS_ERROR(EFAULT);
+
+	/* Pull information for the target fd */
+	if (((fp = fget((int)sx.sx_fdtarget)) == NULL) ||
+	    ((vp = LINVFS_GET_VP(fp->f_dentry->d_inode)) == NULL))  {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	bhp = VN_BHV_HEAD(vp);
+	VN_BHV_READ_LOCK(bhp);
+	bdp = vn_bhv_lookup(bhp, &xfs_vnodeops);
+	if (bdp == NULL) {
+		VN_BHV_READ_UNLOCK(bhp);
+		error = XFS_ERROR(EBADF);
+		goto error0;
+	} else {
+		ip = XFS_BHVTOI(bdp);
+		VN_BHV_READ_UNLOCK(bhp);
+	}
+
+	if (((tfp = fget((int)sx.sx_fdtmp)) == NULL) ||
+	    ((tvp = LINVFS_GET_VP(tfp->f_dentry->d_inode)) == NULL)) {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	tbhp = VN_BHV_HEAD(tvp);
+	VN_BHV_READ_LOCK(tbhp);
+	tbdp = vn_bhv_lookup(tbhp, &xfs_vnodeops);
+	if (tbdp == NULL) {
+		VN_BHV_READ_UNLOCK(tbhp);
+		error = XFS_ERROR(EBADF);
+		goto error0;
+	} else {
+		tip = XFS_BHVTOI(tbdp);
+		VN_BHV_READ_UNLOCK(tbhp);
+	}
+
+	if (ip->i_ino == tip->i_ino) {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	mp = ip->i_mount;
+
+	sbp = &sx.sx_stat;
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		error = XFS_ERROR(EIO);
+		goto error0;
+	}
+
+	locked = 1;
+
+	/* Lock in i_ino order */
+	if (ip->i_ino < tip->i_ino) {
+		ips[0] = ip;
+		ips[1] = tip;
+	} else {
+		ips[0] = tip;
+		ips[1] = ip;
+	}
+	lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
+	xfs_lock_inodes(ips, 2, 0, lock_flags);
+
+	/* Check permissions */
+	if ((error = _MAC_XFS_IACCESS(ip, MACWRITE, NULL))) {
+		goto error0;
+	}
+	if ((error = _MAC_XFS_IACCESS(tip, MACWRITE, NULL))) {
+		goto error0;
+	}
+	if ((current->fsuid != ip->i_d.di_uid) &&
+	    (error = xfs_iaccess(ip, IWRITE, NULL)) &&
+	    !capable_cred(NULL, CAP_FOWNER)) {
+		goto error0;
+	}
+	if ((current->fsuid != tip->i_d.di_uid) &&
+	    (error = xfs_iaccess(tip, IWRITE, NULL)) &&
+	    !capable_cred(NULL, CAP_FOWNER)) {
+		goto error0;
+	}
+
+	/* Verify both files are either real-time or non-realtime */
+	if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
+	    (tip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	/* Should never get a local format */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	if (VN_CACHED(tvp) != 0)
+		xfs_inval_cached_pages(XFS_ITOV(tip), &(tip->i_iocore),
+						(loff_t)0, 0, 0);
+
+	/* Verify O_DIRECT for ftmp */
+	if (VN_CACHED(tvp) != 0) {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	/* Verify all data are being swapped */
+	if (sx.sx_offset != 0 ||
+	    sx.sx_length != ip->i_d.di_size ||
+	    sx.sx_length != tip->i_d.di_size) {
+		error = XFS_ERROR(EFAULT);
+		goto error0;
+	}
+
+	/*
+	 * If the target has extended attributes, the tmp file
+	 * must have them too, in order to ensure the correct
+	 * data fork format.
+	 */
+	if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	/*
+	 * Compare the current change & modify times with those
+	 * passed in.  If they differ, we abort this swap.
+	 * This is the mechanism used to assure the calling
+	 * process that the file was not changed out from
+	 * under it.
+	 */
+	if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) ||
+	    (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) ||
+	    (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
+	    (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
+		error = XFS_ERROR(EBUSY);
+		goto error0;
+	}
+
+	/* We need to fail if the file is memory mapped.  Once we have tossed
+	 * all existing pages, the page fault will have no option
+	 * but to go to the filesystem for pages.  Since the page fault calls
+	 * VOP_READ (or VOP_WRITE in the autogrow case), it will block on
+	 * the iolock until we have switched the extents.
+	 */
+	if (VN_MAPPED(vp)) {
+		error = XFS_ERROR(EBUSY);
+		goto error0;
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(tip, XFS_ILOCK_EXCL);
+
+	/*
+	 * There is a race condition here since we gave up the
+	 * ilock.  However, the data fork will not change since
+	 * we have the iolock (locked for truncation too) so we
+	 * are safe.  We don't really care if non-io related
+	 * fields change.
+	 */
+
+	VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+	if ((error = xfs_trans_reserve(tp, 0,
+				     XFS_ICHANGE_LOG_RES(mp), 0,
+				     0, 0))) {
+		xfs_iunlock(ip,	 XFS_IOLOCK_EXCL);
+		xfs_iunlock(tip, XFS_IOLOCK_EXCL);
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
+
+	/*
+	 * Count the number of extended attribute blocks
+	 */
+	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
+	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
+		if (error) {
+			xfs_iunlock(ip,	 lock_flags);
+			xfs_iunlock(tip, lock_flags);
+			xfs_trans_cancel(tp, 0);
+			return error;
+		}
+	}
+	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
+	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
+			&taforkblks);
+		if (error) {
+			xfs_iunlock(ip,	 lock_flags);
+			xfs_iunlock(tip, lock_flags);
+			xfs_trans_cancel(tp, 0);
+			return error;
+		}
+	}
+
+	/*
+	 * Swap the data forks of the inodes
+	 */
+	ifp = &ip->i_df;
+	tifp = &tip->i_df;
+	tempif = *ifp;	/* struct copy */
+	*ifp = *tifp;	/* struct copy */
+	*tifp = tempif; /* struct copy */
+
+	/*
+	 * Fix the on-disk inode values
+	 */
+	tmp = (__uint64_t)ip->i_d.di_nblocks;
+	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
+	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
+
+	tmp = (__uint64_t) ip->i_d.di_nextents;
+	ip->i_d.di_nextents = tip->i_d.di_nextents;
+	tip->i_d.di_nextents = tmp;
+
+	tmp = (__uint64_t) ip->i_d.di_format;
+	ip->i_d.di_format = tip->i_d.di_format;
+	tip->i_d.di_format = tmp;
+
+	ilf_fields = XFS_ILOG_CORE;
+
+	switch(ip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		/* If the extents fit in the inode, fix the
+		 * pointer.  Otherwise it's already NULL or
+		 * pointing to the extent.
+		 */
+		if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+			ifp->if_u1.if_extents =
+				ifp->if_u2.if_inline_ext;
+		}
+		ilf_fields |= XFS_ILOG_DEXT;
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		ilf_fields |= XFS_ILOG_DBROOT;
+		break;
+	}
+
+	tilf_fields = XFS_ILOG_CORE;
+
+	switch(tip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		/* If the extents fit in the inode, fix the
+		 * pointer.  Otherwise it's already NULL or
+		 * pointing to the extent.
+		 */
+		if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+			tifp->if_u1.if_extents =
+				tifp->if_u2.if_inline_ext;
+		}
+		tilf_fields |= XFS_ILOG_DEXT;
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		tilf_fields |= XFS_ILOG_DBROOT;
+		break;
+	}
+
+	/*
+	 * Increment vnode ref counts since xfs_trans_commit &
+	 * xfs_trans_cancel will both unlock the inodes and
+	 * decrement the associated ref counts.
+	 */
+	VN_HOLD(vp);
+	VN_HOLD(tvp);
+
+	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ijoin(tp, tip, lock_flags);
+
+	xfs_trans_log_inode(tp, ip,  ilf_fields);
+	xfs_trans_log_inode(tp, tip, tilf_fields);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT, NULL);
+
+	fput(fp);
+	fput(tfp);
+
+	return error;
+
+ error0:
+	if (locked) {
+		xfs_iunlock(ip,	 lock_flags);
+		xfs_iunlock(tip, lock_flags);
+	}
+
+	if (fp != NULL) fput(fp);
+	if (tfp != NULL) fput(tfp);
+
+	return error;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dfrag.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dfrag.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dfrag.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dfrag.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DFRAG_H__
+#define __XFS_DFRAG_H__
+
+/*
+ * Structure passed to xfs_swapext
+ */
+
+typedef struct xfs_swapext
+{
+	__int64_t	sx_version;	/* version */
+	__int64_t	sx_fdtarget;	/* fd of target file */
+	__int64_t	sx_fdtmp;	/* fd of tmp file */
+	xfs_off_t	sx_offset;	/* offset into file */
+	xfs_off_t	sx_length;	/* length from offset */
+	char		sx_pad[16];	/* pad space, unused */
+	xfs_bstat_t	sx_stat;	/* stat of target before copy */
+} xfs_swapext_t;
+
+/*
+ * Version flag
+ */
+#define XFS_SX_VERSION		0
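+
+/*
+ * Sketch of the intended use (the defragmenter, xfs_fsr, is the
+ * expected caller; XFS_IOC_SWAPEXT comes from the XFS ioctl
+ * interface and is not defined in this header): copy the
+ * fragmented file's data into a freshly allocated temporary
+ * file, then ask the kernel to swap the two files' extent maps
+ * atomically, roughly:
+ *
+ *	xfs_swapext_t sx;
+ *
+ *	memset(&sx, 0, sizeof(sx));
+ *	sx.sx_version  = XFS_SX_VERSION;
+ *	sx.sx_fdtarget = target_fd;	(the fragmented file)
+ *	sx.sx_fdtmp    = tmp_fd;	(holds the defragmented copy)
+ *	sx.sx_offset   = 0;
+ *	sx.sx_length   = target_size;
+ *	sx.sx_stat     = target_bstat;	(taken before the copy, so
+ *					 the kernel can detect a
+ *					 concurrent modification)
+ *	error = ioctl(target_fd, XFS_IOC_SWAPEXT, &sx);
+ */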
+
+#ifdef __KERNEL__
+/*
+ * Prototypes for visible xfs_dfrag.c routines.
+ */
+
+/*
+ * Syscall interface for xfs_swapext
+ */
+int	xfs_swapext(struct xfs_swapext *sx);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_DFRAG_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dinode.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dinode.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dinode.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dinode.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DINODE_H__
+#define __XFS_DINODE_H__
+
+struct xfs_buf;
+struct xfs_mount;
+
+#define XFS_DINODE_VERSION_1	1
+#define XFS_DINODE_VERSION_2	2
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DINODE_GOOD_VERSION)
+int xfs_dinode_good_version(int v);
+#define XFS_DINODE_GOOD_VERSION(v)	xfs_dinode_good_version(v)
+#else
+#define XFS_DINODE_GOOD_VERSION(v)	(((v) == XFS_DINODE_VERSION_1) || \
+					 ((v) == XFS_DINODE_VERSION_2))
+#endif
+#define XFS_DINODE_MAGIC	0x494e	/* 'IN' */
+
+/*
+ * Disk inode structure.
+ * This is just the header; the inode is expanded to fill a variable size
+ * with the last field expanding.  It is split into the core and "other"
+ * because we only need the core part in the in-core inode.
+ */
+typedef struct xfs_timestamp {
+	__int32_t	t_sec;		/* timestamp seconds */
+	__int32_t	t_nsec;		/* timestamp nanoseconds */
+} xfs_timestamp_t;
+
+/*
+ * Note: Coordinate changes to this structure with the XFS_DI_* #defines
+ * below and the offsets table in xfs_ialloc_log_di().
+ */
+typedef struct xfs_dinode_core
+{
+	__uint16_t	di_magic;	/* inode magic # = XFS_DINODE_MAGIC */
+	__uint16_t	di_mode;	/* mode and type of file */
+	__int8_t	di_version;	/* inode version */
+	__int8_t	di_format;	/* format of di_c data */
+	__uint16_t	di_onlink;	/* old number of links to file */
+	__uint32_t	di_uid;		/* owner's user id */
+	__uint32_t	di_gid;		/* owner's group id */
+	__uint32_t	di_nlink;	/* number of links to file */
+	__uint16_t	di_projid;	/* owner's project id */
+	__uint8_t	di_pad[10];	/* unused, zeroed space */
+	xfs_timestamp_t di_atime;	/* time last accessed */
+	xfs_timestamp_t di_mtime;	/* time last modified */
+	xfs_timestamp_t di_ctime;	/* time created/inode modified */
+	xfs_fsize_t	di_size;	/* number of bytes in file */
+	xfs_drfsbno_t	di_nblocks;	/* # of direct & btree blocks used */
+	xfs_extlen_t	di_extsize;	/* basic/minimum extent size for file */
+	xfs_extnum_t	di_nextents;	/* number of extents in data fork */
+	xfs_aextnum_t	di_anextents;	/* number of extents in attribute fork*/
+	__uint8_t	di_forkoff;	/* attr fork offs, <<3 for 64b align */
+	__int8_t	di_aformat;	/* format of attr fork's data */
+	__uint32_t	di_dmevmask;	/* DMIG event mask */
+	__uint16_t	di_dmstate;	/* DMIG state info */
+	__uint16_t	di_flags;	/* random flags, XFS_DIFLAG_... */
+	__uint32_t	di_gen;		/* generation number */
+} xfs_dinode_core_t;
+
+typedef struct xfs_dinode
+{
+	xfs_dinode_core_t	di_core;
+	/*
+	 * In adding anything between the core and the union, be
+	 * sure to update the macros like XFS_LITINO below and
+	 * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
+	 */
+	xfs_agino_t		di_next_unlinked;/* agi unlinked list ptr */
+	union {
+		xfs_bmdr_block_t di_bmbt;	/* btree root block */
+		xfs_bmbt_rec_32_t di_bmx[1];	/* extent list */
+		xfs_dir_shortform_t di_dirsf;	/* shortform directory */
+		xfs_dir2_sf_t	di_dir2sf;	/* shortform directory v2 */
+		char		di_c[1];	/* local contents */
+		xfs_dev_t	di_dev;		/* device for IFCHR/IFBLK */
+		uuid_t		di_muuid;	/* mount point value */
+		char		di_symlink[1];	/* local symbolic link */
+	}		di_u;
+	union {
+		xfs_bmdr_block_t di_abmbt;	/* btree root block */
+		xfs_bmbt_rec_32_t di_abmx[1];	/* extent list */
+		xfs_attr_shortform_t di_attrsf; /* shortform attribute list */
+	}		di_a;
+} xfs_dinode_t;
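+
+/*
+ * Rough picture of the resulting on-disk layout (a sketch):
+ *
+ *	+------------------+  offset 0
+ *	| di_core          |
+ *	+------------------+
+ *	| di_next_unlinked |
+ *	+------------------+  <-- start of the "literal area"
+ *	| di_u (data fork) |
+ *	+------------------+  <-- di_forkoff << 3 into the literal area
+ *	| di_a (attr fork) |
+ *	+------------------+  <-- inode size (256 to 2048 bytes)
+ *
+ * When there is no attribute fork (di_forkoff == 0), di_u runs
+ * to the end of the inode and di_a is absent.
+ */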
+
+/*
+ * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
+ * Since the pathconf interface is signed, we use 2^31 - 1 instead.
+ * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
+ */
+#define XFS_MAXLINK		((1U << 31) - 1U)
+#define XFS_MAXLINK_1		65535U
+
+/*
+ * Bit names for logging disk inodes only
+ */
+#define XFS_DI_MAGIC		0x0000001
+#define XFS_DI_MODE		0x0000002
+#define XFS_DI_VERSION		0x0000004
+#define XFS_DI_FORMAT		0x0000008
+#define XFS_DI_ONLINK		0x0000010
+#define XFS_DI_UID		0x0000020
+#define XFS_DI_GID		0x0000040
+#define XFS_DI_NLINK		0x0000080
+#define XFS_DI_PROJID		0x0000100
+#define XFS_DI_PAD		0x0000200
+#define XFS_DI_ATIME		0x0000400
+#define XFS_DI_MTIME		0x0000800
+#define XFS_DI_CTIME		0x0001000
+#define XFS_DI_SIZE		0x0002000
+#define XFS_DI_NBLOCKS		0x0004000
+#define XFS_DI_EXTSIZE		0x0008000
+#define XFS_DI_NEXTENTS		0x0010000
+#define XFS_DI_NAEXTENTS	0x0020000
+#define XFS_DI_FORKOFF		0x0040000
+#define XFS_DI_AFORMAT		0x0080000
+#define XFS_DI_DMEVMASK		0x0100000
+#define XFS_DI_DMSTATE		0x0200000
+#define XFS_DI_FLAGS		0x0400000
+#define XFS_DI_GEN		0x0800000
+#define XFS_DI_NEXT_UNLINKED	0x1000000
+#define XFS_DI_U		0x2000000
+#define XFS_DI_A		0x4000000
+#define XFS_DI_NUM_BITS		27
+#define XFS_DI_ALL_BITS		((1 << XFS_DI_NUM_BITS) - 1)
+#define XFS_DI_CORE_BITS	(XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
+
+/*
+ * Values for di_format
+ */
+typedef enum xfs_dinode_fmt
+{
+	XFS_DINODE_FMT_DEV,		/* CHR, BLK: di_dev */
+	XFS_DINODE_FMT_LOCAL,		/* DIR, REG: di_c */
+					/* LNK: di_symlink */
+	XFS_DINODE_FMT_EXTENTS,		/* DIR, REG, LNK: di_bmx */
+	XFS_DINODE_FMT_BTREE,		/* DIR, REG, LNK: di_bmbt */
+	XFS_DINODE_FMT_UUID		/* MNT: di_uuid */
+} xfs_dinode_fmt_t;
+
+/*
+ * Inode minimum and maximum sizes.
+ */
+#define XFS_DINODE_MIN_LOG	8
+#define XFS_DINODE_MAX_LOG	11
+#define XFS_DINODE_MIN_SIZE	(1 << XFS_DINODE_MIN_LOG)
+#define XFS_DINODE_MAX_SIZE	(1 << XFS_DINODE_MAX_LOG)
+
+/*
+ * Inode size for given fs.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LITINO)
+int xfs_litino(struct xfs_mount *mp);
+#define XFS_LITINO(mp)		xfs_litino(mp)
+#else
+#define XFS_LITINO(mp)	((mp)->m_litino)
+#endif
+#define XFS_BROOT_SIZE_ADJ	\
+	(sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t))
+
+/*
+ * Fork identifiers.  Here so utilities can use them without including
+ * xfs_inode.h.
+ */
+#define XFS_DATA_FORK	0
+#define XFS_ATTR_FORK	1
+
+/*
+ * Inode data & attribute fork sizes, per inode.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_Q)
+int xfs_cfork_q_arch(xfs_dinode_core_t *dcp, xfs_arch_t arch);
+int xfs_cfork_q(xfs_dinode_core_t *dcp);
+#define XFS_CFORK_Q_ARCH(dcp,arch)	    xfs_cfork_q_arch(dcp,arch)
+#define XFS_CFORK_Q(dcp)		    xfs_cfork_q(dcp)
+#else
+#define XFS_CFORK_Q_ARCH(dcp,arch)	    (!INT_ISZERO((dcp)->di_forkoff, arch))
+#define XFS_CFORK_Q(dcp)		    ((dcp)->di_forkoff != 0)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_BOFF)
+int xfs_cfork_boff_arch(xfs_dinode_core_t *dcp, xfs_arch_t arch);
+int xfs_cfork_boff(xfs_dinode_core_t *dcp);
+#define XFS_CFORK_BOFF_ARCH(dcp,arch)	    xfs_cfork_boff_arch(dcp,arch)
+#define XFS_CFORK_BOFF(dcp)		    xfs_cfork_boff(dcp)
+#else
+#define XFS_CFORK_BOFF_ARCH(dcp,arch)	    ((int)(INT_GET((dcp)->di_forkoff, arch) << 3))
+#define XFS_CFORK_BOFF(dcp)		    ((int)((dcp)->di_forkoff << 3))
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_DSIZE)
+int xfs_cfork_dsize_arch(xfs_dinode_core_t *dcp, struct xfs_mount *mp, xfs_arch_t arch);
+int xfs_cfork_dsize(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
+#define XFS_CFORK_DSIZE_ARCH(dcp,mp,arch)   xfs_cfork_dsize_arch(dcp,mp,arch)
+#define XFS_CFORK_DSIZE(dcp,mp)		    xfs_cfork_dsize(dcp,mp)
+#else
+#define XFS_CFORK_DSIZE_ARCH(dcp,mp,arch) \
+	(XFS_CFORK_Q_ARCH(dcp, arch) ? XFS_CFORK_BOFF_ARCH(dcp, arch) : XFS_LITINO(mp))
+#define XFS_CFORK_DSIZE(dcp,mp) \
+	(XFS_CFORK_Q(dcp) ? XFS_CFORK_BOFF(dcp) : XFS_LITINO(mp))
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_ASIZE)
+int xfs_cfork_asize_arch(xfs_dinode_core_t *dcp, struct xfs_mount *mp, xfs_arch_t arch);
+int xfs_cfork_asize(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
+#define XFS_CFORK_ASIZE_ARCH(dcp,mp,arch)   xfs_cfork_asize_arch(dcp,mp,arch)
+#define XFS_CFORK_ASIZE(dcp,mp)		    xfs_cfork_asize(dcp,mp)
+#else
+#define XFS_CFORK_ASIZE_ARCH(dcp,mp,arch) \
+	(XFS_CFORK_Q_ARCH(dcp, arch) ? XFS_LITINO(mp) - XFS_CFORK_BOFF_ARCH(dcp, arch) : 0)
+#define XFS_CFORK_ASIZE(dcp,mp) \
+	(XFS_CFORK_Q(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF(dcp) : 0)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_SIZE)
+int xfs_cfork_size_arch(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w, xfs_arch_t arch);
+int xfs_cfork_size(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w);
+#define XFS_CFORK_SIZE_ARCH(dcp,mp,w,arch)  xfs_cfork_size_arch(dcp,mp,w,arch)
+#define XFS_CFORK_SIZE(dcp,mp,w)	    xfs_cfork_size(dcp,mp,w)
+#else
+#define XFS_CFORK_SIZE_ARCH(dcp,mp,w,arch) \
+	((w) == XFS_DATA_FORK ? \
+		XFS_CFORK_DSIZE_ARCH(dcp, mp, arch) : XFS_CFORK_ASIZE_ARCH(dcp, mp, arch))
+#define XFS_CFORK_SIZE(dcp,mp,w) \
+	((w) == XFS_DATA_FORK ? \
+		XFS_CFORK_DSIZE(dcp, mp) : XFS_CFORK_ASIZE(dcp, mp))
+
+#endif
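+
+/*
+ * Worked example of the fork sizing above (the numbers are for
+ * one common configuration, not a fixed rule): di_forkoff is
+ * kept in 8-byte units, so the attribute fork begins
+ * BOFF = di_forkoff << 3 bytes into the literal area.  With
+ * 256-byte inodes, the 96-byte core plus the 4-byte
+ * di_next_unlinked leave XFS_LITINO = 156 bytes.  Then for
+ * di_forkoff == 15:
+ *
+ *	BOFF  = 15 << 3   = 120 bytes
+ *	DSIZE = BOFF      = 120 bytes	(data fork)
+ *	ASIZE = 156 - 120 =  36 bytes	(attr fork)
+ *
+ * and with di_forkoff == 0 (no attr fork), DSIZE is the whole
+ * 156 bytes and ASIZE is 0.
+ */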
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DSIZE)
+int xfs_dfork_dsize_arch(xfs_dinode_t *dip, struct xfs_mount *mp, xfs_arch_t arch);
+int xfs_dfork_dsize(xfs_dinode_t *dip, struct xfs_mount *mp);
+#define XFS_DFORK_DSIZE_ARCH(dip,mp,arch)   xfs_dfork_dsize_arch(dip,mp,arch)
+#define XFS_DFORK_DSIZE(dip,mp)		    xfs_dfork_dsize(dip,mp)
+#else
+#define XFS_DFORK_DSIZE_ARCH(dip,mp,arch)   XFS_CFORK_DSIZE_ARCH(&(dip)->di_core, mp, arch)
+#define XFS_DFORK_DSIZE(dip,mp)		    XFS_DFORK_DSIZE_ARCH(dip,mp,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_ASIZE)
+int xfs_dfork_asize_arch(xfs_dinode_t *dip, struct xfs_mount *mp, xfs_arch_t arch);
+int xfs_dfork_asize(xfs_dinode_t *dip, struct xfs_mount *mp);
+#define XFS_DFORK_ASIZE_ARCH(dip,mp,arch)   xfs_dfork_asize_arch(dip,mp,arch)
+#define XFS_DFORK_ASIZE(dip,mp)		    xfs_dfork_asize(dip,mp)
+#else
+#define XFS_DFORK_ASIZE_ARCH(dip,mp,arch)   XFS_CFORK_ASIZE_ARCH(&(dip)->di_core, mp, arch)
+#define XFS_DFORK_ASIZE(dip,mp)		    XFS_DFORK_ASIZE_ARCH(dip,mp,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_SIZE)
+int xfs_dfork_size_arch(xfs_dinode_t *dip, struct xfs_mount *mp, int w, xfs_arch_t arch);
+int xfs_dfork_size(xfs_dinode_t *dip, struct xfs_mount *mp, int w);
+#define XFS_DFORK_SIZE_ARCH(dip,mp,w,arch)  xfs_dfork_size_arch(dip,mp,w,arch)
+#define XFS_DFORK_SIZE(dip,mp,w)	    xfs_dfork_size(dip,mp,w)
+#else
+#define XFS_DFORK_SIZE_ARCH(dip,mp,w,arch)  XFS_CFORK_SIZE_ARCH(&(dip)->di_core, mp, w, arch)
+#define XFS_DFORK_SIZE(dip,mp,w)	    XFS_DFORK_SIZE_ARCH(dip,mp,w,ARCH_NOCONVERT)
+
+#endif
+
+/*
+ * Macros for accessing per-fork disk inode information.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_Q)
+int xfs_dfork_q_arch(xfs_dinode_t *dip, xfs_arch_t arch);
+int xfs_dfork_q(xfs_dinode_t *dip);
+#define XFS_DFORK_Q_ARCH(dip,arch)	    xfs_dfork_q_arch(dip,arch)
+#define XFS_DFORK_Q(dip)		    xfs_dfork_q(dip)
+#else
+#define XFS_DFORK_Q_ARCH(dip,arch)	    XFS_CFORK_Q_ARCH(&(dip)->di_core, arch)
+#define XFS_DFORK_Q(dip)		    XFS_DFORK_Q_ARCH(dip,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_BOFF)
+int xfs_dfork_boff_arch(xfs_dinode_t *dip, xfs_arch_t arch);
+int xfs_dfork_boff(xfs_dinode_t *dip);
+#define XFS_DFORK_BOFF_ARCH(dip,arch)	    xfs_dfork_boff_arch(dip,arch)
+#define XFS_DFORK_BOFF(dip)		    xfs_dfork_boff(dip)
+#else
+#define XFS_DFORK_BOFF_ARCH(dip,arch)	    XFS_CFORK_BOFF_ARCH(&(dip)->di_core, arch)
+#define XFS_DFORK_BOFF(dip)		    XFS_DFORK_BOFF_ARCH(dip,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DPTR)
+char *xfs_dfork_dptr_arch(xfs_dinode_t *dip, xfs_arch_t arch);
+char *xfs_dfork_dptr(xfs_dinode_t *dip);
+#define XFS_DFORK_DPTR_ARCH(dip,arch)	    xfs_dfork_dptr_arch(dip,arch)
+#define XFS_DFORK_DPTR(dip)		    xfs_dfork_dptr(dip)
+#else
+#define XFS_DFORK_DPTR_ARCH(dip,arch)	    ((dip)->di_u.di_c)
+#define XFS_DFORK_DPTR(dip)		    XFS_DFORK_DPTR_ARCH(dip,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_APTR)
+char *xfs_dfork_aptr_arch(xfs_dinode_t *dip, xfs_arch_t arch);
+char *xfs_dfork_aptr(xfs_dinode_t *dip);
+#define XFS_DFORK_APTR_ARCH(dip,arch)	    xfs_dfork_aptr_arch(dip,arch)
+#define XFS_DFORK_APTR(dip)		    xfs_dfork_aptr(dip)
+#else
+#define XFS_DFORK_APTR_ARCH(dip,arch)	    ((dip)->di_u.di_c + XFS_DFORK_BOFF_ARCH(dip, arch))
+#define XFS_DFORK_APTR(dip)		    XFS_DFORK_APTR_ARCH(dip,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_PTR)
+char *xfs_dfork_ptr_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch);
+char *xfs_dfork_ptr(xfs_dinode_t *dip, int w);
+#define XFS_DFORK_PTR_ARCH(dip,w,arch)	    xfs_dfork_ptr_arch(dip,w,arch)
+#define XFS_DFORK_PTR(dip,w)		    xfs_dfork_ptr(dip,w)
+#else
+#define XFS_DFORK_PTR_ARCH(dip,w,arch)	\
+	((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR_ARCH(dip, arch) : XFS_DFORK_APTR_ARCH(dip, arch))
+#define XFS_DFORK_PTR(dip,w)		    XFS_DFORK_PTR_ARCH(dip,w,ARCH_NOCONVERT)
+
+#endif
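+
+/*
+ * Typical use of the fork accessors (a sketch): given a buffer
+ * holding a disk inode, locate a shortform directory wherever
+ * the data fork sits in the literal area:
+ *
+ *	xfs_dinode_t *dip = XFS_BUF_TO_DINODE(bp);
+ *	xfs_dir_shortform_t *sf;
+ *
+ *	if (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL)
+ *		sf = (xfs_dir_shortform_t *)XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ */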
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FORMAT)
+int xfs_cfork_format_arch(xfs_dinode_core_t *dcp, int w, xfs_arch_t arch);
+int xfs_cfork_format(xfs_dinode_core_t *dcp, int w);
+#define XFS_CFORK_FORMAT_ARCH(dcp,w,arch)   xfs_cfork_format_arch(dcp,w,arch)
+#define XFS_CFORK_FORMAT(dcp,w)		    xfs_cfork_format(dcp,w)
+#else
+#define XFS_CFORK_FORMAT_ARCH(dcp,w,arch) \
+	((w) == XFS_DATA_FORK ? INT_GET((dcp)->di_format, arch) : INT_GET((dcp)->di_aformat, arch))
+#define XFS_CFORK_FORMAT(dcp,w)		    XFS_CFORK_FORMAT_ARCH(dcp,w,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FMT_SET)
+void xfs_cfork_fmt_set_arch(xfs_dinode_core_t *dcp, int w, int n, xfs_arch_t arch);
+void xfs_cfork_fmt_set(xfs_dinode_core_t *dcp, int w, int n);
+#define XFS_CFORK_FMT_SET_ARCH(dcp,w,n,arch) xfs_cfork_fmt_set_arch(dcp,w,n,arch)
+#define XFS_CFORK_FMT_SET(dcp,w,n)	     xfs_cfork_fmt_set(dcp,w,n)
+#else
+#define XFS_CFORK_FMT_SET_ARCH(dcp,w,n,arch) \
+	((w) == XFS_DATA_FORK ? \
+		(INT_SET((dcp)->di_format, arch, (n))) : \
+		(INT_SET((dcp)->di_aformat, arch, (n))))
+#define XFS_CFORK_FMT_SET(dcp,w,n)	     XFS_CFORK_FMT_SET_ARCH(dcp,w,n,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXTENTS)
+int xfs_cfork_nextents_arch(xfs_dinode_core_t *dcp, int w, xfs_arch_t arch);
+int xfs_cfork_nextents(xfs_dinode_core_t *dcp, int w);
+#define XFS_CFORK_NEXTENTS_ARCH(dcp,w,arch)  xfs_cfork_nextents_arch(dcp,w,arch)
+#define XFS_CFORK_NEXTENTS(dcp,w)	     xfs_cfork_nextents(dcp,w)
+#else
+#define XFS_CFORK_NEXTENTS_ARCH(dcp,w,arch) \
+	((w) == XFS_DATA_FORK ? INT_GET((dcp)->di_nextents, arch) : INT_GET((dcp)->di_anextents, arch))
+#define XFS_CFORK_NEXTENTS(dcp,w)	     XFS_CFORK_NEXTENTS_ARCH(dcp,w,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXT_SET)
+void xfs_cfork_next_set_arch(xfs_dinode_core_t *dcp, int w, int n, xfs_arch_t arch);
+void xfs_cfork_next_set(xfs_dinode_core_t *dcp, int w, int n);
+#define XFS_CFORK_NEXT_SET_ARCH(dcp,w,n,arch)	xfs_cfork_next_set_arch(dcp,w,n,arch)
+#define XFS_CFORK_NEXT_SET(dcp,w,n)		xfs_cfork_next_set(dcp,w,n)
+#else
+#define XFS_CFORK_NEXT_SET_ARCH(dcp,w,n,arch) \
+	((w) == XFS_DATA_FORK ? \
+		(INT_SET((dcp)->di_nextents, arch, (n))) : \
+		(INT_SET((dcp)->di_anextents, arch, (n))))
+#define XFS_CFORK_NEXT_SET(dcp,w,n)		XFS_CFORK_NEXT_SET_ARCH(dcp,w,n,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_FORMAT)
+int xfs_dfork_format_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch);
+int xfs_dfork_format(xfs_dinode_t *dip, int w);
+#define XFS_DFORK_FORMAT_ARCH(dip,w,arch)   xfs_dfork_format_arch(dip,w,arch)
+#define XFS_DFORK_FORMAT(dip,w)		    xfs_dfork_format(dip,w)
+#else
+#define XFS_DFORK_FORMAT_ARCH(dip,w,arch)   XFS_CFORK_FORMAT_ARCH(&(dip)->di_core, w, arch)
+#define XFS_DFORK_FORMAT(dip,w)		    XFS_DFORK_FORMAT_ARCH(dip,w,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_FMT_SET)
+void xfs_dfork_fmt_set_arch(xfs_dinode_t *dip, int w, int n, xfs_arch_t arch);
+void xfs_dfork_fmt_set(xfs_dinode_t *dip, int w, int n);
+#define XFS_DFORK_FMT_SET_ARCH(dip,w,n,arch)	xfs_dfork_fmt_set_arch(dip,w,n,arch)
+#define XFS_DFORK_FMT_SET(dip,w,n)		xfs_dfork_fmt_set(dip,w,n)
+#else
+#define XFS_DFORK_FMT_SET_ARCH(dip,w,n,arch)	XFS_CFORK_FMT_SET_ARCH(&(dip)->di_core, w, n, arch)
+#define XFS_DFORK_FMT_SET(dip,w,n)		XFS_DFORK_FMT_SET_ARCH(dip,w,n,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_NEXTENTS)
+int xfs_dfork_nextents_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch);
+int xfs_dfork_nextents(xfs_dinode_t *dip, int w);
+#define XFS_DFORK_NEXTENTS_ARCH(dip,w,arch) xfs_dfork_nextents_arch(dip,w,arch)
+#define XFS_DFORK_NEXTENTS(dip,w)	    xfs_dfork_nextents(dip,w)
+#else
+#define XFS_DFORK_NEXTENTS_ARCH(dip,w,arch) XFS_CFORK_NEXTENTS_ARCH(&(dip)->di_core, w, arch)
+#define XFS_DFORK_NEXTENTS(dip,w)	    XFS_DFORK_NEXTENTS_ARCH(dip,w,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_NEXT_SET)
+void xfs_dfork_next_set_arch(xfs_dinode_t *dip, int w, int n, xfs_arch_t arch);
+void xfs_dfork_next_set(xfs_dinode_t *dip, int w, int n);
+#define XFS_DFORK_NEXT_SET_ARCH(dip,w,n,arch)	xfs_dfork_next_set_arch(dip,w,n,arch)
+#define XFS_DFORK_NEXT_SET(dip,w,n)		xfs_dfork_next_set(dip,w,n)
+#else
+#define XFS_DFORK_NEXT_SET_ARCH(dip,w,n,arch)	XFS_CFORK_NEXT_SET_ARCH(&(dip)->di_core, w, n, arch)
+#define XFS_DFORK_NEXT_SET(dip,w,n)		XFS_DFORK_NEXT_SET_ARCH(dip,w,n,ARCH_NOCONVERT)
+
+#endif
+
+/*
+ * File types (mode field)
+ */
+#define IFMT		0170000		/* type of file */
+#define IFIFO		0010000		/* named pipe (fifo) */
+#define IFCHR		0020000		/* character special */
+#define IFDIR		0040000		/* directory */
+#define IFBLK		0060000		/* block special */
+#define IFREG		0100000		/* regular */
+#define IFLNK		0120000		/* symbolic link */
+#define IFSOCK		0140000		/* socket */
+#define IFMNT		0160000		/* mount point */
+
+/*
+ * File execution and access modes.
+ */
+#define ISUID		04000		/* set user id on execution */
+#define ISGID		02000		/* set group id on execution */
+#define ISVTX		01000		/* sticky directory */
+#define IREAD		0400		/* read, write, execute permissions */
+#define IWRITE		0200
+#define IEXEC		0100
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_DINODE)
+xfs_dinode_t *xfs_buf_to_dinode(struct xfs_buf *bp);
+#define XFS_BUF_TO_DINODE(bp)	xfs_buf_to_dinode(bp)
+#else
+#define XFS_BUF_TO_DINODE(bp)	((xfs_dinode_t *)(XFS_BUF_PTR(bp)))
+#endif
+
+/*
+ * Values for di_flags
+ * There should be a one-to-one correspondence between these flags and the
+ * XFS_XFLAG_s.
+ */
+#define XFS_DIFLAG_REALTIME_BIT 0	/* file's blocks come from rt area */
+#define XFS_DIFLAG_PREALLOC_BIT 1	/* file space has been preallocated */
+#define XFS_DIFLAG_NEWRTBM_BIT	2	/* for rtbitmap inode, new format */
+#define XFS_DIFLAG_REALTIME	(1 << XFS_DIFLAG_REALTIME_BIT)
+#define XFS_DIFLAG_PREALLOC	(1 << XFS_DIFLAG_PREALLOC_BIT)
+#define XFS_DIFLAG_NEWRTBM	(1 << XFS_DIFLAG_NEWRTBM_BIT)
+#define XFS_DIFLAG_ALL	\
+	(XFS_DIFLAG_REALTIME|XFS_DIFLAG_PREALLOC|XFS_DIFLAG_NEWRTBM)
+
+#endif	/* __XFS_DINODE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1183 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+/*
+ * xfs_dir.c
+ *
+ * Provide the external interfaces to manage directories.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Functions for the dirops interfaces.
+ */
+static void	xfs_dir_mount(struct xfs_mount *mp);
+
+static int	xfs_dir_isempty(struct xfs_inode *dp);
+
+static int	xfs_dir_init(struct xfs_trans *trans,
+			     struct xfs_inode *dir,
+			     struct xfs_inode *parent_dir);
+
+static int	xfs_dir_createname(struct xfs_trans *trans,
+				   struct xfs_inode *dp,
+				   char *name_string,
+				   int name_len,
+				   xfs_ino_t inode_number,
+				   xfs_fsblock_t *firstblock,
+				   xfs_bmap_free_t *flist,
+				   xfs_extlen_t total);
+
+static int	xfs_dir_lookup(struct xfs_trans *tp,
+			       struct xfs_inode *dp,
+			       char *name_string,
+			       int name_length,
+			       xfs_ino_t *inode_number);
+
+static int	xfs_dir_removename(struct xfs_trans *trans,
+				   struct xfs_inode *dp,
+				   char *name_string,
+				   int name_length,
+				   xfs_ino_t ino,
+				   xfs_fsblock_t *firstblock,
+				   xfs_bmap_free_t *flist,
+				   xfs_extlen_t total);
+
+static int	xfs_dir_getdents(struct xfs_trans *tp,
+				 struct xfs_inode *dp,
+				 struct uio *uiop,
+				 int *eofp);
+
+static int	xfs_dir_replace(struct xfs_trans *tp,
+				struct xfs_inode *dp,
+				char *name_string,
+				int name_length,
+				xfs_ino_t inode_number,
+				xfs_fsblock_t *firstblock,
+				xfs_bmap_free_t *flist,
+				xfs_extlen_t total);
+
+static int	xfs_dir_canenter(struct xfs_trans *tp,
+				 struct xfs_inode *dp,
+				 char *name_string,
+				 int name_length);
+
+static int	xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp,
+						  xfs_dinode_t *dip);
+
+xfs_dirops_t xfsv1_dirops = {
+	.xd_mount			= xfs_dir_mount,
+	.xd_isempty			= xfs_dir_isempty,
+	.xd_init			= xfs_dir_init,
+	.xd_createname			= xfs_dir_createname,
+	.xd_lookup			= xfs_dir_lookup,
+	.xd_removename			= xfs_dir_removename,
+	.xd_getdents			= xfs_dir_getdents,
+	.xd_replace			= xfs_dir_replace,
+	.xd_canenter			= xfs_dir_canenter,
+	.xd_shortform_validate_ondisk	= xfs_dir_shortform_validate_ondisk,
+	.xd_shortform_to_single		= xfs_dir_shortform_to_leaf,
+};
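+
+/*
+ * These routines are not called directly: they are reached
+ * through the per-mount operations vector (presumably stored in
+ * the mount structure as m_dirops, going by the naming here),
+ * so that v1 and v2 directories present one calling interface.
+ * A lookup, for example, goes roughly:
+ *
+ *	error = mp->m_dirops.xd_lookup(tp, dp, name, namelen, &inum);
+ */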
+
+/*
+ * Internal routines when dirsize == XFS_LBSIZE(mp).
+ */
+STATIC int xfs_dir_leaf_lookup(xfs_da_args_t *args);
+STATIC int xfs_dir_leaf_removename(xfs_da_args_t *args, int *number_entries,
+						 int *total_namebytes);
+STATIC int xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
+					     uio_t *uio, int *eofp,
+					     xfs_dirent_t *dbp,
+					     xfs_dir_put_t put);
+STATIC int xfs_dir_leaf_replace(xfs_da_args_t *args);
+
+/*
+ * Internal routines when dirsize > XFS_LBSIZE(mp).
+ */
+STATIC int xfs_dir_node_addname(xfs_da_args_t *args);
+STATIC int xfs_dir_node_lookup(xfs_da_args_t *args);
+STATIC int xfs_dir_node_removename(xfs_da_args_t *args);
+STATIC int xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
+					     uio_t *uio, int *eofp,
+					     xfs_dirent_t *dbp,
+					     xfs_dir_put_t put);
+STATIC int xfs_dir_node_replace(xfs_da_args_t *args);
+
+#if defined(DEBUG)
+ktrace_t *xfs_dir_trace_buf;
+#endif
+
+
+/*========================================================================
+ * Overall external interface routines.
+ *========================================================================*/
+
+xfs_dahash_t	xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+	xfs_dir_hash_dot = xfs_da_hashname(".", 1);
+	xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
+}
+
+/*
+ * Initialize directory-related fields in the mount structure.
+ */
+static void
+xfs_dir_mount(xfs_mount_t *mp)
+{
+	uint shortcount, leafcount, count;
+
+	mp->m_dirversion = 1;
+	shortcount = (mp->m_attroffset - (uint)sizeof(xfs_dir_sf_hdr_t)) /
+		     (uint)sizeof(xfs_dir_sf_entry_t);
+	leafcount = (XFS_LBSIZE(mp) - (uint)sizeof(xfs_dir_leaf_hdr_t)) /
+		    ((uint)sizeof(xfs_dir_leaf_entry_t) +
+		     (uint)sizeof(xfs_dir_leaf_name_t));
+	count = shortcount > leafcount ? shortcount : leafcount;
+	mp->m_dircook_elog = xfs_da_log2_roundup(count + 1);
+	ASSERT(mp->m_dircook_elog <= mp->m_sb.sb_blocklog);
+	mp->m_da_node_ents =
+		(XFS_LBSIZE(mp) - (uint)sizeof(xfs_da_node_hdr_t)) /
+		(uint)sizeof(xfs_da_node_entry_t);
+	mp->m_dir_magicpct = (XFS_LBSIZE(mp) * 37) / 100;
+	mp->m_dirblksize = mp->m_sb.sb_blocksize;
+	mp->m_dirblkfsbs = 1;
+}
+
+/*
+ * Return 1 if directory contains only "." and "..".
+ */
+static int
+xfs_dir_isempty(xfs_inode_t *dp)
+{
+	xfs_dir_sf_hdr_t *hdr;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (dp->i_d.di_size == 0)
+		return(1);
+	if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
+		return(0);
+	hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	return(hdr->count == 0);
+}
+
+/*
+ * Initialize a directory with its "." and ".." entries.
+ */
+static int
+xfs_dir_init(xfs_trans_t *trans, xfs_inode_t *dir, xfs_inode_t *parent_dir)
+{
+	xfs_da_args_t args;
+	int error;
+
+	bzero((char *)&args, sizeof(args));
+	args.dp = dir;
+	args.trans = trans;
+
+	ASSERT((dir->i_d.di_mode & IFMT) == IFDIR);
+	if ((error = xfs_dir_ino_validate(trans->t_mountp, parent_dir->i_ino)))
+		return error;
+
+	return(xfs_dir_shortform_create(&args, parent_dir->i_ino));
+}
+
+/*
+ * Generic handler routine to add a name to a directory.
+ * Transitions directory from shortform to Btree as necessary.
+ */
+static int							/* error */
+xfs_dir_createname(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
+		   int namelen, xfs_ino_t inum, xfs_fsblock_t *firstblock,
+		   xfs_bmap_free_t *flist, xfs_extlen_t total)
+{
+	xfs_da_args_t args;
+	int retval, newsize, done;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+
+	if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
+		return (retval);
+
+	XFS_STATS_INC(xfsstats.xs_dir_create);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = inum;
+	args.dp = dp;
+	args.firstblock = firstblock;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = trans;
+	args.justcheck = 0;
+	args.addname = args.oknoent = 1;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	done = 0;
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
+		if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) {
+			retval = xfs_dir_shortform_addname(&args);
+			done = 1;
+		} else {
+			if (total == 0)
+				return XFS_ERROR(ENOSPC);
+			retval = xfs_dir_shortform_to_leaf(&args);
+			done = retval != 0;
+		}
+	}
+	if (!done && xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_addname(&args);
+		done = retval != ENOSPC;
+		if (!done) {
+			if (total == 0)
+				return XFS_ERROR(ENOSPC);
+			retval = xfs_dir_leaf_to_node(&args);
+			done = retval != 0;
+		}
+	}
+	if (!done) {
+		retval = xfs_dir_node_addname(&args);
+	}
+	return(retval);
+}
+
+/*
+ * Generic handler routine to check if a name can be added to a directory,
+ * without adding any blocks to the directory.
+ */
+static int							/* error */
+xfs_dir_canenter(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen)
+{
+	xfs_da_args_t args;
+	int retval, newsize;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = 0;
+	args.dp = dp;
+	args.firstblock = NULL;
+	args.flist = NULL;
+	args.total = 0;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = trans;
+	args.justcheck = args.addname = args.oknoent = 1;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
+		if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp))
+			retval = 0;
+		else
+			retval = XFS_ERROR(ENOSPC);
+	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_addname(&args);
+	} else {
+		retval = xfs_dir_node_addname(&args);
+	}
+	return(retval);
+}
+
+/*
+ * Generic handler routine to remove a name from a directory.
+ * Transitions directory from Btree to shortform as necessary.
+ */
+static int							/* error */
+xfs_dir_removename(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
+		   int namelen, xfs_ino_t ino, xfs_fsblock_t *firstblock,
+		   xfs_bmap_free_t *flist, xfs_extlen_t total)
+{
+	xfs_da_args_t args;
+	int count, totallen, newsize, retval;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	XFS_STATS_INC(xfsstats.xs_dir_remove);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = ino;
+	args.dp = dp;
+	args.firstblock = firstblock;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = trans;
+	args.justcheck = args.addname = args.oknoent = 0;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		retval = xfs_dir_shortform_removename(&args);
+	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_removename(&args, &count, &totallen);
+		if (retval == 0) {
+			newsize = XFS_DIR_SF_ALLFIT(count, totallen);
+			if (newsize <= XFS_IFORK_DSIZE(dp)) {
+				retval = xfs_dir_leaf_to_shortform(&args);
+			}
+		}
+	} else {
+		retval = xfs_dir_node_removename(&args);
+	}
+	return(retval);
+}
+
+static int							/* error */
+xfs_dir_lookup(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
+				   xfs_ino_t *inum)
+{
+	xfs_da_args_t args;
+	int retval;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (namelen >= MAXNAMELEN) {
+		return(XFS_ERROR(EINVAL));
+	}
+
+	XFS_STATS_INC(xfsstats.xs_dir_lookup);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = 0;
+	args.dp = dp;
+	args.firstblock = NULL;
+	args.flist = NULL;
+	args.total = 0;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = trans;
+	args.justcheck = args.addname = 0;
+	args.oknoent = 1;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		retval = xfs_dir_shortform_lookup(&args);
+	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_lookup(&args);
+	} else {
+		retval = xfs_dir_node_lookup(&args);
+	}
+	if (retval == EEXIST)
+		retval = 0;
+	*inum = args.inumber;
+	return(retval);
+}
+
+/*
+ * Implement readdir.
+ */
+static int							/* error */
+xfs_dir_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, int *eofp)
+{
+	xfs_dirent_t *dbp;
+	int  alignment, retval;
+	xfs_dir_put_t put;
+
+	XFS_STATS_INC(xfsstats.xs_dir_getdents);
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+
+	/*
+	 * If our caller has given us a single contiguous memory buffer,
+	 * just work directly within that buffer.  If it's in user memory,
+	 * lock it down first.
+	 */
+	alignment = sizeof(xfs_off_t) - 1;
+	if ((uio->uio_iovcnt == 1) &&
+	    (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
+	    ((uio->uio_iov[0].iov_len & alignment) == 0)) {
+		dbp = NULL;
+		put = xfs_dir_put_dirent64_direct;
+	} else {
+		dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
+		put = xfs_dir_put_dirent64_uio;
+	}
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	*eofp = 0;
+
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		retval = xfs_dir_shortform_getdents(dp, uio, eofp, dbp, put);
+	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_getdents(trans, dp, uio, eofp, dbp, put);
+	} else {
+		retval = xfs_dir_node_getdents(trans, dp, uio, eofp, dbp, put);
+	}
+	if (dbp != NULL)
+		kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
+
+	return(retval);
+}
+
+static int							/* error */
+xfs_dir_replace(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
+				    xfs_ino_t inum, xfs_fsblock_t *firstblock,
+				    xfs_bmap_free_t *flist, xfs_extlen_t total)
+{
+	xfs_da_args_t args;
+	int retval;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (namelen >= MAXNAMELEN) {
+		return(XFS_ERROR(EINVAL));
+	}
+
+	if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
+		return retval;
+
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = inum;
+	args.dp = dp;
+	args.firstblock = firstblock;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = trans;
+	args.justcheck = args.addname = args.oknoent = 0;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		retval = xfs_dir_shortform_replace(&args);
+	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_replace(&args);
+	} else {
+		retval = xfs_dir_node_replace(&args);
+	}
+
+	return(retval);
+}
+
+static int
+xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, xfs_dinode_t *dp)
+{
+	xfs_ino_t		ino;
+	int			namelen_sum;
+	int			count;
+	xfs_dir_shortform_t	*sf;
+	xfs_dir_sf_entry_t	*sfe;
+	int			i;
+
+
+	if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & IFMT) != IFDIR) {
+		return 0;
+	}
+	if (INT_GET(dp->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_LOCAL) {
+		return 0;
+	}
+	if (INT_GET(dp->di_core.di_size, ARCH_CONVERT) < sizeof(sf->hdr)) {
+		xfs_fs_cmn_err(CE_WARN, mp, "Invalid shortform size: dp 0x%p\n",
+			dp);
+		return 1;
+	}
+	sf = (xfs_dir_shortform_t *)(&dp->di_u.di_dirsf);
+	ino = XFS_GET_DIR_INO_ARCH(mp, sf->hdr.parent, ARCH_CONVERT);
+	if (xfs_dir_ino_validate(mp, ino))
+		return 1;
+
+	count = sf->hdr.count;
+	if ((count < 0) || ((count * 10) > XFS_LITINO(mp))) {
+		xfs_fs_cmn_err(CE_WARN, mp,
+			"Invalid shortform count: dp 0x%p\n", dp);
+		return(1);
+	}
+
+	if (count == 0) {
+		return 0;
+	}
+
+	namelen_sum = 0;
+	sfe = &sf->list[0];
+	for (i = sf->hdr.count - 1; i >= 0; i--) {
+		ino = XFS_GET_DIR_INO_ARCH(mp, sfe->inumber, ARCH_CONVERT);
+		xfs_dir_ino_validate(mp, ino);
+		if (sfe->namelen >= XFS_LITINO(mp)) {
+			xfs_fs_cmn_err(CE_WARN, mp,
+				"Invalid shortform namelen: dp 0x%p\n", dp);
+			return 1;
+		}
+		namelen_sum += sfe->namelen;
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+	if (namelen_sum >= XFS_LITINO(mp)) {
+		xfs_fs_cmn_err(CE_WARN, mp,
+			"Invalid shortform namelen: dp 0x%p\n", dp);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*========================================================================
+ * External routines when dirsize == XFS_LBSIZE(dp->i_mount).
+ *========================================================================*/
+
+/*
+ * Add a name to the leaf directory structure.
+ * This is the external routine.
+ */
+int
+xfs_dir_leaf_addname(xfs_da_args_t *args)
+{
+	int index, retval;
+	xfs_dabuf_t *bp;
+
+	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+					      XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+
+	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
+	if (retval == ENOENT)
+		retval = xfs_dir_leaf_add(bp, args, index);
+	xfs_da_buf_done(bp);
+	return(retval);
+}
+
+/*
+ * Remove a name from the leaf directory structure.
+ * This is the external routine.
+ */
+STATIC int
+xfs_dir_leaf_removename(xfs_da_args_t *args, int *count, int *totallen)
+{
+	xfs_dir_leafblock_t *leaf;
+	int index, retval;
+	xfs_dabuf_t *bp;
+
+	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+					      XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
+	if (retval == EEXIST) {
+		(void)xfs_dir_leaf_remove(args->trans, bp, index);
+		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		*totallen = INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+		retval = 0;
+	}
+	xfs_da_buf_done(bp);
+	return(retval);
+}
+
+/*
+ * Look up a name in a leaf directory structure.
+ * This is the external routine.
+ */
+STATIC int
+xfs_dir_leaf_lookup(xfs_da_args_t *args)
+{
+	int index, retval;
+	xfs_dabuf_t *bp;
+
+	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+					      XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
+	xfs_da_brelse(args->trans, bp);
+	return(retval);
+}
+
+/*
+ * Copy out directory entries for getdents(), for leaf directories.
+ */
+STATIC int
+xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
+				  int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
+{
+	xfs_dabuf_t *bp;
+	int retval, eob;
+
+	retval = xfs_da_read_buf(dp->i_transp, dp, 0, -1, &bp, XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	retval = xfs_dir_leaf_getdents_int(bp, dp, 0, uio, &eob, dbp, put, -1);
+	xfs_da_brelse(trans, bp);
+	*eofp = (eob == 0);
+	return(retval);
+}
+
+/*
+ * Look up a name in a leaf directory structure, replace the inode number.
+ * This is the external routine.
+ */
+STATIC int
+xfs_dir_leaf_replace(xfs_da_args_t *args)
+{
+	int index, retval;
+	xfs_dabuf_t *bp;
+	xfs_ino_t inum;
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+
+	inum = args->inumber;
+	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+					      XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
+	if (retval == EEXIST) {
+		leaf = bp->data;
+		entry = &leaf->entries[index];
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		/* XXX - replace assert? */
+		XFS_DIR_SF_PUT_DIRINO_ARCH(&inum, &namest->inumber, ARCH_CONVERT);
+		xfs_da_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
+		xfs_da_buf_done(bp);
+		retval = 0;
+	} else
+		xfs_da_brelse(args->trans, bp);
+	return(retval);
+}
+
+
+/*========================================================================
+ * External routines when dirsize > XFS_LBSIZE(mp).
+ *========================================================================*/
+
+/*
+ * Add a name to a Btree-format directory.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_dir_node_addname(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	int retval, error;
+
+	/*
+	 * Fill in bucket of arguments/results/context to carry around.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name already exists, and get back a pointer
+	 * to where it should go.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error)
+		retval = error;
+	if (retval != ENOENT)
+		goto error;
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
+	retval = xfs_dir_leaf_add(blk->bp, args, blk->index);
+	if (retval == 0) {
+		/*
+		 * Addition succeeded, update Btree hashvals.
+		 */
+		if (!args->justcheck)
+			xfs_da_fixhashpath(state, &state->path);
+	} else {
+		/*
+		 * Addition failed, split as many Btree elements as required.
+		 */
+		if (args->total == 0) {
+			ASSERT(retval == ENOSPC);
+			goto error;
+		}
+		retval = xfs_da_split(state);
+	}
+error:
+	xfs_da_state_free(state);
+
+	return(retval);
+}
+
+/*
+ * Remove a name from a Btree-format directory.
+ *
+ * This will involve walking down the Btree, and may involve joining
+ * leaf nodes and even joining intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_dir_node_removename(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	int retval, error;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error)
+		retval = error;
+	if (retval != EEXIST) {
+		xfs_da_state_free(state);
+		return(retval);
+	}
+
+	/*
+	 * Remove the name and update the hashvals in the tree.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
+	retval = xfs_dir_leaf_remove(args->trans, blk->bp, blk->index);
+	xfs_da_fixhashpath(state, &state->path);
+
+	/*
+	 * Check to see if the tree needs to be collapsed.
+	 */
+	error = 0;
+	if (retval) {
+		error = xfs_da_join(state);
+	}
+
+	xfs_da_state_free(state);
+	if (error)
+		return(error);
+	return(0);
+}
+
+/*
+ * Look up a filename in an int directory.
+ * Use an internal routine to actually do all the work.
+ */
+STATIC int
+xfs_dir_node_lookup(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	int retval, error, i;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists,
+	 * and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error) {
+		retval = error;
+	}
+
+	/*
+	 * The lookup is done; release all the buffers held on the
+	 * search path.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+
+	xfs_da_state_free(state);
+	return(retval);
+}
+
+STATIC int
+xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
+				  int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
+{
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	xfs_dir_leafblock_t *leaf = NULL;
+	xfs_dablk_t bno, nextbno;
+	xfs_dahash_t cookhash;
+	xfs_mount_t *mp;
+	int error, eob, i;
+	xfs_dabuf_t *bp;
+	xfs_daddr_t nextda;
+
+	/*
+	 * Pick up our context.
+	 */
+	mp = dp->i_mount;
+	bp = NULL;
+	bno = XFS_DA_COOKIE_BNO(mp, uio->uio_offset);
+	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
+
+	xfs_dir_trace_g_du("node: start", dp, uio);
+
+	/*
+	 * Re-find our place, even if we're confused about what our place is.
+	 *
+	 * First we check the block number from the magic cookie; it is a
+	 * cache of where we ended last time.  If we find a leaf block, and
+	 * the starting hashval in that block is less than our desired
+	 * hashval, then we run with it.
+	 */
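+	/*
+	 * (Cookie layout, as a sketch: the 64-bit uio_offset carries
+	 * the next entry's hashval in its low 32 bits, which is what
+	 * XFS_DA_COOKIE_HASH extracts, with the cached leaf block
+	 * number packed into the bits above it for XFS_DA_COOKIE_BNO.)
+	 */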
+	if (bno > 0) {
+		error = xfs_da_read_buf(trans, dp, bno, -1, &bp, XFS_DATA_FORK);
+		if ((error != 0) && (error != EFSCORRUPTED))
+			return(error);
+		if (bp)
+			leaf = bp->data;
+		if (bp && INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
+			xfs_dir_trace_g_dub("node: block not a leaf",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+			bp = NULL;
+		}
+		if (bp && INT_GET(leaf->entries[0].hashval, ARCH_CONVERT) > cookhash) {
+			xfs_dir_trace_g_dub("node: leaf hash too large",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+			bp = NULL;
+		}
+		if (bp &&
+		    cookhash > INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)) {
+			xfs_dir_trace_g_dub("node: leaf hash too small",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+			bp = NULL;
+		}
+	}
+
+	/*
+	 * If we did not find a leaf block from the blockno in the cookie,
+	 * or there was no blockno in the cookie (eg: first time thru),
+	 * then we start at the top of the Btree and re-find our hashval.
+	 */
+	if (bp == NULL) {
+		xfs_dir_trace_g_du("node: start at root" , dp, uio);
+		bno = 0;
+		for (;;) {
+			error = xfs_da_read_buf(trans, dp, bno, -1, &bp,
+						       XFS_DATA_FORK);
+			if (error)
+				return(error);
+			if (bp == NULL)
+				return(XFS_ERROR(EFSCORRUPTED));
+			node = bp->data;
+			if (INT_GET(node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC)
+				break;
+			btree = &node->btree[0];
+			xfs_dir_trace_g_dun("node: node detail", dp, uio, node);
+			for (i = 0; i < INT_GET(node->hdr.count, ARCH_CONVERT); btree++, i++) {
+				if (INT_GET(btree->hashval, ARCH_CONVERT) >= cookhash) {
+					bno = INT_GET(btree->before, ARCH_CONVERT);
+					break;
+				}
+			}
+			if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
+				xfs_da_brelse(trans, bp);
+				xfs_dir_trace_g_du("node: hash beyond EOF",
+							  dp, uio);
+				uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0,
+							     XFS_DA_MAXHASH);
+				*eofp = 1;
+				return(0);
+			}
+			xfs_dir_trace_g_dub("node: going to block",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+		}
+	}
+	ASSERT(cookhash != XFS_DA_MAXHASH);
+
+	/*
+	 * We've dropped down to the (first) leaf block that contains the
+	 * hashval we are interested in.  Continue rolling upward thru the
+	 * leaf blocks until we fill up our buffer.
+	 */
+	for (;;) {
+		leaf = bp->data;
+		if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
+			xfs_dir_trace_g_dul("node: not a leaf", dp, uio, leaf);
+			xfs_da_brelse(trans, bp);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		xfs_dir_trace_g_dul("node: leaf detail", dp, uio, leaf);
+		if ((nextbno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))) {
+			nextda = xfs_da_reada_buf(trans, dp, nextbno,
+						  XFS_DATA_FORK);
+		} else
+			nextda = -1;
+		error = xfs_dir_leaf_getdents_int(bp, dp, bno, uio, &eob, dbp,
+						  put, nextda);
+		xfs_da_brelse(trans, bp);
+		bno = nextbno;
+		if (eob) {
+			xfs_dir_trace_g_dub("node: E-O-B", dp, uio, bno);
+			*eofp = 0;
+			return(error);
+		}
+		if (bno == 0)
+			break;
+		error = xfs_da_read_buf(trans, dp, bno, nextda, &bp,
+					XFS_DATA_FORK);
+		if (error)
+			return(error);
+		if (bp == NULL)
+			return(XFS_ERROR(EFSCORRUPTED));
+	}
+	*eofp = 1;
+	xfs_dir_trace_g_du("node: E-O-F", dp, uio);
+	return(0);
+}
+
+/*
+ * Look up a filename in an int directory, replace the inode number.
+ * Use an internal routine to actually do the lookup.
+ */
+STATIC int
+xfs_dir_node_replace(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+	xfs_ino_t inum;
+	int retval, error, i;
+	xfs_dabuf_t *bp;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+	inum = args->inumber;
+
+	/*
+	 * Search to see if name exists,
+	 * and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error) {
+		retval = error;
+	}
+
+	if (retval == EEXIST) {
+		blk = &state->path.blk[state->path.active - 1];
+		ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
+		bp = blk->bp;
+		leaf = bp->data;
+		entry = &leaf->entries[blk->index];
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		/* XXX - replace assert ? */
+		XFS_DIR_SF_PUT_DIRINO_ARCH(&inum, &namest->inumber, ARCH_CONVERT);
+		xfs_da_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
+		xfs_da_buf_done(bp);
+		blk->bp = NULL;
+		retval = 0;
+	} else {
+		i = state->path.active - 1;
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+	for (i = 0; i < state->path.active - 1; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+
+	xfs_da_state_free(state);
+	return(retval);
+}
+
+#if defined(XFS_DIR_TRACE)
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_du(char *where, xfs_inode_t *dp, uio_t *uio)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DU, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_dub(char *where, xfs_inode_t *dp, uio_t *uio, xfs_dablk_t bno)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUB, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     (__psunsigned_t)bno,
+		     NULL, NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_dun(char *where, xfs_inode_t *dp, uio_t *uio,
+			xfs_da_intnode_t *node)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUN, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     (__psunsigned_t)INT_GET(node->hdr.info.forw, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(node->hdr.count, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(node->btree[0].hashval, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(node->btree[INT_GET(node->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT),
+		     NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio,
+			xfs_dir_leafblock_t *leaf)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUL, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     (__psunsigned_t)INT_GET(leaf->hdr.info.forw, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(leaf->hdr.count, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT),
+		     NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_due(char *where, xfs_inode_t *dp, uio_t *uio,
+			xfs_dir_leaf_entry_t *entry)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUE, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     (__psunsigned_t)INT_GET(entry->hashval, ARCH_CONVERT),
+		     NULL, NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_duc(char *where, xfs_inode_t *dp, uio_t *uio, xfs_off_t cookie)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUC, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     (__psunsigned_t)(cookie >> 32),
+		     (__psunsigned_t)(cookie & 0xFFFFFFFF),
+		     NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for the arguments given to the routine,
+ * generic form.
+ */
+void
+xfs_dir_trace_enter(int type, char *where,
+			__psunsigned_t a0, __psunsigned_t a1,
+			__psunsigned_t a2, __psunsigned_t a3,
+			__psunsigned_t a4, __psunsigned_t a5,
+			__psunsigned_t a6, __psunsigned_t a7,
+			__psunsigned_t a8, __psunsigned_t a9,
+			__psunsigned_t a10, __psunsigned_t a11)
+{
+	ASSERT(xfs_dir_trace_buf);
+	ktrace_enter(xfs_dir_trace_buf, (void *)((__psunsigned_t)type),
+					(void *)where,
+					(void *)a0, (void *)a1, (void *)a2,
+					(void *)a3, (void *)a4, (void *)a5,
+					(void *)a6, (void *)a7, (void *)a8,
+					(void *)a9, (void *)a10, (void *)a11,
+					NULL, NULL);
+}
+#endif	/* XFS_DIR_TRACE */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR_H__
+#define __XFS_DIR_H__
+
+/*
+ * Large directories are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Filenames are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of a filename may not be unique, we may have duplicate keys.	 The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ * Small directories use a different format and are packed as tightly
+ * as possible so as to fit into the literal area of the inode.
+ */
+
+#ifdef XFS_ALL_TRACE
+#define XFS_DIR_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_DIR_TRACE
+#endif
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+struct uio;
+struct xfs_bmap_free;
+struct xfs_da_args;
+struct xfs_dinode;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Directory function types.
+ * These are collected into structures (xfs_dirops_t), one each for
+ * v1 and v2 directories.
+ */
+typedef void	(*xfs_dir_mount_t)(struct xfs_mount *mp);
+typedef int	(*xfs_dir_isempty_t)(struct xfs_inode *dp);
+typedef int	(*xfs_dir_init_t)(struct xfs_trans *tp,
+				  struct xfs_inode *dp,
+				  struct xfs_inode *pdp);
+typedef int	(*xfs_dir_createname_t)(struct xfs_trans *tp,
+					struct xfs_inode *dp,
+					char *name,
+					int namelen,
+					xfs_ino_t inum,
+					xfs_fsblock_t *first,
+					struct xfs_bmap_free *flist,
+					xfs_extlen_t total);
+typedef int	(*xfs_dir_lookup_t)(struct xfs_trans *tp,
+				    struct xfs_inode *dp,
+				    char *name,
+				    int namelen,
+				    xfs_ino_t *inum);
+typedef int	(*xfs_dir_removename_t)(struct xfs_trans *tp,
+					struct xfs_inode *dp,
+					char *name,
+					int namelen,
+					xfs_ino_t ino,
+					xfs_fsblock_t *first,
+					struct xfs_bmap_free *flist,
+					xfs_extlen_t total);
+typedef int	(*xfs_dir_getdents_t)(struct xfs_trans *tp,
+				      struct xfs_inode *dp,
+				      struct uio *uio,
+				      int *eofp);
+typedef int	(*xfs_dir_replace_t)(struct xfs_trans *tp,
+				     struct xfs_inode *dp,
+				     char *name,
+				     int namelen,
+				     xfs_ino_t inum,
+				     xfs_fsblock_t *first,
+				     struct xfs_bmap_free *flist,
+				     xfs_extlen_t total);
+typedef int	(*xfs_dir_canenter_t)(struct xfs_trans *tp,
+				      struct xfs_inode *dp,
+				      char *name,
+				      int namelen);
+typedef int	(*xfs_dir_shortform_validate_ondisk_t)(struct xfs_mount *mp,
+						       struct xfs_dinode *dip);
+typedef int	(*xfs_dir_shortform_to_single_t)(struct xfs_da_args *args);
+
+typedef struct xfs_dirops {
+	xfs_dir_mount_t				xd_mount;
+	xfs_dir_isempty_t			xd_isempty;
+	xfs_dir_init_t				xd_init;
+	xfs_dir_createname_t			xd_createname;
+	xfs_dir_lookup_t			xd_lookup;
+	xfs_dir_removename_t			xd_removename;
+	xfs_dir_getdents_t			xd_getdents;
+	xfs_dir_replace_t			xd_replace;
+	xfs_dir_canenter_t			xd_canenter;
+	xfs_dir_shortform_validate_ondisk_t	xd_shortform_validate_ondisk;
+	xfs_dir_shortform_to_single_t		xd_shortform_to_single;
+} xfs_dirops_t;
+
+/*
+ * Overall external interface routines.
+ */
+void	xfs_dir_startup(void);	/* called exactly once */
+
+#define XFS_DIR_MOUNT(mp)	\
+	((mp)->m_dirops.xd_mount(mp))
+#define XFS_DIR_ISEMPTY(mp,dp)	\
+	((mp)->m_dirops.xd_isempty(dp))
+#define XFS_DIR_INIT(mp,tp,dp,pdp)	\
+	((mp)->m_dirops.xd_init(tp,dp,pdp))
+#define XFS_DIR_CREATENAME(mp,tp,dp,name,namelen,inum,first,flist,total) \
+	((mp)->m_dirops.xd_createname(tp,dp,name,namelen,inum,first,flist,\
+				      total))
+#define XFS_DIR_LOOKUP(mp,tp,dp,name,namelen,inum)	\
+	((mp)->m_dirops.xd_lookup(tp,dp,name,namelen,inum))
+#define XFS_DIR_REMOVENAME(mp,tp,dp,name,namelen,ino,first,flist,total) \
+	((mp)->m_dirops.xd_removename(tp,dp,name,namelen,ino,first,flist,total))
+#define XFS_DIR_GETDENTS(mp,tp,dp,uio,eofp)	\
+	((mp)->m_dirops.xd_getdents(tp,dp,uio,eofp))
+#define XFS_DIR_REPLACE(mp,tp,dp,name,namelen,inum,first,flist,total)	\
+	((mp)->m_dirops.xd_replace(tp,dp,name,namelen,inum,first,flist,total))
+#define XFS_DIR_CANENTER(mp,tp,dp,name,namelen) \
+	((mp)->m_dirops.xd_canenter(tp,dp,name,namelen))
+#define XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp,dip)	\
+	((mp)->m_dirops.xd_shortform_validate_ondisk(mp,dip))
+#define XFS_DIR_SHORTFORM_TO_SINGLE(mp,args)	\
+	((mp)->m_dirops.xd_shortform_to_single(args))
+
+#define XFS_DIR_IS_V1(mp)	((mp)->m_dirversion == 1)
+extern xfs_dirops_t xfsv1_dirops;
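+
+/*
+ * Illustrative use (not an interface addition): callers never invoke
+ * a version-specific routine directly, they dispatch through the
+ * macros above, e.g.
+ *
+ *	error = XFS_DIR_LOOKUP(mp, tp, dp, name, namelen, &inum);
+ *
+ * which resolves to xd_lookup in whichever of xfsv1_dirops or
+ * xfsv2_dirops the mount code installed in mp->m_dirops.
+ */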
+
+#endif	/* __XFS_DIR_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,830 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * XFS v2 directory implementation.
+ * Top-level and utility routines.
+ */
+
+#include <xfs.h>
+
+/*
+ * Declarations for interface routines.
+ */
+static void	xfs_dir2_mount(xfs_mount_t *mp);
+static int	xfs_dir2_isempty(xfs_inode_t *dp);
+static int	xfs_dir2_init(xfs_trans_t *tp, xfs_inode_t *dp,
+			      xfs_inode_t *pdp);
+static int	xfs_dir2_createname(xfs_trans_t *tp, xfs_inode_t *dp,
+				    char *name, int namelen, xfs_ino_t inum,
+				    xfs_fsblock_t *first,
+				    xfs_bmap_free_t *flist, xfs_extlen_t total);
+static int	xfs_dir2_lookup(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
+				int namelen, xfs_ino_t *inum);
+static int	xfs_dir2_removename(xfs_trans_t *tp, xfs_inode_t *dp,
+				    char *name, int namelen, xfs_ino_t ino,
+				    xfs_fsblock_t *first,
+				    xfs_bmap_free_t *flist, xfs_extlen_t total);
+static int	xfs_dir2_getdents(xfs_trans_t *tp, xfs_inode_t *dp, uio_t *uio,
+				  int *eofp);
+static int	xfs_dir2_replace(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
+				 int namelen, xfs_ino_t inum,
+				 xfs_fsblock_t *first, xfs_bmap_free_t *flist,
+				 xfs_extlen_t total);
+static int	xfs_dir2_canenter(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
+				  int namelen);
+static int	xfs_dir2_shortform_validate_ondisk(xfs_mount_t *mp,
+						   xfs_dinode_t *dip);
+
+/*
+ * Utility routine declarations.
+ */
+static int	xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa);
+static int	xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa);
+
+/*
+ * Directory operations vector.
+ */
+xfs_dirops_t	xfsv2_dirops = {
+	.xd_mount			= xfs_dir2_mount,
+	.xd_isempty			= xfs_dir2_isempty,
+	.xd_init			= xfs_dir2_init,
+	.xd_createname			= xfs_dir2_createname,
+	.xd_lookup			= xfs_dir2_lookup,
+	.xd_removename			= xfs_dir2_removename,
+	.xd_getdents			= xfs_dir2_getdents,
+	.xd_replace			= xfs_dir2_replace,
+	.xd_canenter			= xfs_dir2_canenter,
+	.xd_shortform_validate_ondisk	= xfs_dir2_shortform_validate_ondisk,
+	.xd_shortform_to_single		= xfs_dir2_sf_to_block,
+};
+
+/*
+ * Interface routines.
+ */
+
+/*
+ * Initialize directory-related fields in the mount structure.
+ */
+static void
+xfs_dir2_mount(
+	xfs_mount_t	*mp)		/* filesystem mount point */
+{
+	mp->m_dirversion = 2;
+	ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
+	       XFS_MAX_BLOCKSIZE);
+	mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
+	mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog;
+	mp->m_dirdatablk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_DATA_FIRSTDB(mp));
+	mp->m_dirleafblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
+	mp->m_dirfreeblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_FREE_FIRSTDB(mp));
+	mp->m_da_node_ents =
+		(mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) /
+		(uint)sizeof(xfs_da_node_entry_t);
+	mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
+}
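+
+/*
+ * For example (illustrative values): with sb_blocklog = 12 (4096-byte
+ * filesystem blocks) and sb_dirblklog = 2, m_dirblksize comes out as
+ * 1 << 14 = 16384 bytes and m_dirblkfsbs as 1 << 2 = 4 filesystem
+ * blocks per directory block.
+ */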
+
+/*
+ * Return 1 if directory contains only "." and "..".
+ */
+static int				/* return code */
+xfs_dir2_isempty(
+	xfs_inode_t	*dp)		/* incore inode structure */
+{
+	xfs_dir2_sf_t	*sfp;		/* shortform directory structure */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	/*
+	 * Might happen during shutdown.
+	 */
+	if (dp->i_d.di_size == 0) {
+		return 1;
+	}
+	if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
+		return 0;
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	return INT_ISZERO(sfp->hdr.count, ARCH_CONVERT);
+}
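+
+/*
+ * Note: a v2 shortform directory does not store "." and ".." as
+ * entries (the parent inode lives in the shortform header), so a
+ * zero entry count above really does mean "only . and ..".
+ */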
+
+/*
+ * Initialize a directory with its "." and ".." entries.
+ */
+static int				/* error */
+xfs_dir2_init(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	xfs_inode_t	*pdp)		/* incore parent directory inode */
+{
+	xfs_da_args_t	args;		/* operation arguments */
+	int		error;		/* error return value */
+
+	bzero((char *)&args, sizeof(args));
+	args.dp = dp;
+	args.trans = tp;
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) {
+		return error;
+	}
+	return xfs_dir2_sf_create(&args, pdp->i_ino);
+}
+
+/*
+ * Enter a name in a directory.
+ */
+static int					/* error */
+xfs_dir2_createname(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*dp,		/* incore directory inode */
+	char			*name,		/* new entry name */
+	int			namelen,	/* new entry name length */
+	xfs_ino_t		inum,		/* new entry inode number */
+	xfs_fsblock_t		*first,		/* bmap's firstblock */
+	xfs_bmap_free_t		*flist,		/* bmap's freeblock list */
+	xfs_extlen_t		total)		/* bmap's total block count */
+{
+	xfs_da_args_t		args;		/* operation arguments */
+	int			rval;		/* return value */
+	int			v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+		return rval;
+	}
+	XFS_STATS_INC(xfsstats.xs_dir_create);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = inum;
+	args.dp = dp;
+	args.firstblock = first;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = tp;
+	args.justcheck = 0;
+	args.addname = args.oknoent = 1;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_addname(&args);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_block_addname(&args);
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_leaf_addname(&args);
+	else
+		rval = xfs_dir2_node_addname(&args);
+	return rval;
+}
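+
+/*
+ * Note: the same shortform/block/leaf/node ladder drives the other
+ * interface routines below, probing the formats in increasing-size
+ * order with xfs_dir2_isblock and xfs_dir2_isleaf (getdents folds
+ * the node case into the leaf routine).
+ */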
+
+/*
+ * Lookup a name in a directory, give back the inode number.
+ */
+static int				/* error */
+xfs_dir2_lookup(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* lookup name */
+	int		namelen,	/* lookup name length */
+	xfs_ino_t	*inum)		/* out: inode number */
+{
+	xfs_da_args_t	args;		/* operation arguments */
+	int		rval;		/* return value */
+	int		v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (namelen >= MAXNAMELEN) {
+		return XFS_ERROR(EINVAL);
+	}
+	XFS_STATS_INC(xfsstats.xs_dir_lookup);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = 0;
+	args.dp = dp;
+	args.firstblock = NULL;
+	args.flist = NULL;
+	args.total = 0;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = tp;
+	args.justcheck = args.addname = 0;
+	args.oknoent = 1;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_lookup(&args);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_block_lookup(&args);
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_leaf_lookup(&args);
+	else
+		rval = xfs_dir2_node_lookup(&args);
+	if (rval == EEXIST)
+		rval = 0;
+	if (rval == 0)
+		*inum = args.inumber;
+	return rval;
+}
+
+/*
+ * Remove an entry from a directory.
+ */
+static int				/* error */
+xfs_dir2_removename(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* name of entry to remove */
+	int		namelen,	/* name length of entry to remove */
+	xfs_ino_t	ino,		/* inode number of entry to remove */
+	xfs_fsblock_t	*first,		/* bmap's firstblock */
+	xfs_bmap_free_t *flist,		/* bmap's freeblock list */
+	xfs_extlen_t	total)		/* bmap's total block count */
+{
+	xfs_da_args_t	args;		/* operation arguments */
+	int		rval;		/* return value */
+	int		v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	XFS_STATS_INC(xfsstats.xs_dir_remove);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = ino;
+	args.dp = dp;
+	args.firstblock = first;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = tp;
+	args.justcheck = args.addname = args.oknoent = 0;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_removename(&args);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_block_removename(&args);
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_leaf_removename(&args);
+	else
+		rval = xfs_dir2_node_removename(&args);
+	return rval;
+}
+
+/*
+ * Read a directory.
+ */
+static int				/* error */
+xfs_dir2_getdents(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	uio_t		*uio,		/* caller's buffer control */
+	int		*eofp)		/* out: eof reached */
+{
+	int		alignment;	/* alignment required for ABI */
+	xfs_dirent_t	*dbp;		/* malloc'ed buffer */
+	xfs_dir2_put_t	put;		/* entry formatting routine */
+	int		rval;		/* return value */
+	int		v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	XFS_STATS_INC(xfsstats.xs_dir_getdents);
+	/*
+	 * If our caller has given us a single contiguous aligned memory buffer,
+	 * just work directly within that buffer.  If it's in user memory,
+	 * lock it down first.
+	 */
+	alignment = sizeof(xfs_off_t) - 1;
+	if ((uio->uio_iovcnt == 1) &&
+	    (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
+	    ((uio->uio_iov[0].iov_len & alignment) == 0)) {
+		dbp = NULL;
+		put = xfs_dir2_put_dirent64_direct;
+	} else {
+		dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
+		put = xfs_dir2_put_dirent64_uio;
+	}
+
+	*eofp = 0;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		;
+	} else if (v)
+		rval = xfs_dir2_block_getdents(tp, dp, uio, eofp, dbp, put);
+	else
+		rval = xfs_dir2_leaf_getdents(tp, dp, uio, eofp, dbp, put);
+	if (dbp != NULL)
+		kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
+	return rval;
+}
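+
+/*
+ * Note on the alignment test above: sizeof(xfs_off_t) is 8, so
+ * alignment is 7; a single iovec whose base address and length are
+ * both 8-byte multiples takes the direct path, anything else is
+ * staged through the allocated dirent buffer and copied out with
+ * uiomove().
+ */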
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+static int				/* error */
+xfs_dir2_replace(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* name of entry to replace */
+	int		namelen,	/* name length of entry to replace */
+	xfs_ino_t	inum,		/* new inode number */
+	xfs_fsblock_t	*first,		/* bmap's firstblock */
+	xfs_bmap_free_t *flist,		/* bmap's freeblock list */
+	xfs_extlen_t	total)		/* bmap's total block count */
+{
+	xfs_da_args_t	args;		/* operation arguments */
+	int		rval;		/* return value */
+	int		v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (namelen >= MAXNAMELEN) {
+		return XFS_ERROR(EINVAL);
+	}
+	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+		return rval;
+	}
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = inum;
+	args.dp = dp;
+	args.firstblock = first;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = tp;
+	args.justcheck = args.addname = args.oknoent = 0;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_replace(&args);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_block_replace(&args);
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_leaf_replace(&args);
+	else
+		rval = xfs_dir2_node_replace(&args);
+	return rval;
+}
+
+/*
+ * See if this entry can be added to the directory without allocating space.
+ */
+static int				/* error */
+xfs_dir2_canenter(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* name of entry to add */
+	int		namelen)	/* name length of entry to add */
+{
+	xfs_da_args_t	args;		/* operation arguments */
+	int		rval;		/* return value */
+	int		v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = 0;
+	args.dp = dp;
+	args.firstblock = NULL;
+	args.flist = NULL;
+	args.total = 0;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = tp;
+	args.justcheck = args.addname = args.oknoent = 1;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_addname(&args);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_block_addname(&args);
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_leaf_addname(&args);
+	else
+		rval = xfs_dir2_node_addname(&args);
+	return rval;
+}
+
+/*
+ * Dummy routine for shortform inode validation.
+ * Can't really do this.
+ */
+/* ARGSUSED */
+static int				/* error */
+xfs_dir2_shortform_validate_ondisk(
+	xfs_mount_t	*mp,		/* filesystem mount point */
+	xfs_dinode_t	*dip)		/* ondisk inode */
+{
+	return 0;
+}
+
+/*
+ * Utility routines.
+ */
+
+/*
+ * Add a block to the directory.
+ * This routine is for data and free blocks, not leaf/node blocks,
+ * which are handled by xfs_da_grow_inode.
+ */
+int					/* error */
+xfs_dir2_grow_inode(
+	xfs_da_args_t	*args,		/* operation arguments */
+	int		space,		/* v2 dir's space XFS_DIR2_xxx_SPACE */
+	xfs_dir2_db_t	*dbp)		/* out: block number added */
+{
+	xfs_fileoff_t	bno;		/* directory offset of new block */
+	int		count;		/* count of filesystem blocks */
+	xfs_inode_t	*dp;		/* incore directory inode */
+	int		error;		/* error return value */
+	int		got;		/* blocks actually mapped */
+	int		i;		/* temp mapping index */
+	xfs_bmbt_irec_t map;		/* single structure for bmap */
+	int		mapi;		/* mapping index */
+	xfs_bmbt_irec_t *mapp;		/* bmap mapping structure(s) */
+	xfs_mount_t	*mp;		/* filesystem mount point */
+	int		nmap;		/* number of bmap entries */
+	xfs_trans_t	*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_s("grow_inode", args, space);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Set lowest possible block in the space requested.
+	 */
+	bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
+	count = mp->m_dirblkfsbs;
+	/*
+	 * Find the first hole for our block.
+	 */
+	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) {
+		return error;
+	}
+	nmap = 1;
+	ASSERT(args->firstblock != NULL);
+	/*
+	 * Try mapping the new block contiguously (one extent).
+	 */
+	if ((error = xfs_bmapi(tp, dp, bno, count,
+			XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
+			args->firstblock, args->total, &map, &nmap,
+			args->flist))) {
+		return error;
+	}
+	ASSERT(nmap <= 1);
+	/*
+	 * Got it in 1.
+	 */
+	if (nmap == 1) {
+		mapp = &map;
+		mapi = 1;
+	}
+	/*
+	 * Didn't work and this is a multiple-fsb directory block.
+	 * Try again without the contiguous flag, mapping the block
+	 * a piece at a time.
+	 */
+	else if (nmap == 0 && count > 1) {
+		xfs_fileoff_t	b;	/* current file offset */
+
+		/*
+		 * Space for maximum number of mappings.
+		 */
+		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+		/*
+		 * Iterate until we get to the end of our block.
+		 */
+		for (b = bno, mapi = 0; b < bno + count; ) {
+			int	c;	/* current fsb count */
+
+			/*
+			 * Can't map more than MAX_NMAP at once.
+			 */
+			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
+			c = (int)(bno + count - b);
+			if ((error = xfs_bmapi(tp, dp, b, c,
+					XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
+					args->firstblock, args->total,
+					&mapp[mapi], &nmap, args->flist))) {
+				kmem_free(mapp, sizeof(*mapp) * count);
+				return error;
+			}
+			if (nmap < 1)
+				break;
+			/*
+			 * Add this bunch into our table, go to the next offset.
+			 */
+			mapi += nmap;
+			b = mapp[mapi - 1].br_startoff +
+			    mapp[mapi - 1].br_blockcount;
+		}
+	}
+	/*
+	 * Didn't work.
+	 */
+	else {
+		mapi = 0;
+		mapp = NULL;
+	}
+	/*
+	 * See how many fsb's we got.
+	 */
+	for (i = 0, got = 0; i < mapi; i++)
+		got += mapp[i].br_blockcount;
+	/*
+	 * Didn't get enough fsb's, or the first/last blocks are wrong.
+	 */
+	if (got != count || mapp[0].br_startoff != bno ||
+	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+	    bno + count) {
+		if (mapp != &map)
+			kmem_free(mapp, sizeof(*mapp) * count);
+		return XFS_ERROR(ENOSPC);
+	}
+	/*
+	 * Done with the temporary mapping table.
+	 */
+	if (mapp != &map)
+		kmem_free(mapp, sizeof(*mapp) * count);
+	*dbp = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)bno);
+	/*
+	 * Update file's size if this is the data space and it grew.
+	 */
+	if (space == XFS_DIR2_DATA_SPACE) {
+		xfs_fsize_t	size;		/* directory file (data) size */
+
+		size = XFS_FSB_TO_B(mp, bno + count);
+		if (size > dp->i_d.di_size) {
+			dp->i_d.di_size = size;
+			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+		}
+	}
+	return 0;
+}
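+
+/*
+ * Note: the single-extent XFS_BMAPI_CONTIG attempt above is the
+ * common case; the piecewise loop is only a fallback for multi-fsb
+ * directory blocks, and any result that does not cover exactly
+ * [bno, bno + count) is rejected with ENOSPC.
+ */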
+
+/*
+ * See if the directory is a single-block form directory.
+ */
+int					/* error */
+xfs_dir2_isblock(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	int		*vp)		/* out: 1 is block, 0 is not block */
+{
+	xfs_fileoff_t	last;		/* last file offset */
+	xfs_mount_t	*mp;		/* filesystem mount point */
+	int		rval;		/* return value */
+
+	mp = dp->i_mount;
+	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+		return rval;
+	}
+	rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
+	ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
+	*vp = rval;
+	return 0;
+}
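+
+/*
+ * For example (illustrative): with 4096-byte blocks and sb_dirblklog
+ * = 0, a block-form directory is exactly one filesystem block long,
+ * so xfs_bmap_last_offset returns last == 1 and XFS_FSB_TO_B(mp, 1)
+ * equals m_dirblksize.
+ */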
+
+/*
+ * See if the directory is a single-leaf form directory.
+ */
+int					/* error */
+xfs_dir2_isleaf(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	int		*vp)		/* out: 1 is leaf, 0 is not leaf */
+{
+	xfs_fileoff_t	last;		/* last file offset */
+	xfs_mount_t	*mp;		/* filesystem mount point */
+	int		rval;		/* return value */
+
+	mp = dp->i_mount;
+	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+		return rval;
+	}
+	*vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
+	return 0;
+}
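+
+/*
+ * Note: leaf form means the data fork ends exactly one directory
+ * block past m_dirleafblk: data blocks first, then the single leaf
+ * block, and nothing after it.
+ */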
+
+/*
+ * Getdents put routine for 64-bit ABI, direct form.
+ */
+static int					/* error */
+xfs_dir2_put_dirent64_direct(
+	xfs_dir2_put_args_t	*pa)		/* argument bundle */
+{
+	xfs_dirent_t		*idbp;		/* dirent pointer */
+	iovec_t			*iovp;		/* io vector */
+	int			namelen;	/* entry name length */
+	int			reclen;		/* entry total length */
+	uio_t			*uio;		/* I/O control */
+
+	namelen = pa->namelen;
+	reclen = DIRENTSIZE(namelen);
+	uio = pa->uio;
+	/*
+	 * Won't fit in the remaining space.
+	 */
+	if (reclen > uio->uio_resid) {
+		pa->done = 0;
+		return 0;
+	}
+	iovp = uio->uio_iov;
+	idbp = (xfs_dirent_t *)iovp->iov_base;
+	iovp->iov_base = (char *)idbp + reclen;
+	iovp->iov_len -= reclen;
+	uio->uio_resid -= reclen;
+	idbp->d_reclen = reclen;
+	idbp->d_ino = pa->ino;
+	idbp->d_off = pa->cook;
+	idbp->d_name[namelen] = '\0';
+	pa->done = 1;
+	bcopy(pa->name, idbp->d_name, namelen);
+	return 0;
+}
+
+/*
+ * Getdents put routine for 64-bit ABI, uio form.
+ */
+static int					/* error */
+xfs_dir2_put_dirent64_uio(
+	xfs_dir2_put_args_t	*pa)		/* argument bundle */
+{
+	xfs_dirent_t		*idbp;		/* dirent pointer */
+	int			namelen;	/* entry name length */
+	int			reclen;		/* entry total length */
+	int			rval;		/* return value */
+	uio_t			*uio;		/* I/O control */
+
+	namelen = pa->namelen;
+	reclen = DIRENTSIZE(namelen);
+	uio = pa->uio;
+	/*
+	 * Won't fit in the remaining space.
+	 */
+	if (reclen > uio->uio_resid) {
+		pa->done = 0;
+		return 0;
+	}
+	idbp = pa->dbp;
+	idbp->d_reclen = reclen;
+	idbp->d_ino = pa->ino;
+	idbp->d_off = pa->cook;
+	idbp->d_name[namelen] = '\0';
+	bcopy(pa->name, idbp->d_name, namelen);
+	rval = uiomove((caddr_t)idbp, reclen, UIO_READ, uio);
+	pa->done = (rval == 0);
+	return rval;
+}
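+
+/*
+ * Note: the two put routines above differ only in the final copy.
+ * The direct form writes the dirent straight into the caller's
+ * iovec; the uio form builds it in the bounce buffer and lets
+ * uiomove() handle the unaligned or multi-iovec copy-out.
+ */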
+
+/*
+ * Remove the given block from the directory.
+ * This routine is used for data and free blocks; leaf/node blocks
+ * are handled by xfs_da_shrink_inode.
+ */
+int
+xfs_dir2_shrink_inode(
+	xfs_da_args_t	*args,		/* operation arguments */
+	xfs_dir2_db_t	db,		/* directory block number */
+	xfs_dabuf_t	*bp)		/* block's buffer */
+{
+	xfs_fileoff_t	bno;		/* directory file offset */
+	xfs_dablk_t	da;		/* directory file offset */
+	int		done;		/* bunmap is finished */
+	xfs_inode_t	*dp;		/* incore directory inode */
+	int		error;		/* error return value */
+	xfs_mount_t	*mp;		/* filesystem mount point */
+	xfs_trans_t	*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_db("shrink_inode", args, db, bp);
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	da = XFS_DIR2_DB_TO_DA(mp, db);
+	/*
+	 * Unmap the fsblock(s).
+	 */
+	if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
+			XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
+			&done))) {
+		/*
+		 * ENOSPC actually can happen if we're in a removename with
+		 * no space reservation, and the resulting block removal
+		 * would cause a bmap btree split or conversion from extents
+		 * to btree.  This can only happen for un-fragmented
+		 * directory blocks, since you need to be punching out
+		 * the middle of an extent.
+		 * In this case we need to leave the block in the file,
+		 * and not binval it.
+		 * So the block has to be in a consistent empty state
+		 * and appropriately logged.
+		 * We don't free up the buffer, the caller can tell it
+		 * hasn't happened since it got an error back.
+		 */
+		return error;
+	}
+	ASSERT(done);
+	/*
+	 * Invalidate the buffer from the transaction.
+	 */
+	xfs_da_binval(tp, bp);
+	/*
+	 * If it's not a data block, we're done.
+	 */
+	if (db >= XFS_DIR2_LEAF_FIRSTDB(mp))
+		return 0;
+	/*
+	 * If the block isn't the last one in the directory, we're done.
+	 */
+	if (dp->i_d.di_size > XFS_DIR2_DB_OFF_TO_BYTE(mp, db + 1, 0))
+		return 0;
+	bno = da;
+	if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
+		/*
+		 * This can't really happen unless there's kernel corruption.
+		 */
+		return error;
+	}
+	if (db == mp->m_dirdatablk)
+		ASSERT(bno == 0);
+	else
+		ASSERT(bno > 0);
+	/*
+	 * Set the size to the new last block.
+	 */
+	dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	return 0;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_H__
+#define __XFS_DIR2_H__
+
+struct dirent;
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_dir2_put_args;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Directory version 2.
+ * There are 4 possible formats:
+ *	shortform
+ *	single block - data with embedded leaf at the end
+ *	multiple data blocks, single leaf+freeindex block
+ *	data blocks, node&leaf blocks (btree), freeindex blocks
+ *
+ *	The shortform format is in xfs_dir2_sf.h.
+ *	The single block format is in xfs_dir2_block.h.
+ *	The data block format is in xfs_dir2_data.h.
+ *	The leaf and freeindex block formats are in xfs_dir2_leaf.h.
+ *	Node blocks are the same as the other version, in xfs_da_btree.h.
+ */
+
+/*
+ * Byte offset in data block and shortform entry.
+ */
+typedef __uint16_t	xfs_dir2_data_off_t;
+#define NULLDATAOFF	0xffffU
+typedef uint		xfs_dir2_data_aoff_t;	/* argument form */
+
+/*
+ * Directory block number (logical dirblk in file)
+ */
+typedef __uint32_t	xfs_dir2_db_t;
+
+/*
+ * Byte offset in a directory.
+ */
+typedef xfs_off_t		xfs_dir2_off_t;
+
+/*
+ * For getdents, argument struct for put routines.
+ */
+typedef int (*xfs_dir2_put_t)(struct xfs_dir2_put_args *pa);
+typedef struct xfs_dir2_put_args {
+	xfs_off_t		cook;		/* cookie of (next) entry */
+	xfs_intino_t	ino;		/* inode number */
+	struct xfs_dirent	*dbp;		/* buffer pointer */
+	char		*name;		/* directory entry name */
+	int		namelen;	/* length of name */
+	int		done;		/* output: set if value was stored */
+	xfs_dir2_put_t	put;		/* put function ptr (i/o) */
+	struct uio	*uio;		/* uio control structure */
+} xfs_dir2_put_args_t;
+
+#define XFS_DIR_IS_V2(mp)	((mp)->m_dirversion == 2)
+extern xfs_dirops_t	xfsv2_dirops;
+
+/*
+ * Other interfaces used by the rest of the dir v2 code.
+ */
+extern int
+	xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+			    xfs_dir2_db_t *dbp);
+
+extern int
+	xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
+
+extern int
+	xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
+
+extern int
+	xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+			      struct xfs_dabuf *bp);
+
+#endif	/* __XFS_DIR2_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_block.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_block.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_block.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_block.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1231 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_block.c
+ * XFS V2 directory implementation, single-block form.
+ * See xfs_dir2_block.h for the format.
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Local function prototypes.
+ */
+static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first,
+				    int last);
+static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp);
+static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp,
+				     int *entno);
+static int xfs_dir2_block_sort(const void *a, const void *b);
+
+/*
+ * Add an entry to a block directory.
+ */
+int						/* error */
+xfs_dir2_block_addname(
+	xfs_da_args_t		*args)		/* directory op arguments */
+{
+	xfs_dir2_data_free_t	*bf;		/* bestfree table in block */
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* buffer for block */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	int			compact;	/* need to compact leaf ents */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* block unused entry */
+	int			error;		/* error return value */
+	xfs_dir2_data_unused_t	*enddup=NULL;	/* unused at end of data */
+	xfs_dahash_t		hash;		/* hash value of found entry */
+	int			high;		/* high index for binary srch */
+	int			highstale;	/* high stale index */
+	int			lfloghigh=0;	/* last final leaf to log */
+	int			lfloglow=0;	/* first final leaf to log */
+	int			len;		/* length of the new entry */
+	int			low;		/* low index for binary srch */
+	int			lowstale;	/* low stale index */
+	int			mid=0;		/* midpoint for binary srch */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log header */
+	int			needscan;	/* need to rescan freespace */
+	xfs_dir2_data_off_t	*tagp;		/* pointer to tag value */
+	xfs_trans_t		*tp;		/* transaction structure */
+
+	xfs_dir2_trace_args("block_addname", args);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Read the (one and only) directory block into dabuf bp.
+	 */
+	if ((error =
+	    xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	block = bp->data;
+	/*
+	 * Check the magic number, corrupted if wrong.
+	 */
+	if (INT_GET(block->hdr.magic, ARCH_CONVERT) != XFS_DIR2_BLOCK_MAGIC) {
+		xfs_da_brelse(tp, bp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	len = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+	/*
+	 * Set up pointers to parts of the block.
+	 */
+	bf = block->hdr.bestfree;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * No stale entries?  Need space for entry and new leaf.
+	 */
+	if (INT_ISZERO(btp->stale, ARCH_CONVERT)) {
+		/*
+		 * Tag just before the first leaf entry.
+		 */
+		tagp = (xfs_dir2_data_off_t *)blp - 1;
+		/*
+		 * Data object just before the first leaf entry.
+		 */
+		enddup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
+		/*
+		 * If it's not free then we can't do this add without cleaning up:
+		 * the space before the first leaf entry needs to be free so it
+		 * can be expanded to hold the pointer to the new entry.
+		 */
+		if (INT_GET(enddup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
+			dup = enddup = NULL;
+		/*
+		 * Check out the biggest freespace and see if it's the same one.
+		 */
+		else {
+			dup = (xfs_dir2_data_unused_t *)
+			      ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
+			if (dup == enddup) {
+				/*
+				 * It is the biggest freespace, is it too small
+				 * It is the biggest freespace; is it too
+				 * small to hold the new leaf as well?
+				if (INT_GET(dup->length, ARCH_CONVERT) < len + (uint)sizeof(*blp)) {
+					/*
+					 * Yes, we use the second-largest
+					 * entry instead if it works.
+					 */
+					if (INT_GET(bf[1].length, ARCH_CONVERT) >= len)
+						dup = (xfs_dir2_data_unused_t *)
+						      ((char *)block +
+						       INT_GET(bf[1].offset, ARCH_CONVERT));
+					else
+						dup = NULL;
+				}
+			} else {
+				/*
+				 * Not the same free entry,
+				 * just check its length.
+				 */
+				if (INT_GET(dup->length, ARCH_CONVERT) < len) {
+					dup = NULL;
+				}
+			}
+		}
+		compact = 0;
+	}
+	/*
+	 * If there are stale entries we'll use one for the leaf.
+	 * Is the biggest entry enough to avoid compaction?
+	 */
+	else if (INT_GET(bf[0].length, ARCH_CONVERT) >= len) {
+		dup = (xfs_dir2_data_unused_t *)
+		      ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
+		compact = 0;
+	}
+	/*
+	 * Will need to compact to make this work.
+	 */
+	else {
+		/*
+		 * Tag just before the first leaf entry.
+		 */
+		tagp = (xfs_dir2_data_off_t *)blp - 1;
+		/*
+		 * Data object just before the first leaf entry.
+		 */
+		dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
+		/*
+		 * If it's not free then the data will go where the
+		 * leaf data starts now, if it works at all.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			if (INT_GET(dup->length, ARCH_CONVERT) + (INT_GET(btp->stale, ARCH_CONVERT) - 1) *
+			    (uint)sizeof(*blp) < len)
+				dup = NULL;
+		} else if ((INT_GET(btp->stale, ARCH_CONVERT) - 1) * (uint)sizeof(*blp) < len)
+			dup = NULL;
+		else
+			dup = (xfs_dir2_data_unused_t *)blp;
+		compact = 1;
+	}
+	/*
+	 * If this isn't a real add, we're done with the buffer.
+	 */
+	if (args->justcheck)
+		xfs_da_brelse(tp, bp);
+	/*
+	 * If we don't have space for the new entry & leaf ...
+	 */
+	if (!dup) {
+		/*
+		 * Not trying to actually do anything, or don't have
+		 * a space reservation: return no-space.
+		 */
+		if (args->justcheck || args->total == 0)
+			return XFS_ERROR(ENOSPC);
+		/*
+		 * Convert to the next larger format.
+		 * Then add the new entry in that format.
+		 */
+		error = xfs_dir2_block_to_leaf(args, bp);
+		xfs_da_buf_done(bp);
+		if (error)
+			return error;
+		return xfs_dir2_leaf_addname(args);
+	}
+	/*
+	 * Just checking, and it would work, so say so.
+	 */
+	if (args->justcheck)
+		return 0;
+	needlog = needscan = 0;
+	/*
+	 * If need to compact the leaf entries, do it now.
+	 * Leave the highest-numbered stale entry stale.
+	 * XXX should be the one closest to mid but mid is not yet computed.
+	 */
+	if (compact) {
+		int	fromidx;		/* source leaf index */
+		int	toidx;			/* target leaf index */
+
+		for (fromidx = toidx = INT_GET(btp->count, ARCH_CONVERT) - 1,
+			highstale = lfloghigh = -1;
+		     fromidx >= 0;
+		     fromidx--) {
+			if (INT_GET(blp[fromidx].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) {
+				if (highstale == -1)
+					highstale = toidx;
+				else {
+					if (lfloghigh == -1)
+						lfloghigh = toidx;
+					continue;
+				}
+			}
+			if (fromidx < toidx)
+				blp[toidx] = blp[fromidx];
+			toidx--;
+		}
+		lfloglow = toidx + 1 - (INT_GET(btp->stale, ARCH_CONVERT) - 1);
+		lfloghigh -= INT_GET(btp->stale, ARCH_CONVERT) - 1;
+		INT_MOD(btp->count, ARCH_CONVERT, -(INT_GET(btp->stale, ARCH_CONVERT) - 1));
+		xfs_dir2_data_make_free(tp, bp,
+			(xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
+			(xfs_dir2_data_aoff_t)((INT_GET(btp->stale, ARCH_CONVERT) - 1) * sizeof(*blp)),
+			&needlog, &needscan);
+		blp += INT_GET(btp->stale, ARCH_CONVERT) - 1;
+		INT_SET(btp->stale, ARCH_CONVERT, 1);
+		/*
+		 * If we now need to rebuild the bestfree map, do so.
+		 * This needs to happen before the next call to use_free.
+		 */
+		if (needscan) {
+			xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
+				&needlog, NULL);
+			needscan = 0;
+		}
+	}
+	/*
+	 * Set leaf logging boundaries to impossible state.
+	 * For the no-stale case they're set explicitly.
+	 */
+	else if (INT_GET(btp->stale, ARCH_CONVERT)) {
+		lfloglow = INT_GET(btp->count, ARCH_CONVERT);
+		lfloghigh = -1;
+	}
+	/*
+	 * Find the slot that's first lower than our hash value, -1 if none.
+	 */
+	for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; low <= high; ) {
+		mid = (low + high) >> 1;
+		if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+	}
+	while (mid >= 0 && INT_GET(blp[mid].hashval, ARCH_CONVERT) >= args->hashval) {
+		mid--;
+	}
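+	/*
+	 * Note: mid now indexes the last leaf entry whose hashval is
+	 * strictly less than args->hashval (-1 if none); the code
+	 * below opens a slot just after it, either by shifting the
+	 * table into freed space or by reusing a nearby stale entry.
+	 */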
+	/*
+	 * No stale entries, will use enddup space to hold new leaf.
+	 */
+	if (INT_ISZERO(btp->stale, ARCH_CONVERT)) {
+		/*
+		 * Mark the space needed for the new leaf entry, now in use.
+		 */
+		xfs_dir2_data_use_free(tp, bp, enddup,
+			(xfs_dir2_data_aoff_t)
+			((char *)enddup - (char *)block + INT_GET(enddup->length, ARCH_CONVERT) -
+			 sizeof(*blp)),
+			(xfs_dir2_data_aoff_t)sizeof(*blp),
+			&needlog, &needscan);
+		/*
+		 * Update the tail (entry count).
+		 */
+		INT_MOD(btp->count, ARCH_CONVERT, +1);
+		/*
+		 * If we now need to rebuild the bestfree map, do so.
+		 * This needs to happen before the next call to use_free.
+		 */
+		if (needscan) {
+			xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
+				&needlog, NULL);
+			needscan = 0;
+		}
+		/*
+		 * Adjust pointer to the first leaf entry, we're about to move
+		 * the table up one to open up space for the new leaf entry.
+		 * Then adjust our index to match.
+		 */
+		blp--;
+		mid++;
+		if (mid)
+			ovbcopy(&blp[1], blp, mid * sizeof(*blp));
+		lfloglow = 0;
+		lfloghigh = mid;
+	}
+	/*
+	 * Use a stale leaf for our new entry.
+	 */
+	else {
+		for (lowstale = mid;
+		     lowstale >= 0 &&
+			INT_GET(blp[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR;
+		     lowstale--)
+			continue;
+		for (highstale = mid + 1;
+		     highstale < INT_GET(btp->count, ARCH_CONVERT) &&
+			INT_GET(blp[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR &&
+			(lowstale < 0 || mid - lowstale > highstale - mid);
+		     highstale++)
+			continue;
+		/*
+		 * Move entries toward the low-numbered stale entry.
+		 */
+		if (lowstale >= 0 &&
+		    (highstale == INT_GET(btp->count, ARCH_CONVERT) ||
+		     mid - lowstale <= highstale - mid)) {
+			if (mid - lowstale)
+				ovbcopy(&blp[lowstale + 1], &blp[lowstale],
+					(mid - lowstale) * sizeof(*blp));
+			lfloglow = MIN(lowstale, lfloglow);
+			lfloghigh = MAX(mid, lfloghigh);
+		}
+		/*
+		 * Move entries toward the high-numbered stale entry.
+		 */
+		else {
+			ASSERT(highstale < INT_GET(btp->count, ARCH_CONVERT));
+			mid++;
+			if (highstale - mid)
+				ovbcopy(&blp[mid], &blp[mid + 1],
+					(highstale - mid) * sizeof(*blp));
+			lfloglow = MIN(mid, lfloglow);
+			lfloghigh = MAX(highstale, lfloghigh);
+		}
+		INT_MOD(btp->stale, ARCH_CONVERT, -1);
+	}
+	/*
+	 * Point to the new data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	/*
+	 * Fill in the leaf entry.
+	 */
+	INT_SET(blp[mid].hashval, ARCH_CONVERT, args->hashval);
+	INT_SET(blp[mid].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
+	xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
+	/*
+	 * Mark space for the data entry used.
+	 */
+	xfs_dir2_data_use_free(tp, bp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
+		(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+	/*
+	 * Create the new data entry.
+	 */
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->namelen = args->namelen;
+	bcopy(args->name, dep->name, args->namelen);
+	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+	INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
+	/*
+	 * Clean up the bestfree array and log the header, tail, and entry.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
+			NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, bp);
+	xfs_dir2_block_log_tail(tp, bp);
+	xfs_dir2_data_log_entry(tp, bp, dep);
+	xfs_dir2_data_check(dp, bp);
+	xfs_da_buf_done(bp);
+	return 0;
+}
+
+/*
+ * Readdir for block directories.
+ */
+int						/* error */
+xfs_dir2_block_getdents(
+	xfs_trans_t		*tp,		/* transaction (NULL) */
+	xfs_inode_t		*dp,		/* incore inode */
+	uio_t			*uio,		/* caller's buffer control */
+	int			*eofp,		/* eof reached? (out) */
+	xfs_dirent_t		*dbp,		/* caller's buffer */
+	xfs_dir2_put_t		put)		/* abi's formatting function */
+{
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dabuf_t		*bp;		/* buffer for block */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_dir2_data_unused_t	*dup;		/* block unused entry */
+	char			*endptr;	/* end of the data entries */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_put_args_t	p;		/* arg package for put rtn */
+	char			*ptr;		/* current data entry */
+	char			*savptr;	/* saved data entry */
+	int			wantoff;	/* starting block offset */
+
+	mp = dp->i_mount;
+	/*
+	 * If the block number in the offset is out of range, we're done.
+	 */
+	if (XFS_DIR2_DATAPTR_TO_DB(mp, uio->uio_offset) > mp->m_dirdatablk) {
+		*eofp = 1;
+		return 0;
+	}
+	/*
+	 * Can't read the block, give up, else get dabuf in bp.
+	 */
+	if ((error =
+	    xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	/*
+	 * Extract the byte offset we start at from the seek pointer.
+	 * We'll skip entries before this.
+	 */
+	wantoff = XFS_DIR2_DATAPTR_TO_OFF(mp, uio->uio_offset);
+	block = bp->data;
+	xfs_dir2_data_check(dp, bp);
+	/*
+	 * Set up values for the loop.
+	 */
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	ptr = (char *)block->u;
+	endptr = (char *)XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	p.dbp = dbp;
+	p.put = put;
+	p.uio = uio;
+	/*
+	 * Loop over the data portion of the block.
+	 * Each object is a real entry (dep) or an unused one (dup).
+	 */
+	while (ptr < endptr) {
+		dup = (xfs_dir2_data_unused_t *)ptr;
+		/*
+		 * Unused, skip it.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			ptr += INT_GET(dup->length, ARCH_CONVERT);
+			continue;
+		}
+
+		dep = (xfs_dir2_data_entry_t *)ptr;
+
+		savptr = ptr;		/* In case we need it.. */
+
+		/*
+		 * Bump pointer for the next iteration.
+		 */
+		ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+		/*
+		 * The entry is before the desired starting point, skip it.
+		 */
+		if ((char *)dep - (char *)block < wantoff)
+			continue;
+		/*
+		 * Set up argument structure for put routine.
+		 */
+		p.namelen = dep->namelen;
+
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+						    savptr - (char *)block);
+#if XFS_BIG_FILESYSTEMS
+		p.ino = INT_GET(dep->inumber, ARCH_CONVERT) + mp->m_inoadd;
+#else
+		p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
+#endif
+		p.name = (char *)dep->name;
+
+		/*
+		 * Put the entry in the caller's buffer.
+		 */
+		error = p.put(&p);
+
+		/*
+		 * If it didn't fit, set the final offset to here & return.
+		 */
+		if (!p.done) {
+			uio->uio_offset =
+				XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+					(char *)dep - (char *)block);
+			xfs_da_brelse(tp, bp);
+			return error;
+		}
+	}
+
+	/*
+	 * Reached the end of the block.
+	 * Set the offset to byte 0 of the (nonexistent) next block and return.
+	 */
+	*eofp = 1;
+
+	uio->uio_offset =
+		XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0);
+
+	xfs_da_brelse(tp, bp);
+
+	return 0;
+}
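+
+/*
+ * Note: the readdir cookie packs (block, offset) into one value with
+ * XFS_DIR2_DB_OFF_TO_DATAPTR; XFS_DIR2_DATAPTR_TO_DB/_TO_OFF above
+ * invert it, which is how a resumed readdir seeks straight back to
+ * the saved entry.
+ */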
+
+/*
+ * Log leaf entries from the block.
+ */
+static void
+xfs_dir2_block_log_leaf(
+	xfs_trans_t		*tp,		/* transaction structure */
+	xfs_dabuf_t		*bp,		/* block buffer */
+	int			first,		/* index of first logged leaf */
+	int			last)		/* index of last logged leaf */
+{
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	mp = tp->t_mountp;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block),
+		(uint)((char *)&blp[last + 1] - (char *)block - 1));
+}
+
+/*
+ * Log the block tail.
+ */
+static void
+xfs_dir2_block_log_tail(
+	xfs_trans_t		*tp,		/* transaction structure */
+	xfs_dabuf_t		*bp)		/* block buffer */
+{
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	mp = tp->t_mountp;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block),
+		(uint)((char *)(btp + 1) - (char *)block - 1));
+}
+
+/*
+ * Look up an entry in the block.  This is the external routine,
+ * xfs_dir2_block_lookup_int does the real work.
+ */
+int						/* error */
+xfs_dir2_block_lookup(
+	xfs_da_args_t		*args)		/* dir lookup arguments */
+{
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	xfs_dir2_trace_args("block_lookup", args);
+	/*
+	 * Get the buffer, look up the entry.
+	 * If not found (ENOENT) then return, have no buffer.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
+		return error;
+	dp = args->dp;
+	mp = dp->i_mount;
+	block = bp->data;
+	xfs_dir2_data_check(dp, bp);
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Get the offset from the leaf entry, to point to the data.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
+	/*
+	 * Fill in inode number, release the block.
+	 */
+	args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+	xfs_da_brelse(args->trans, bp);
+	return XFS_ERROR(EEXIST);
+}
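+
+/*
+ * Note: EEXIST above is the "found it" signal; the top-level
+ * xfs_dir2_lookup() in xfs_dir2.c converts EEXIST to 0 after
+ * copying out the inode number.
+ */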
+
+/*
+ * Internal block lookup routine.
+ */
+static int					/* error */
+xfs_dir2_block_lookup_int(
+	xfs_da_args_t		*args,		/* dir lookup arguments */
+	xfs_dabuf_t		**bpp,		/* returned block buffer */
+	int			*entno)		/* returned entry number */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			error;		/* error return value */
+	xfs_dahash_t		hash;		/* found hash value */
+	int			high;		/* binary search high index */
+	int			low;		/* binary search low index */
+	int			mid;		/* binary search current idx */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Read the buffer, return error if we can't get it.
+	 */
+	if ((error =
+	    xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	block = bp->data;
+	xfs_dir2_data_check(dp, bp);
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Loop doing a binary search for our hash value.
+	 * Find our entry, ENOENT if it's not there.
+	 */
+	for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; ; ) {
+		ASSERT(low <= high);
+		mid = (low + high) >> 1;
+		if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+		if (low > high) {
+			ASSERT(args->oknoent);
+			xfs_da_brelse(tp, bp);
+			return XFS_ERROR(ENOENT);
+		}
+	}
+	/*
+	 * Back up to the first one with the right hash value.
+	 */
+	while (mid > 0 && INT_GET(blp[mid - 1].hashval, ARCH_CONVERT) == args->hashval) {
+		mid--;
+	}
+	/*
+	 * Now loop forward through all the entries with the
+	 * right hash value looking for our name.
+	 */
+	do {
+		if ((addr = INT_GET(blp[mid].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Get pointer to the entry from the leaf.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+			((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr));
+		/*
+		 * Compare, if it's right give back buffer & entry number.
+		 */
+		if (dep->namelen == args->namelen &&
+		    dep->name[0] == args->name[0] &&
+		    bcmp(dep->name, args->name, args->namelen) == 0) {
+			*bpp = bp;
+			*entno = mid;
+			return 0;
+		}
+	} while (++mid < INT_GET(btp->count, ARCH_CONVERT) && INT_GET(blp[mid].hashval, ARCH_CONVERT) == hash);
+	/*
+	 * No match, release the buffer and return ENOENT.
+	 */
+	ASSERT(args->oknoent);
+	xfs_da_brelse(tp, bp);
+	return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Remove an entry from a block format directory.
+ * If that makes the block small enough to fit in shortform, transform it.
+ */
+int						/* error */
+xfs_dir2_block_removename(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf pointer */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* block leaf entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to fixup bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* shortform size */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args("block_removename", args);
+	/*
+	 * Look up the entry in the block.  Gets the buffer and entry index.
+	 * It will always be there; the vnodeops level does a lookup first.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Point to the data entry using the leaf entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
+	/*
+	 * Mark the data entry's space free.
+	 */
+	needlog = needscan = 0;
+	xfs_dir2_data_make_free(tp, bp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)block),
+		XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
+	/*
+	 * Fix up the block tail.
+	 */
+	INT_MOD(btp->stale, ARCH_CONVERT, +1);
+	xfs_dir2_block_log_tail(tp, bp);
+	/*
+	 * Remove the leaf entry by marking it stale.
+	 */
+	INT_SET(blp[ent].address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_block_log_leaf(tp, bp, ent, ent);
+	/*
+	 * Fix up bestfree, log the header if necessary.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
+			NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, bp);
+	xfs_dir2_data_check(dp, bp);
+	/*
+	 * See if the remaining entries are small enough to fit in shortform.
+	 */
+	if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
+	    XFS_IFORK_DSIZE(dp)) {
+		xfs_da_buf_done(bp);
+		return 0;
+	}
+	/*
+	 * If it works, do the conversion.
+	 */
+	return xfs_dir2_block_to_sf(args, bp, size, &sfh);
+}
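+
+/*
+ * Sizing note (a sketch; the exact figure is an assumption and depends
+ * on inode size and version): xfs_dir2_block_sfsize() computes how big
+ * the directory would be as a shortform image (header plus one entry
+ * per name).  The conversion above happens only when that image fits
+ * in the inode's data fork literal area, XFS_IFORK_DSIZE(dp) -- on the
+ * order of 150 bytes for a 256-byte inode.
+ */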
+
+/*
+ * Replace an entry in a V2 block directory.
+ * Change the inode number to the new value.
+ */
+int						/* error */
+xfs_dir2_block_replace(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* leaf entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	xfs_dir2_trace_args("block_replace", args);
+	/*
+	 * Look up the entry in the directory.  Get buffer and entry index.
+	 * This will always succeed since the caller has already done a lookup.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	mp = dp->i_mount;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Point to the data entry we need to change.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
+	ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) != args->inumber);
+	/*
+	 * Change the inode number to the new value.
+	 */
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	xfs_dir2_data_log_entry(args->trans, bp, dep);
+	xfs_dir2_data_check(dp, bp);
+	xfs_da_buf_done(bp);
+	return 0;
+}
+
+/*
+ * Qsort comparison routine for the block leaf entries.
+ */
+static int					/* sort order */
+xfs_dir2_block_sort(
+	const void			*a,	/* first leaf entry */
+	const void			*b)	/* second leaf entry */
+{
+	const xfs_dir2_leaf_entry_t	*la;	/* first leaf entry */
+	const xfs_dir2_leaf_entry_t	*lb;	/* second leaf entry */
+
+	la = a;
+	lb = b;
+	return INT_GET(la->hashval, ARCH_CONVERT) < INT_GET(lb->hashval, ARCH_CONVERT) ? -1 :
+		(INT_GET(la->hashval, ARCH_CONVERT) > INT_GET(lb->hashval, ARCH_CONVERT) ? 1 : 0);
+}
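+
+/*
+ * Design note: the comparator yields ascending hashval order and
+ * returns 0 for equal keys; qsort() need not be stable here because
+ * entries with equal hash values are disambiguated later by comparing
+ * names, as in xfs_dir2_block_lookup_int() above.
+ */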
+
+/*
+ * Convert a V2 leaf directory to a V2 block directory if possible.
+ */
+int						/* error */
+xfs_dir2_leaf_to_block(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*lbp,		/* leaf buffer */
+	xfs_dabuf_t		*dbp)		/* data buffer */
+{
+	xfs_dir2_data_off_t	*bestsp;	/* leaf bests table */
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	int			error;		/* error return value */
+	int			from;		/* leaf from index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* file system mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to scan for bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* bytes used */
+	xfs_dir2_data_off_t	*tagp;		/* end of entry (tag) */
+	int			to;		/* block/leaf to index */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = lbp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	/*
+	 * If there are data blocks other than the first one, take this
+	 * opportunity to remove trailing empty data blocks that may have
+	 * been left behind during no-space-reservation operations.
+	 * These will show up in the leaf bests table.
+	 */
+	while (dp->i_d.di_size > mp->m_dirblksize) {
+		bestsp = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT);
+		if (INT_GET(bestsp[INT_GET(ltp->bestcount, ARCH_CONVERT) - 1], ARCH_CONVERT) ==
+		    mp->m_dirblksize - (uint)sizeof(block->hdr)) {
+			if ((error =
+			    xfs_dir2_leaf_trim_data(args, lbp,
+				    (xfs_dir2_db_t)(INT_GET(ltp->bestcount, ARCH_CONVERT) - 1))))
+				goto out;
+		} else {
+			error = 0;
+			goto out;
+		}
+	}
+	/*
+	 * Read the data block if we don't already have it; give up if that fails.
+	 */
+	if (dbp == NULL &&
+	    (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
+		    XFS_DATA_FORK))) {
+		goto out;
+	}
+	block = dbp->data;
+	ASSERT(INT_GET(block->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
+	/*
+	 * Size of the "leaf" area in the block.
+	 */
+	size = (uint)sizeof(block->tail) +
+	       (uint)sizeof(*lep) * (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
+	/*
+	 * Look at the last data entry.
+	 */
+	tagp = (xfs_dir2_data_off_t *)((char *)block + mp->m_dirblksize) - 1;
+	dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
+	/*
+	 * If it's not free or is too short, we can't do it.
+	 */
+	if (INT_GET(dup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG || INT_GET(dup->length, ARCH_CONVERT) < size) {
+		error = 0;
+		goto out;
+	}
+	/*
+	 * Start converting it to block form.
+	 */
+	INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC);
+	needlog = 1;
+	needscan = 0;
+	/*
+	 * Use up the space at the end of the block (blp/btp).
+	 */
+	xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size,
+		&needlog, &needscan);
+	/*
+	 * Initialize the block tail.
+	 */
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	INT_SET(btp->count, ARCH_CONVERT, INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
+	INT_ZERO(btp->stale, ARCH_CONVERT);
+	xfs_dir2_block_log_tail(tp, dbp);
+	/*
+	 * Initialize the block leaf area.  We compact out stale entries.
+	 */
+	lep = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
+		if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		lep[to++] = leaf->ents[from];
+	}
+	ASSERT(to == INT_GET(btp->count, ARCH_CONVERT));
+	xfs_dir2_block_log_leaf(tp, dbp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1);
+	/*
+	 * Scan the bestfree if we need it and log the data block header.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
+			NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	/*
+	 * Pitch the old leaf block.
+	 */
+	error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp);
+	lbp = NULL;
+	if (error) {
+		goto out;
+	}
+	/*
+	 * Now see if the resulting block can be shrunk to shortform.
+	 */
+	if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
+	    XFS_IFORK_DSIZE(dp)) {
+		error = 0;
+		goto out;
+	}
+	return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
+out:
+	if (lbp)
+		xfs_da_buf_done(lbp);
+	if (dbp)
+		xfs_da_buf_done(dbp);
+	return error;
+}
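+
+/*
+ * Worked example (assumed sizes: 8-byte leaf entries, 8-byte tail):
+ * with leaf->hdr.count == 100 and leaf->hdr.stale == 4, the conversion
+ * needs size = 8 + 8 * (100 - 4) = 776 free bytes at the end of the
+ * data block to hold the compacted leaf array plus the block tail.
+ */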
+
+/*
+ * Convert the shortform directory to block form.
+ */
+int						/* error */
+xfs_dir2_sf_to_block(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dir2_db_t		blkno;		/* dir-relative block # (0) */
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
+	char			*buf;		/* sf buffer */
+	int			buf_len;
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			dummy;		/* trash */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
+	int			endoffset;	/* end of data objects */
+	int			error;		/* error return value */
+	int			i;		/* index */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to scan block freespc */
+	int			newoffset;	/* offset from current entry */
+	int			offset;		/* target block offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* sf entry pointer */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+	xfs_dir2_data_off_t	*tagp;		/* end of data entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args("sf_to_block", args);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Bomb out if the shortform directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+	/*
+	 * Copy the directory into a temporary heap buffer.
+	 * Then pitch the incore inode data so we can make extents.
+	 */
+
+	buf_len = dp->i_df.if_bytes;
+	buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
+
+	bcopy(sfp, buf, dp->i_df.if_bytes);
+	xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
+	dp->i_d.di_size = 0;
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	/*
+	 * Reset pointer - old sfp is gone.
+	 */
+	sfp = (xfs_dir2_sf_t *)buf;
+	/*
+	 * Add block 0 to the inode.
+	 */
+	error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
+	if (error) {
+		kmem_free(buf, buf_len);
+		return error;
+	}
+	/*
+	 * Initialize the data block.
+	 */
+	error = xfs_dir2_data_init(args, blkno, &bp);
+	if (error) {
+		kmem_free(buf, buf_len);
+		return error;
+	}
+	block = bp->data;
+	INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC);
+	/*
+	 * Compute size of block "tail" area.
+	 */
+	i = (uint)sizeof(*btp) +
+	    (INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
+	/*
+	 * The whole thing is initialized to free by the init routine.
+	 * Say we're using the leaf and tail area.
+	 */
+	dup = (xfs_dir2_data_unused_t *)block->u;
+	needlog = needscan = 0;
+	xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog,
+		&needscan);
+	ASSERT(needscan == 0);
+	/*
+	 * Fill in the tail.
+	 */
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	INT_SET(btp->count, ARCH_CONVERT, INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2);	/* ., .. */
+	INT_ZERO(btp->stale, ARCH_CONVERT);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	endoffset = (uint)((char *)blp - (char *)block);
+	/*
+	 * Remove the freespace; we'll manage it ourselves.
+	 */
+	xfs_dir2_data_use_free(tp, bp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
+		INT_GET(dup->length, ARCH_CONVERT), &needlog, &needscan);
+	/*
+	 * Create entry for .
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)block + XFS_DIR2_DATA_DOT_OFFSET);
+	INT_SET(dep->inumber, ARCH_CONVERT, dp->i_ino);
+	dep->namelen = 1;
+	dep->name[0] = '.';
+	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+	INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
+	xfs_dir2_data_log_entry(tp, bp, dep);
+	INT_SET(blp[0].hashval, ARCH_CONVERT, xfs_dir_hash_dot);
+	INT_SET(blp[0].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
+	/*
+	 * Create entry for ..
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+		((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET);
+	INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT));
+	dep->namelen = 2;
+	dep->name[0] = dep->name[1] = '.';
+	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+	INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
+	xfs_dir2_data_log_entry(tp, bp, dep);
+	INT_SET(blp[1].hashval, ARCH_CONVERT, xfs_dir_hash_dotdot);
+	INT_SET(blp[1].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
+	offset = XFS_DIR2_DATA_FIRST_OFFSET;
+	/*
+	 * Loop over existing entries, stuff them in.
+	 */
+	if ((i = 0) == INT_GET(sfp->hdr.count, ARCH_CONVERT))
+		sfep = NULL;
+	else
+		sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	/*
+	 * Need to preserve the existing offset values in the sf directory.
+	 * Insert holes (unused entries) where necessary.
+	 */
+	while (offset < endoffset) {
+		/*
+		 * sfep is null when we reach the end of the list.
+		 */
+		if (sfep == NULL)
+			newoffset = endoffset;
+		else
+			newoffset = XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT);
+		/*
+		 * There should be a hole here; make one.
+		 */
+		if (offset < newoffset) {
+			dup = (xfs_dir2_data_unused_t *)
+			      ((char *)block + offset);
+			INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+			INT_SET(dup->length, ARCH_CONVERT, newoffset - offset);
+			INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT), ARCH_CONVERT,
+				(xfs_dir2_data_off_t)
+				((char *)dup - (char *)block));
+			xfs_dir2_data_log_unused(tp, bp, dup);
+			(void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block,
+				dup, &dummy);
+			offset += INT_GET(dup->length, ARCH_CONVERT);
+			continue;
+		}
+		/*
+		 * Copy a real entry.
+		 */
+		dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset);
+		INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+				XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT));
+		dep->namelen = sfep->namelen;
+		bcopy(sfep->name, dep->name, dep->namelen);
+		tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+		INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
+		xfs_dir2_data_log_entry(tp, bp, dep);
+		INT_SET(blp[2 + i].hashval, ARCH_CONVERT, xfs_da_hashname((char *)sfep->name, sfep->namelen));
+		INT_SET(blp[2 + i].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp,
+						 (char *)dep - (char *)block));
+		offset = (int)((char *)(tagp + 1) - (char *)block);
+		if (++i == INT_GET(sfp->hdr.count, ARCH_CONVERT))
+			sfep = NULL;
+		else
+			sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+	}
+	/* Done with the temporary buffer */
+	kmem_free(buf, buf_len);
+	/*
+	 * Sort the leaf entries by hash value.
+	 */
+	qsort(blp, INT_GET(btp->count, ARCH_CONVERT), sizeof(*blp), xfs_dir2_block_sort);
+	/*
+	 * Log the leaf entry area and tail.
+	 * The header was already logged in data_init, so ignore needlog.
+	 */
+	ASSERT(needscan == 0);
+	xfs_dir2_block_log_leaf(tp, bp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1);
+	xfs_dir2_block_log_tail(tp, bp);
+	xfs_dir2_data_check(dp, bp);
+	xfs_da_buf_done(bp);
+	return 0;
+}
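+
+/*
+ * Resulting layout, sketched for an assumed 4096-byte directory block:
+ * the 16-byte data header occupies [0,16), "." sits at offset 16 and
+ * ".." at 32, the first user entry starts at 48, the 8-byte block tail
+ * occupies [4088,4096), and the (count + 2) 8-byte leaf entries sit
+ * immediately below the tail, sorted by hashval by the qsort above.
+ */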
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_block.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_block.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_block.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_block.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_BLOCK_H__
+#define __XFS_DIR2_BLOCK_H__
+
+/*
+ * xfs_dir2_block.h
+ * Directory version 2, single block format structures
+ */
+
+struct xfs_dirent;
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_leaf_entry;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * The single block format is as follows:
+ * xfs_dir2_data_hdr_t structure
+ * xfs_dir2_data_entry_t and xfs_dir2_data_unused_t structures
+ * xfs_dir2_leaf_entry_t structures
+ * xfs_dir2_block_tail_t structure
+ */
+
+#define XFS_DIR2_BLOCK_MAGIC	0x58443242	/* XD2B: for one block dirs */
+
+typedef struct xfs_dir2_block_tail {
+	__uint32_t	count;			/* count of leaf entries */
+	__uint32_t	stale;			/* count of stale lf entries */
+} xfs_dir2_block_tail_t;
+
+/*
+ * Generic single-block structure, for xfs_db.
+ */
+typedef struct xfs_dir2_block {
+	xfs_dir2_data_hdr_t	hdr;		/* magic XFS_DIR2_BLOCK_MAGIC */
+	xfs_dir2_data_union_t	u[1];
+	xfs_dir2_leaf_entry_t	leaf[1];
+	xfs_dir2_block_tail_t	tail;
+} xfs_dir2_block_t;
+
+/*
+ * Pointer to the block tail embedded in a data block (1-block format)
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BLOCK_TAIL_P)
+xfs_dir2_block_tail_t *
+xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block);
+#define XFS_DIR2_BLOCK_TAIL_P(mp,block) xfs_dir2_block_tail_p(mp,block)
+#else
+#define XFS_DIR2_BLOCK_TAIL_P(mp,block) \
+	(((xfs_dir2_block_tail_t *)((char *)(block) + (mp)->m_dirblksize)) - 1)
+#endif
+
+/*
+ * Pointer to the leaf entries embedded in a data block (1-block format)
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BLOCK_LEAF_P)
+struct xfs_dir2_leaf_entry *xfs_dir2_block_leaf_p_arch(
+	xfs_dir2_block_tail_t *btp, xfs_arch_t arch);
+#define XFS_DIR2_BLOCK_LEAF_P_ARCH(btp,arch) \
+	xfs_dir2_block_leaf_p_arch(btp,arch)
+#else
+#define XFS_DIR2_BLOCK_LEAF_P_ARCH(btp,arch)	\
+	(((struct xfs_dir2_leaf_entry *)(btp)) - INT_GET((btp)->count, arch))
+#endif
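+
+/*
+ * Layout sketch (assuming a 4096-byte block and 8-byte structures):
+ * XFS_DIR2_BLOCK_TAIL_P puts the tail at bytes [4088,4096), and
+ * XFS_DIR2_BLOCK_LEAF_P_ARCH backs up btp->count leaf entries from
+ * there, e.g. a count of 10 places the leaf array at [4008,4088).
+ */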
+
+/*
+ * Function declarations.
+ */
+
+extern int
+	xfs_dir2_block_addname(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_block_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct uio *uio, int *eofp, struct xfs_dirent *dbp,
+				xfs_dir2_put_t put);
+
+extern int
+	xfs_dir2_block_lookup(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_block_removename(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_block_replace(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_leaf_to_block(struct xfs_da_args *args, struct xfs_dabuf *lbp,
+			       struct xfs_dabuf *dbp);
+
+extern int
+	xfs_dir2_sf_to_block(struct xfs_da_args *args);
+
+#endif	/* __XFS_DIR2_BLOCK_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_data.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_data.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_data.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_data.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,833 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_data.c
+ * Core data block handling routines for XFS V2 directories.
+ * See xfs_dir2_data.h for data structures.
+ */
+
+#include <xfs.h>
+
+
+#ifdef DEBUG
+/*
+ * Check the consistency of the data block.
+ * The input can also be a block-format directory.
+ * Pop an assert if we find anything bad.
+ */
+void
+xfs_dir2_data_check(
+	xfs_inode_t		*dp,		/* incore inode pointer */
+	xfs_dabuf_t		*bp)		/* data block's buffer */
+{
+	xfs_dir2_dataptr_t	addr;		/* addr for leaf lookup */
+	xfs_dir2_data_free_t	*bf;		/* bestfree table */
+	xfs_dir2_block_tail_t	*btp = NULL;	/* block tail */
+	int			count;		/* count of entries found */
+	xfs_dir2_data_t		*d;		/* data block pointer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry */
+	char			*endp;		/* end of useful data */
+	int			freeseen;	/* mask of bestfrees seen */
+	xfs_dahash_t		hash;		/* hash of current name */
+	int			i;		/* leaf index */
+	int			lastfree;	/* last entry was unused */
+	xfs_dir2_leaf_entry_t	*lep = NULL;	/* block leaf entries */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	char			*p;		/* current data position */
+	int			stale;		/* count of stale leaves */
+
+	mp = dp->i_mount;
+	d = bp->data;
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	bf = d->hdr.bestfree;
+	p = (char *)d->u;
+	if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
+		btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
+		lep = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+		endp = (char *)lep;
+	} else
+		endp = (char *)d + mp->m_dirblksize;
+	count = lastfree = freeseen = 0;
+	/*
+	 * Account for zero bestfree entries.
+	 */
+	if (INT_ISZERO(bf[0].length, ARCH_CONVERT)) {
+		ASSERT(INT_ISZERO(bf[0].offset, ARCH_CONVERT));
+		freeseen |= 1 << 0;
+	}
+	if (INT_ISZERO(bf[1].length, ARCH_CONVERT)) {
+		ASSERT(INT_ISZERO(bf[1].offset, ARCH_CONVERT));
+		freeseen |= 1 << 1;
+	}
+	if (INT_ISZERO(bf[2].length, ARCH_CONVERT)) {
+		ASSERT(INT_ISZERO(bf[2].offset, ARCH_CONVERT));
+		freeseen |= 1 << 2;
+	}
+	ASSERT(INT_GET(bf[0].length, ARCH_CONVERT) >= INT_GET(bf[1].length, ARCH_CONVERT));
+	ASSERT(INT_GET(bf[1].length, ARCH_CONVERT) >= INT_GET(bf[2].length, ARCH_CONVERT));
+	/*
+	 * Loop over the data/unused entries.
+	 */
+	while (p < endp) {
+		dup = (xfs_dir2_data_unused_t *)p;
+		/*
+		 * If it's unused, look for the space in the bestfree table.
+		 * If we find it, account for it; otherwise make sure it
+		 * doesn't need to be there.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			ASSERT(lastfree == 0);
+			ASSERT(INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT), ARCH_CONVERT) ==
+			       (char *)dup - (char *)d);
+			dfp = xfs_dir2_data_freefind(d, dup);
+			if (dfp) {
+				i = (int)(dfp - bf);
+				ASSERT((freeseen & (1 << i)) == 0);
+				freeseen |= 1 << i;
+			} else
+				ASSERT(INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(bf[2].length, ARCH_CONVERT));
+			p += INT_GET(dup->length, ARCH_CONVERT);
+			lastfree = 1;
+			continue;
+		}
+		/*
+		 * It's a real entry.  Validate the fields.
+		 * If this is a block directory then make sure it's
+		 * in the leaf section of the block.
+		 * The linear search is crude but this is DEBUG code.
+		 */
+		dep = (xfs_dir2_data_entry_t *)p;
+		ASSERT(dep->namelen != 0);
+		ASSERT(xfs_dir_ino_validate(mp, INT_GET(dep->inumber, ARCH_CONVERT)) == 0);
+		ASSERT(INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT) ==
+		       (char *)dep - (char *)d);
+		count++;
+		lastfree = 0;
+		if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
+			addr = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+				(xfs_dir2_data_aoff_t)
+				((char *)dep - (char *)d));
+			hash = xfs_da_hashname((char *)dep->name, dep->namelen);
+			for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
+				if (INT_GET(lep[i].address, ARCH_CONVERT) == addr &&
+				    INT_GET(lep[i].hashval, ARCH_CONVERT) == hash)
+					break;
+			}
+			ASSERT(i < INT_GET(btp->count, ARCH_CONVERT));
+		}
+		p += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+	}
+	/*
+	 * Need to have seen all the entries and all the bestfree slots.
+	 */
+	ASSERT(freeseen == 7);
+	if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
+		for (i = stale = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
+			if (INT_GET(lep[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+				stale++;
+			if (i > 0)
+				ASSERT(INT_GET(lep[i].hashval, ARCH_CONVERT) >= INT_GET(lep[i - 1].hashval, ARCH_CONVERT));
+		}
+		ASSERT(count == INT_GET(btp->count, ARCH_CONVERT) - INT_GET(btp->stale, ARCH_CONVERT));
+		ASSERT(stale == INT_GET(btp->stale, ARCH_CONVERT));
+	}
+}
+#endif
+
+/*
+ * Given a data block and an unused entry from that block,
+ * return the bestfree entry, if any, that corresponds to it.
+ */
+xfs_dir2_data_free_t *
+xfs_dir2_data_freefind(
+	xfs_dir2_data_t		*d,		/* data block */
+	xfs_dir2_data_unused_t	*dup)		/* data unused entry */
+{
+	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
+	xfs_dir2_data_aoff_t	off;		/* offset value needed */
+#if defined(DEBUG) && defined(__KERNEL__)
+	int			matched;	/* matched the value */
+	int			seenzero;	/* saw a 0 bestfree entry */
+#endif
+
+	off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)d);
+#if defined(DEBUG) && defined(__KERNEL__)
+	/*
+	 * Validate some consistency in the bestfree table.
+	 * Check order, non-overlapping entries, and if we find the
+	 * one we're looking for, it has to be an exact match.
+	 */
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	for (dfp = &d->hdr.bestfree[0], seenzero = matched = 0;
+	     dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
+	     dfp++) {
+		if (INT_ISZERO(dfp->offset, ARCH_CONVERT)) {
+			ASSERT(INT_ISZERO(dfp->length, ARCH_CONVERT));
+			seenzero = 1;
+			continue;
+		}
+		ASSERT(seenzero == 0);
+		if (INT_GET(dfp->offset, ARCH_CONVERT) == off) {
+			matched = 1;
+			ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(dup->length, ARCH_CONVERT));
+		} else if (off < INT_GET(dfp->offset, ARCH_CONVERT))
+			ASSERT(off + INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(dfp->offset, ARCH_CONVERT));
+		else
+			ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) + INT_GET(dfp->length, ARCH_CONVERT) <= off);
+		ASSERT(matched || INT_GET(dfp->length, ARCH_CONVERT) >= INT_GET(dup->length, ARCH_CONVERT));
+		if (dfp > &d->hdr.bestfree[0])
+			ASSERT(INT_GET(dfp[-1].length, ARCH_CONVERT) >= INT_GET(dfp[0].length, ARCH_CONVERT));
+	}
+#endif
+	/*
+	 * If this is smaller than the smallest bestfree entry,
+	 * it can't be there since they're sorted.
+	 */
+	if (INT_GET(dup->length, ARCH_CONVERT) < INT_GET(d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length, ARCH_CONVERT))
+		return NULL;
+	/*
+	 * Look at the three bestfree entries for our guy.
+	 */
+	for (dfp = &d->hdr.bestfree[0];
+	     dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
+	     dfp++) {
+		if (INT_ISZERO(dfp->offset, ARCH_CONVERT))
+			return NULL;
+		if (INT_GET(dfp->offset, ARCH_CONVERT) == off)
+			return dfp;
+	}
+	/*
+	 * Didn't find it.  This only happens if there are duplicate lengths.
+	 */
+	return NULL;
+}
+
+/*
+ * Insert an unused-space entry into the bestfree table.
+ */
+xfs_dir2_data_free_t *				/* entry inserted */
+xfs_dir2_data_freeinsert(
+	xfs_dir2_data_t		*d,		/* data block pointer */
+	xfs_dir2_data_unused_t	*dup,		/* unused space */
+	int			*loghead)	/* log the data header (out) */
+{
+	xfs_dir2_data_free_t	*dfp;		/* bestfree table pointer */
+	xfs_dir2_data_free_t	new;		/* new bestfree entry */
+
+#ifdef __KERNEL__
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+#endif
+	dfp = d->hdr.bestfree;
+	INT_COPY(new.length, dup->length, ARCH_CONVERT);
+	INT_SET(new.offset, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dup - (char *)d));
+	/*
+	 * Insert at position 0, 1, or 2; or not at all.
+	 */
+	if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[0].length, ARCH_CONVERT)) {
+		dfp[2] = dfp[1];
+		dfp[1] = dfp[0];
+		dfp[0] = new;
+		*loghead = 1;
+		return &dfp[0];
+	}
+	if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[1].length, ARCH_CONVERT)) {
+		dfp[2] = dfp[1];
+		dfp[1] = new;
+		*loghead = 1;
+		return &dfp[1];
+	}
+	if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[2].length, ARCH_CONVERT)) {
+		dfp[2] = new;
+		*loghead = 1;
+		return &dfp[2];
+	}
+	return NULL;
+}
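+
+/*
+ * Worked example (hypothetical lengths): with a bestfree table of
+ * {2048, 512, 64} and a new unused entry of length 1024, the second
+ * test above fires: slot 2 takes 512, slot 1 takes the new entry,
+ * and the table becomes {2048, 1024, 512}; the old 64-byte entry
+ * simply drops out of the table (it is still free on disk).
+ */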
+
+/*
+ * Remove a bestfree entry from the table.
+ */
+void
+xfs_dir2_data_freeremove(
+	xfs_dir2_data_t		*d,		/* data block pointer */
+	xfs_dir2_data_free_t	*dfp,		/* bestfree entry pointer */
+	int			*loghead)	/* out: log data header */
+{
+#ifdef __KERNEL__
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+#endif
+	/*
+	 * If it's the first entry, slide the next two up.
+	 */
+	if (dfp == &d->hdr.bestfree[0]) {
+		d->hdr.bestfree[0] = d->hdr.bestfree[1];
+		d->hdr.bestfree[1] = d->hdr.bestfree[2];
+	}
+	/*
+	 * If it's the second entry, slide the third entry up.
+	 */
+	else if (dfp == &d->hdr.bestfree[1])
+		d->hdr.bestfree[1] = d->hdr.bestfree[2];
+	/*
+	 * Otherwise it must be the last entry.
+	 */
+	else
+		ASSERT(dfp == &d->hdr.bestfree[2]);
+	/*
+	 * Clear the third entry; it must be zero now.
+	 */
+	INT_ZERO(d->hdr.bestfree[2].length, ARCH_CONVERT);
+	INT_ZERO(d->hdr.bestfree[2].offset, ARCH_CONVERT);
+	*loghead = 1;
+}
+
+/*
+ * Given a data block, reconstruct its bestfree map.
+ */
+void
+xfs_dir2_data_freescan(
+	xfs_mount_t		*mp,		/* filesystem mount point */
+	xfs_dir2_data_t		*d,		/* data block pointer */
+	int			*loghead,	/* out: log data header */
+	char			*aendp)		/* in: caller's endp */
+{
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* active data entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	char			*endp;		/* end of block's data */
+	char			*p;		/* current entry pointer */
+
+#ifdef __KERNEL__
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+#endif
+	/*
+	 * Start by clearing the table.
+	 */
+	bzero(d->hdr.bestfree, sizeof(d->hdr.bestfree));
+	*loghead = 1;
+	/*
+	 * Set up pointers.
+	 */
+	p = (char *)d->u;
+	if (aendp)
+		endp = aendp;
+	else if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
+		btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
+		endp = (char *)XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	} else
+		endp = (char *)d + mp->m_dirblksize;
+	/*
+	 * Loop over the block's entries.
+	 */
+	while (p < endp) {
+		dup = (xfs_dir2_data_unused_t *)p;
+		/*
+		 * If it's a free entry, insert it.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			ASSERT((char *)dup - (char *)d ==
+			       INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT), ARCH_CONVERT));
+			xfs_dir2_data_freeinsert(d, dup, loghead);
+			p += INT_GET(dup->length, ARCH_CONVERT);
+		}
+		/*
+		 * For active entries, check their tags and skip them.
+		 */
+		else {
+			dep = (xfs_dir2_data_entry_t *)p;
+			ASSERT((char *)dep - (char *)d ==
+			       INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT));
+			p += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+		}
+	}
+}
+
+/*
+ * Initialize a data block at the given block number in the directory.
+ * Give back the buffer for the created block.
+ */
+int						/* error */
+xfs_dir2_data_init(
+	xfs_da_args_t		*args,		/* directory operation args */
+	xfs_dir2_db_t		blkno,		/* logical dir block number */
+	xfs_dabuf_t		**bpp)		/* output block buffer */
+{
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_data_t		*d;		/* pointer to block */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
+	int			error;		/* error return value */
+	int			i;		/* bestfree index */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	int			t;		/* temp */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Get the buffer set up for the block.
+	 */
+	error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, blkno), -1, &bp,
+		XFS_DATA_FORK);
+	if (error) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	/*
+	 * Initialize the header.
+	 */
+	d = bp->data;
+	INT_SET(d->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC);
+	INT_SET(d->hdr.bestfree[0].offset, ARCH_CONVERT, (xfs_dir2_data_off_t)sizeof(d->hdr));
+	for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
+		INT_ZERO(d->hdr.bestfree[i].length, ARCH_CONVERT);
+		INT_ZERO(d->hdr.bestfree[i].offset, ARCH_CONVERT);
+	}
+	/*
+	 * Set up an unused entry for the block's body.
+	 */
+	dup = &d->u[0].unused;
+	INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+
+	t = mp->m_dirblksize - (uint)sizeof(d->hdr);
+	INT_SET(d->hdr.bestfree[0].length, ARCH_CONVERT, t);
+	INT_SET(dup->length, ARCH_CONVERT, t);
+	INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT), ARCH_CONVERT,
+		(xfs_dir2_data_off_t)((char *)dup - (char *)d));
+	/*
+	 * Log it and return it.
+	 */
+	xfs_dir2_data_log_header(tp, bp);
+	xfs_dir2_data_log_unused(tp, bp, dup);
+	*bpp = bp;
+	return 0;
+}
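+
+/*
+ * Post-init state, assuming a 4096-byte directory block: the 16-byte
+ * header is followed by a single unused entry covering [16,4096), so
+ * bestfree[0] = { offset 16, length 4080 } and the other two slots
+ * are zero.
+ */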
+
+/*
+ * Log an active data entry from the block.
+ */
+void
+xfs_dir2_data_log_entry(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* block buffer */
+	xfs_dir2_data_entry_t	*dep)		/* data entry pointer */
+{
+	xfs_dir2_data_t		*d;		/* data block pointer */
+
+	d = bp->data;
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d),
+		(uint)((char *)(XFS_DIR2_DATA_ENTRY_TAG_P(dep) + 1) -
+		       (char *)d - 1));
+}
+
+/*
+ * Log a data block header.
+ */
+void
+xfs_dir2_data_log_header(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp)		/* block buffer */
+{
+	xfs_dir2_data_t		*d;		/* data block pointer */
+
+	d = bp->data;
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	xfs_da_log_buf(tp, bp, (uint)((char *)&d->hdr - (char *)d),
+		(uint)(sizeof(d->hdr) - 1));
+}
+
+/*
+ * Log a data unused entry.
+ */
+void
+xfs_dir2_data_log_unused(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* block buffer */
+	xfs_dir2_data_unused_t	*dup)		/* data unused pointer */
+{
+	xfs_dir2_data_t		*d;		/* data block pointer */
+
+	d = bp->data;
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	/*
+	 * Log the first part of the unused entry.
+	 */
+	xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)d),
+		(uint)((char *)&dup->length + sizeof(dup->length) -
+		       1 - (char *)d));
+	/*
+	 * Log the end (tag) of the unused entry.
+	 */
+	xfs_da_log_buf(tp, bp,
+		(uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT) - (char *)d),
+		(uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT) - (char *)d +
+		       sizeof(xfs_dir2_data_off_t) - 1));
+}
+
+/*
+ * Make a byte range in the data block unused.
+ * Its current contents are unimportant.
+ */
+void
+xfs_dir2_data_make_free(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* block buffer */
+	xfs_dir2_data_aoff_t	offset,		/* starting byte offset */
+	xfs_dir2_data_aoff_t	len,		/* length in bytes */
+	int			*needlogp,	/* out: log header */
+	int			*needscanp)	/* out: regen bestfree */
+{
+	xfs_dir2_data_t		*d;		/* data block pointer */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
+	char			*endptr;	/* end of data area */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needscan;	/* need to regen bestfree */
+	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
+	xfs_dir2_data_unused_t	*postdup;	/* unused entry after us */
+	xfs_dir2_data_unused_t	*prevdup;	/* unused entry before us */
+
+	mp = tp->t_mountp;
+	d = bp->data;
+	/*
+	 * Figure out where the end of the data area is.
+	 */
+	if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC)
+		endptr = (char *)d + mp->m_dirblksize;
+	else {
+		xfs_dir2_block_tail_t	*btp;	/* block tail */
+
+		ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+		btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
+		endptr = (char *)XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	}
+	/*
+	 * If this isn't the start of the block, then back up to
+	 * the previous entry and see if it's free.
+	 */
+	if (offset > sizeof(d->hdr)) {
+		xfs_dir2_data_off_t	*tagp;	/* tag just before us */
+
+		tagp = (xfs_dir2_data_off_t *)((char *)d + offset) - 1;
+		prevdup = (xfs_dir2_data_unused_t *)((char *)d + INT_GET(*tagp, ARCH_CONVERT));
+		if (INT_GET(prevdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
+			prevdup = NULL;
+	} else
+		prevdup = NULL;
+	/*
+	 * If this isn't the end of the block, see if the entry after
+	 * us is free.
+	 */
+	if ((char *)d + offset + len < endptr) {
+		postdup =
+			(xfs_dir2_data_unused_t *)((char *)d + offset + len);
+		if (INT_GET(postdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
+			postdup = NULL;
+	} else
+		postdup = NULL;
+	ASSERT(*needscanp == 0);
+	needscan = 0;
+	/*
+	 * Previous and following entries are both free;
+	 * merge everything into a single free entry.
+	 */
+	if (prevdup && postdup) {
+		xfs_dir2_data_free_t	*dfp2;	/* another bestfree pointer */
+
+		/*
+		 * See if prevdup and/or postdup are in bestfree table.
+		 */
+		dfp = xfs_dir2_data_freefind(d, prevdup);
+		dfp2 = xfs_dir2_data_freefind(d, postdup);
+		/*
+		 * We need a rescan unless there are exactly two free
+		 * entries, namely our two.  In that case we know exactly
+		 * what's in the table; otherwise, since the third bestfree
+		 * slot is in use, there may be more free regions than the
+		 * table can describe.
+		 */
+		needscan = !INT_ISZERO(d->hdr.bestfree[2].length, ARCH_CONVERT);
+		/*
+		 * Fix up the new big freespace.
+		 */
+		INT_MOD(prevdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT));
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(prevdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)prevdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, prevdup);
+		if (!needscan) {
+			/*
+			 * It has to be the case that entries 0 and 1 are
+			 * dfp and dfp2 (we don't know which is which), and
+			 * that entry 2 is empty.
+			 * Remove entry 1 first, then entry 0.
+			 */
+			ASSERT(dfp && dfp2);
+			if (dfp == &d->hdr.bestfree[1]) {
+				dfp = &d->hdr.bestfree[0];
+				ASSERT(dfp2 == dfp);
+				dfp2 = &d->hdr.bestfree[1];
+			}
+			xfs_dir2_data_freeremove(d, dfp2, needlogp);
+			xfs_dir2_data_freeremove(d, dfp, needlogp);
+			/*
+			 * Now insert the new entry.
+			 */
+			dfp = xfs_dir2_data_freeinsert(d, prevdup, needlogp);
+			ASSERT(dfp == &d->hdr.bestfree[0]);
+			ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(prevdup->length, ARCH_CONVERT));
+			ASSERT(INT_ISZERO(dfp[1].length, ARCH_CONVERT));
+			ASSERT(INT_ISZERO(dfp[2].length, ARCH_CONVERT));
+		}
+	}
+	/*
+	 * The entry before us is free; merge with it.
+	 */
+	else if (prevdup) {
+		dfp = xfs_dir2_data_freefind(d, prevdup);
+		INT_MOD(prevdup->length, ARCH_CONVERT, len);
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(prevdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)prevdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, prevdup);
+		/*
+		 * If the previous entry was in the table, the new entry
+		 * is longer, so it will be in the table too.  Remove
+		 * the old one and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(d, dfp, needlogp);
+			(void)xfs_dir2_data_freeinsert(d, prevdup, needlogp);
+		}
+		/*
+		 * Otherwise we need a scan if the new entry is big enough.
+		 */
+		else
+			needscan = INT_GET(prevdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT);
+	}
+	/*
+	 * The following entry is free; merge with it.
+	 */
+	else if (postdup) {
+		dfp = xfs_dir2_data_freefind(d, postdup);
+		newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
+		INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+		INT_SET(newdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT));
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup);
+		/*
+		 * If the following entry was in the table, the new entry
+		 * is longer, so it will be in the table too.  Remove
+		 * the old one and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(d, dfp, needlogp);
+			(void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
+		}
+		/*
+		 * Otherwise we need a scan if the new entry is big enough.
+		 */
+		else
+			needscan = INT_GET(newdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT);
+	}
+	/*
+	 * Neither neighbor is free.  Make a new entry.
+	 */
+	else {
+		newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
+		INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+		INT_SET(newdup->length, ARCH_CONVERT, len);
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup);
+		(void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
+	}
+	*needscanp = needscan;
+}
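+
+/*
+ * Worked example of the both-neighbors case (hypothetical offsets):
+ * freeing [88,120) between a 24-byte unused entry at 64 and a 40-byte
+ * one at 120 grows the entry at 64 to length 24 + 32 + 40 = 96,
+ * covering [64,160); a rescan is needed only if bestfree[2] was in
+ * use, since other untracked free regions may then exist.
+ */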
+
+/*
+ * Take a byte range out of an existing unused space and make it un-free.
+ */
+void
+xfs_dir2_data_use_free(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* data block buffer */
+	xfs_dir2_data_unused_t	*dup,		/* unused entry */
+	xfs_dir2_data_aoff_t	offset,		/* starting offset to use */
+	xfs_dir2_data_aoff_t	len,		/* length to use */
+	int			*needlogp,	/* out: need to log header */
+	int			*needscanp)	/* out: need regen bestfree */
+{
+	xfs_dir2_data_t		*d;		/* data block */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
+	int			matchback;	/* matches end of freespace */
+	int			matchfront;	/* matches start of freespace */
+	int			needscan;	/* need to regen bestfree */
+	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
+	xfs_dir2_data_unused_t	*newdup2;	/* another new unused entry */
+	int			oldlen;		/* old unused entry's length */
+
+	d = bp->data;
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	ASSERT(INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG);
+	ASSERT(offset >= (char *)dup - (char *)d);
+	ASSERT(offset + len <= (char *)dup + INT_GET(dup->length, ARCH_CONVERT) - (char *)d);
+	ASSERT((char *)dup - (char *)d == INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT), ARCH_CONVERT));
+	/*
+	 * Look up the entry in the bestfree table.
+	 */
+	dfp = xfs_dir2_data_freefind(d, dup);
+	oldlen = INT_GET(dup->length, ARCH_CONVERT);
+	ASSERT(dfp || oldlen <= INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT));
+	/*
+	 * Check for alignment with front and back of the entry.
+	 */
+	matchfront = (char *)dup - (char *)d == offset;
+	matchback = (char *)dup + oldlen - (char *)d == offset + len;
+	ASSERT(*needscanp == 0);
+	needscan = 0;
+	/*
+	 * If we matched it exactly, we just need to remove it from
+	 * the bestfree table.
+	 */
+	if (matchfront && matchback) {
+		if (dfp) {
+			needscan = !INT_ISZERO(d->hdr.bestfree[2].offset, ARCH_CONVERT);
+			if (!needscan)
+				xfs_dir2_data_freeremove(d, dfp, needlogp);
+		}
+	}
+	/*
+	 * We match the first part of the entry.
+	 * Make a new entry with the remaining freespace.
+	 */
+	else if (matchfront) {
+		newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
+		INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+		INT_SET(newdup->length, ARCH_CONVERT, oldlen - len);
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup);
+		/*
+		 * If it was in the table, remove it and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(d, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
+			ASSERT(dfp != NULL);
+			ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT));
+			ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d);
+			/*
+			 * If the new entry landed in the last slot, we
+			 * can't tell whether some other free region would
+			 * have been a better fit for that slot.  Rescan.
+			 */
+			needscan = dfp == &d->hdr.bestfree[2];
+		}
+	}
+	/*
+	 * We match the last part of the entry.
+	 * Trim the allocated space off the tail of the entry.
+	 */
+	else if (matchback) {
+		newdup = dup;
+		INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t)
+			(((char *)d + offset) - (char *)newdup));
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup);
+		/*
+		 * If it was in the table, remove it and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(d, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
+			ASSERT(dfp != NULL);
+			ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT));
+			ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d);
+			/*
+			 * If the new entry landed in the last slot, we
+			 * can't tell whether some other free region would
+			 * have been a better fit for that slot.  Rescan.
+			 */
+			needscan = dfp == &d->hdr.bestfree[2];
+		}
+	}
+	/*
+	 * Poking out the middle of an entry.
+	 * Make two new entries.
+	 */
+	else {
+		newdup = dup;
+		INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t)
+			(((char *)d + offset) - (char *)newdup));
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup);
+		newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
+		INT_SET(newdup2->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+		INT_SET(newdup2->length, ARCH_CONVERT, oldlen - len - INT_GET(newdup->length, ARCH_CONVERT));
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup2, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup2 - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup2);
+		/*
+		 * If the old entry was in the table, we need a rescan
+		 * if the third slot was in use, since the two new entries
+		 * are smaller than the old one.
+		 * If no rescan is needed, the table held at most two
+		 * entries, so removing the old one and adding the two
+		 * new ones will work.
+		 */
+		if (dfp) {
+			needscan = !INT_ISZERO(d->hdr.bestfree[2].length, ARCH_CONVERT);
+			if (!needscan) {
+				xfs_dir2_data_freeremove(d, dfp, needlogp);
+				(void)xfs_dir2_data_freeinsert(d, newdup,
+					needlogp);
+				(void)xfs_dir2_data_freeinsert(d, newdup2,
+					needlogp);
+			}
+		}
+	}
+	*needscanp = needscan;
+}
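+
+/*
+ * Worked example of the split case (hypothetical offsets): carving
+ * [96,128) out of an unused entry spanning [64,192) leaves newdup
+ * covering [64,96) with length 96 - 64 = 32, and newdup2 covering
+ * [128,192) with length 128 - 32 - 32 = 64; both get fresh tags and
+ * are logged, and the bestfree table is fixed up or rescanned.
+ */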
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_data.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_data.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_data.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_data.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_DATA_H__
+#define __XFS_DIR2_DATA_H__
+
+/*
+ * Directory format 2, data block structures.
+ */
+
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Constants.
+ */
+#define XFS_DIR2_DATA_MAGIC	0x58443244	/* XD2D: for multiblock dirs */
+#define XFS_DIR2_DATA_ALIGN_LOG 3		/* i.e., 8 bytes */
+#define XFS_DIR2_DATA_ALIGN	(1 << XFS_DIR2_DATA_ALIGN_LOG)
+#define XFS_DIR2_DATA_FREE_TAG	0xffff
+#define XFS_DIR2_DATA_FD_COUNT	3
+
+/*
+ * Directory address space divided into sections,
+ * spaces separated by 32GB.
+ */
+#define XFS_DIR2_SPACE_SIZE	(1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
+#define XFS_DIR2_DATA_SPACE	0
+#define XFS_DIR2_DATA_OFFSET	(XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
+#define XFS_DIR2_DATA_FIRSTDB(mp)	\
+	XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATA_OFFSET)
+
+/*
+ * Offsets of . and .. in data space (always block 0)
+ */
+#define XFS_DIR2_DATA_DOT_OFFSET	\
+	((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t))
+#define XFS_DIR2_DATA_DOTDOT_OFFSET	\
+	(XFS_DIR2_DATA_DOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(1))
+#define XFS_DIR2_DATA_FIRST_OFFSET		\
+	(XFS_DIR2_DATA_DOTDOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(2))
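+
+/*
+ * With the 16-byte data header and the XFS_DIR2_DATA_ENTSIZE macro
+ * below, these come out to fixed values: "." lands at offset 16,
+ * ".." at 16 + 16 = 32, and the first user entry at 32 + 16 = 48,
+ * since ENTSIZE(1) and ENTSIZE(2) both round up to 16 bytes.
+ */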
+
+/*
+ * Structures.
+ */
+
+/*
+ * Describe a free area in the data block.
+ * The freespace will be formatted as an xfs_dir2_data_unused_t.
+ */
+typedef struct xfs_dir2_data_free {
+	xfs_dir2_data_off_t	offset;		/* start of freespace */
+	xfs_dir2_data_off_t	length;		/* length of freespace */
+} xfs_dir2_data_free_t;
+
+/*
+ * Header for the data blocks.
+ * Always at the beginning of a directory-sized block.
+ * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
+ */
+typedef struct xfs_dir2_data_hdr {
+	__uint32_t		magic;		/* XFS_DIR2_DATA_MAGIC */
+						/* or XFS_DIR2_BLOCK_MAGIC */
+	xfs_dir2_data_free_t	bestfree[XFS_DIR2_DATA_FD_COUNT];
+} xfs_dir2_data_hdr_t;
+
+/*
+ * Active entry in a data block.  Aligned to 8 bytes.
+ * Tag appears as the last 2 bytes.
+ */
+typedef struct xfs_dir2_data_entry {
+	xfs_ino_t		inumber;	/* inode number */
+	__uint8_t		namelen;	/* name length */
+	__uint8_t		name[1];	/* name bytes, no null */
+						/* variable offset */
+	xfs_dir2_data_off_t	tag;		/* starting offset of us */
+} xfs_dir2_data_entry_t;
+
+/*
+ * Unused entry in a data block.  Aligned to 8 bytes.
+ * Tag appears as the last 2 bytes.
+ */
+typedef struct xfs_dir2_data_unused {
+	__uint16_t		freetag;	/* XFS_DIR2_DATA_FREE_TAG */
+	xfs_dir2_data_off_t	length;		/* total free length */
+						/* variable offset */
+	xfs_dir2_data_off_t	tag;		/* starting offset of us */
+} xfs_dir2_data_unused_t;
+
+typedef union {
+	xfs_dir2_data_entry_t	entry;
+	xfs_dir2_data_unused_t	unused;
+} xfs_dir2_data_union_t;
+
+/*
+ * Generic data block structure, for xfs_db.
+ */
+typedef struct xfs_dir2_data {
+	xfs_dir2_data_hdr_t	hdr;		/* magic XFS_DIR2_DATA_MAGIC */
+	xfs_dir2_data_union_t	u[1];
+} xfs_dir2_data_t;
+
+/*
+ * Macros.
+ */
+
+/*
+ * Size of a data entry.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_ENTSIZE)
+int xfs_dir2_data_entsize(int n);
+#define XFS_DIR2_DATA_ENTSIZE(n)	xfs_dir2_data_entsize(n)
+#else
+#define XFS_DIR2_DATA_ENTSIZE(n)	\
+	((int)(roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \
+		 (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN)))
+#endif
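+
+/*
+ * Worked arithmetic: the fixed part is 8 (inumber) + 1 (namelen) +
+ * 2 (tag) = 11 bytes, so ENTSIZE(n) = roundup(11 + n, 8).  For
+ * example ENTSIZE(1) = 16, ENTSIZE(5) = 16, and ENTSIZE(8) = 24.
+ */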
+
+/*
+ * Pointer to an entry's tag word.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_ENTRY_TAG_P)
+xfs_dir2_data_off_t *xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep);
+#define XFS_DIR2_DATA_ENTRY_TAG_P(dep)	xfs_dir2_data_entry_tag_p(dep)
+#else
+#define XFS_DIR2_DATA_ENTRY_TAG_P(dep)	\
+	((xfs_dir2_data_off_t *)\
+	 ((char *)(dep) + XFS_DIR2_DATA_ENTSIZE((dep)->namelen) - \
+	  (uint)sizeof(xfs_dir2_data_off_t)))
+#endif
+
+/*
+ * Pointer to a freespace's tag word.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_UNUSED_TAG_P)
+xfs_dir2_data_off_t *xfs_dir2_data_unused_tag_p_arch(
+	xfs_dir2_data_unused_t *dup, xfs_arch_t arch);
+#define XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup,arch) \
+	xfs_dir2_data_unused_tag_p_arch(dup,arch)
+#else
+#define XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup,arch)	\
+	((xfs_dir2_data_off_t *)\
+	 ((char *)(dup) + INT_GET((dup)->length, arch) \
+			- (uint)sizeof(xfs_dir2_data_off_t)))
+#endif
+
+/*
+ * Function declarations.
+ */
+
+#ifdef DEBUG
+extern void
+	xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp);
+#else
+#define xfs_dir2_data_check(dp,bp)
+#endif
+
+extern xfs_dir2_data_free_t *
+	xfs_dir2_data_freefind(xfs_dir2_data_t *d,
+			       xfs_dir2_data_unused_t *dup);
+
+extern xfs_dir2_data_free_t *
+	xfs_dir2_data_freeinsert(xfs_dir2_data_t *d,
+				 xfs_dir2_data_unused_t *dup, int *loghead);
+
+extern void
+	xfs_dir2_data_freeremove(xfs_dir2_data_t *d,
+				 xfs_dir2_data_free_t *dfp, int *loghead);
+
+extern void
+	xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d,
+			       int *loghead, char *aendp);
+
+extern int
+	xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+			   struct xfs_dabuf **bpp);
+
+extern void
+	xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp,
+				xfs_dir2_data_entry_t *dep);
+
+extern void
+	xfs_dir2_data_log_header(struct xfs_trans *tp, struct xfs_dabuf *bp);
+
+extern void
+	xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp,
+				 xfs_dir2_data_unused_t *dup);
+
+extern void
+	xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
+				xfs_dir2_data_aoff_t offset,
+				xfs_dir2_data_aoff_t len, int *needlogp,
+				int *needscanp);
+
+extern void
+	xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
+			       xfs_dir2_data_unused_t *dup,
+			       xfs_dir2_data_aoff_t offset,
+			       xfs_dir2_data_aoff_t len, int *needlogp,
+			       int *needscanp);
+
+#endif	/* __XFS_DIR2_DATA_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_leaf.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_leaf.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_leaf.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_leaf.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1873 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_leaf.c
+ * XFS directory version 2 implementation - single leaf form
+ * see xfs_dir2_leaf.h for data structures.
+ * These directories have multiple XFS_DIR2_DATA blocks and one
+ * XFS_DIR2_LEAF1 block containing the hash table and freespace map.
+ */
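+
+/*
+ * Rough layout of the directory's logical address space in leaf form
+ * (boundaries per XFS_DIR2_LEAF_OFFSET and XFS_DIR2_LEAF_FIRSTDB):
+ *
+ *	data space:	the XFS_DIR2_DATA blocks, starting at offset 0
+ *	leaf space:	the single XFS_DIR2_LEAF1 block, holding the
+ *			hash-sorted leaf entries, bests table and tail
+ */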
+
+#include <xfs.h>
+
+/*
+ * Local function declarations.
+ */
+#ifdef DEBUG
+static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
+#else
+#define xfs_dir2_leaf_check(dp, bp)
+#endif
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp,
+				    int *indexp, xfs_dabuf_t **dbpp);
+
+/*
+ * Convert a block form directory to a leaf form directory.
+ */
+int						/* error */
+xfs_dir2_block_to_leaf(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*dbp)		/* input block's buffer */
+{
+	xfs_dir2_data_off_t	*bestsp;	/* leaf's bestsp entries */
+	xfs_dablk_t		blkno;		/* leaf block's bno */
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block's leaf entries */
+	xfs_dir2_block_tail_t	*btp;		/* block's tail */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dabuf_t		*lbp;		/* leaf block's buffer */
+	xfs_dir2_db_t		ldb;		/* leaf block's bno */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf's tail */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to rescan bestfree */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_b("block_to_leaf", args, dbp);
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Add the leaf block to the inode.
+	 * This interface will only put blocks in the leaf/node range.
+	 * Since that's empty now, we'll get the root (block 0 in range).
+	 */
+	if ((error = xfs_da_grow_inode(args, &blkno))) {
+		return error;
+	}
+	ldb = XFS_DIR2_DA_TO_DB(mp, blkno);
+	ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp));
+	/*
+	 * Initialize the leaf block, get a buffer for it.
+	 */
+	if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) {
+		return error;
+	}
+	ASSERT(lbp != NULL);
+	leaf = lbp->data;
+	block = dbp->data;
+	xfs_dir2_data_check(dp, dbp);
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Set the counts in the leaf header.
+	 */
+	INT_COPY(leaf->hdr.count, btp->count, ARCH_CONVERT); /* INT_: type change */
+	INT_COPY(leaf->hdr.stale, btp->stale, ARCH_CONVERT); /* INT_: type change */
+	/*
+	 * Could compact these but I think we always do the conversion
+	 * after squeezing out stale entries.
+	 */
+	bcopy(blp, leaf->ents, INT_GET(btp->count, ARCH_CONVERT) * sizeof(xfs_dir2_leaf_entry_t));
+	xfs_dir2_leaf_log_ents(tp, lbp, 0, INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1);
+	needscan = 0;
+	needlog = 1;
+	/*
+	 * Make the space formerly occupied by the leaf entries and block
+	 * tail be free.
+	 */
+	xfs_dir2_data_make_free(tp, dbp,
+		(xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
+		(xfs_dir2_data_aoff_t)((char *)block + mp->m_dirblksize -
+				       (char *)blp),
+		&needlog, &needscan);
+	/*
+	 * Fix up the block header, make it a data block.
+	 */
+	INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC);
+	if (needscan)
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
+			NULL);
+	/*
+	 * Set up leaf tail and bests table.
+	 */
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	INT_SET(ltp->bestcount, ARCH_CONVERT, 1);
+	bestsp = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT);
+	INT_COPY(bestsp[0], block->hdr.bestfree[0].length, ARCH_CONVERT);
+	/*
+	 * Log the data header and leaf bests table.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	xfs_dir2_leaf_check(dp, lbp);
+	xfs_dir2_data_check(dp, dbp);
+	xfs_dir2_leaf_log_bests(tp, lbp, 0, 0);
+	xfs_da_buf_done(lbp);
+	return 0;
+}
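+
+/*
+ * After this conversion the directory has exactly one data block (the
+ * old block, re-tagged XFS_DIR2_DATA_MAGIC) plus the new leaf1 block,
+ * whose bests table holds a single entry.
+ */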
+
+/*
+ * Add an entry to a leaf form directory.
+ */
+int						/* error */
+xfs_dir2_leaf_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dir2_data_off_t	*bestsp;	/* freespace table in leaf */
+	int			compact;	/* need to compact leaves */
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* data unused entry */
+	int			error;		/* error return value */
+	int			grown;		/* allocated new data block */
+	int			highstale;	/* index of next stale leaf */
+	int			i;		/* temporary, index */
+	int			index;		/* leaf table position */
+	xfs_dabuf_t		*lbp;		/* leaf's buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			length;		/* length of new entry */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry table pointer */
+	int			lfloglow;	/* low leaf logging index */
+	int			lfloghigh;	/* high leaf logging index */
+	int			lowstale;	/* index of prev stale leaf */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail pointer */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needbytes;	/* leaf block bytes needed */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data free */
+	xfs_dir2_data_off_t	*tagp;		/* end of data entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	xfs_dir2_db_t		use_block;	/* data block number */
+
+	xfs_dir2_trace_args("leaf_addname", args);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Read the leaf block.
+	 */
+	error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
+		XFS_DATA_FORK);
+	if (error) {
+		return error;
+	}
+	ASSERT(lbp != NULL);
+	/*
+	 * Look up the entry by hash value and name.
+	 * We know it's not there; our caller has already done a lookup.
+	 * So the index is of the entry to insert in front of.
+	 * But if there are dup hash values the index is of the first of those.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, lbp);
+	leaf = lbp->data;
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	bestsp = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT);
+	length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+	/*
+	 * See if there are any entries with the same hash value
+	 * and space in their block for the new entry.
+	 * This is good because it puts multiple same-hash value entries
+	 * in a data block, improving the lookup of those entries.
+	 */
+	for (use_block = -1, lep = &leaf->ents[index];
+	     index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
+	     index++, lep++) {
+		if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		i = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+		ASSERT(i < INT_GET(ltp->bestcount, ARCH_CONVERT));
+		ASSERT(INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF);
+		if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) {
+			use_block = i;
+			break;
+		}
+	}
+	/*
+	 * Didn't find a block yet, linear search all the data blocks.
+	 */
+	if (use_block == -1) {
+		for (i = 0; i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++) {
+			/*
+			 * Remember a block we see that's missing.
+			 */
+			if (INT_GET(bestsp[i], ARCH_CONVERT) == NULLDATAOFF && use_block == -1)
+				use_block = i;
+			else if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) {
+				use_block = i;
+				break;
+			}
+		}
+	}
+	/*
+	 * How many bytes do we need in the leaf block?
+	 */
+	needbytes =
+		(!INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT) ? 0 : (uint)sizeof(leaf->ents[0])) +
+		(use_block != -1 ? 0 : (uint)sizeof(leaf->bests[0]));
+	/*
+	 * Now kill use_block if it refers to a missing block, so we
+	 * can use it as an indication of allocation needed.
+	 */
+	if (use_block != -1 && INT_GET(bestsp[use_block], ARCH_CONVERT) == NULLDATAOFF)
+		use_block = -1;
+	/*
+	 * If we don't have enough free bytes but we can make enough
+	 * by compacting out stale entries, we'll do that.
+	 */
+	if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] < needbytes &&
+	    INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1) {
+		compact = 1;
+	}
+	/*
+	 * Otherwise if we don't have enough free bytes we need to
+	 * convert to node form.
+	 */
+	else if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] <
+		 needbytes) {
+		/*
+		 * Just checking or no space reservation, give up.
+		 */
+		if (args->justcheck || args->total == 0) {
+			xfs_da_brelse(tp, lbp);
+			return XFS_ERROR(ENOSPC);
+		}
+		/*
+		 * Convert to node form.
+		 */
+		error = xfs_dir2_leaf_to_node(args, lbp);
+		xfs_da_buf_done(lbp);
+		if (error)
+			return error;
+		/*
+		 * Then add the new entry.
+		 */
+		return xfs_dir2_node_addname(args);
+	}
+	/*
+	 * Otherwise it will fit without compaction.
+	 */
+	else
+		compact = 0;
+	/*
+	 * If just checking, then it will fit unless we needed to allocate
+	 * a new data block.
+	 */
+	if (args->justcheck) {
+		xfs_da_brelse(tp, lbp);
+		return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
+	}
+	/*
+	 * If no allocations are allowed, return now before we've
+	 * changed anything.
+	 */
+	if (args->total == 0 && use_block == -1) {
+		xfs_da_brelse(tp, lbp);
+		return XFS_ERROR(ENOSPC);
+	}
+	/*
+	 * Need to compact the leaf entries, removing stale ones.
+	 * Leave one stale entry behind - the one closest to our
+	 * insertion index - and we'll shift that one to our insertion
+	 * point later.
+	 */
+	if (compact) {
+		xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale,
+			&lfloglow, &lfloghigh);
+	}
+	/*
+	 * There are stale entries but we didn't need to compact.  Start
+	 * the log-low and log-high indices at impossibly bad values so
+	 * they get narrowed down to the real range later.
+	 */
+	else if (INT_GET(leaf->hdr.stale, ARCH_CONVERT)) {
+		lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		lfloghigh = -1;
+	}
+	/*
+	 * If there was no data block space found, we need to allocate
+	 * a new one.
+	 */
+	if (use_block == -1) {
+		/*
+		 * Add the new data block.
+		 */
+		if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
+				&use_block))) {
+			xfs_da_brelse(tp, lbp);
+			return error;
+		}
+		/*
+		 * Initialize the block.
+		 */
+		if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
+			xfs_da_brelse(tp, lbp);
+			return error;
+		}
+		/*
+		 * If we're adding a new data block on the end we need to
+		 * extend the bests table.  Copy it up one entry.
+		 */
+		if (use_block >= INT_GET(ltp->bestcount, ARCH_CONVERT)) {
+			bestsp--;
+			ovbcopy(&bestsp[1], &bestsp[0],
+				INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(bestsp[0]));
+			INT_MOD(ltp->bestcount, ARCH_CONVERT, +1);
+			xfs_dir2_leaf_log_tail(tp, lbp);
+			xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+		}
+		/*
+		 * If we're filling in a previously empty block just log it.
+		 */
+		else
+			xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+		data = dbp->data;
+		INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT);
+		grown = 1;
+	}
+	/*
+	 * Already had space in some data block.
+	 * Just read that one in.
+	 */
+	else {
+		if ((error =
+		    xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, use_block),
+			    -1, &dbp, XFS_DATA_FORK))) {
+			xfs_da_brelse(tp, lbp);
+			return error;
+		}
+		data = dbp->data;
+		grown = 0;
+	}
+	xfs_dir2_data_check(dp, dbp);
+	/*
+	 * Point to the biggest freespace in our data block.
+	 */
+	dup = (xfs_dir2_data_unused_t *)
+	      ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT));
+	ASSERT(INT_GET(dup->length, ARCH_CONVERT) >= length);
+	needscan = needlog = 0;
+	/*
+	 * Mark the initial part of our freespace in use for the new entry.
+	 */
+	xfs_dir2_data_use_free(tp, dbp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
+		&needlog, &needscan);
+	/*
+	 * Initialize our new entry (at last).
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->namelen = args->namelen;
+	bcopy(args->name, dep->name, dep->namelen);
+	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+	INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data));
+	/*
+	 * Need to rescan and fix up the bestfree table.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+	/*
+	 * Need to log the data block's header.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	xfs_dir2_data_log_entry(tp, dbp, dep);
+	/*
+	 * If the bests table needs to be changed, do it.
+	 * Log the change unless we've already done that.
+	 */
+	if (INT_GET(bestsp[use_block], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
+		INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT);
+		if (!grown)
+			xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+	}
+	/*
+	 * Now we need to make room to insert the leaf entry.
+	 * If there are no stale entries, we just insert a hole at index.
+	 */
+	if (INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT)) {
+		/*
+		 * lep is still good as the index leaf entry.
+		 */
+		if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+			ovbcopy(lep, lep + 1,
+				(INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep));
+		/*
+		 * Record low and high logging indices for the leaf.
+		 */
+		lfloglow = index;
+		lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1);
+	}
+	/*
+	 * There are stale entries.
+	 * We will use one of them for the new entry.
+	 * It's probably not at the right location, so we'll have to
+	 * shift some up or down first.
+	 */
+	else {
+		/*
+		 * If we didn't compact before, we need to find the nearest
+		 * stale entries before and after our insertion point.
+		 */
+		if (compact == 0) {
+			/*
+			 * Find the first stale entry before the insertion
+			 * point, if any.
+			 */
+			for (lowstale = index - 1;
+			     lowstale >= 0 &&
+				INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) !=
+				XFS_DIR2_NULL_DATAPTR;
+			     lowstale--)
+				continue;
+			/*
+			 * Find the next stale entry at or after the insertion
+			 * point, if any.  Stop if we go so far that the
+			 * lowstale entry would be better.
+			 */
+			for (highstale = index;
+			     highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
+				INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) !=
+				XFS_DIR2_NULL_DATAPTR &&
+				(lowstale < 0 ||
+				 index - lowstale - 1 >= highstale - index);
+			     highstale++)
+				continue;
+		}
+		/*
+		 * If the low one is better, use it.
+		 */
+		if (lowstale >= 0 &&
+		    (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
+		     index - lowstale - 1 < highstale - index)) {
+			ASSERT(index - lowstale - 1 >= 0);
+			ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) ==
+			       XFS_DIR2_NULL_DATAPTR);
+			/*
+			 * Copy entries up to cover the stale entry
+			 * and make room for the new entry.
+			 */
+			if (index - lowstale - 1 > 0)
+				ovbcopy(&leaf->ents[lowstale + 1],
+					&leaf->ents[lowstale],
+					(index - lowstale - 1) * sizeof(*lep));
+			lep = &leaf->ents[index - 1];
+			lfloglow = MIN(lowstale, lfloglow);
+			lfloghigh = MAX(index - 1, lfloghigh);
+		}
+		/*
+		 * The high one is better, so use that one.
+		 */
+		else {
+			ASSERT(highstale - index >= 0);
+			ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) ==
+			       XFS_DIR2_NULL_DATAPTR);
+			/*
+			 * Copy entries down to cover the stale entry
+			 * and make room for the new entry.
+			 */
+			if (highstale - index > 0)
+				ovbcopy(&leaf->ents[index],
+					&leaf->ents[index + 1],
+					(highstale - index) * sizeof(*lep));
+			lep = &leaf->ents[index];
+			lfloglow = MIN(index, lfloglow);
+			lfloghigh = MAX(highstale, lfloghigh);
+		}
+		INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1);
+	}
+	/*
+	 * Fill in the new leaf entry.
+	 */
+	INT_SET(lep->hashval, ARCH_CONVERT, args->hashval);
+	INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, use_block, INT_GET(*tagp, ARCH_CONVERT)));
+	/*
+	 * Log the leaf fields and give up the buffers.
+	 */
+	xfs_dir2_leaf_log_header(tp, lbp);
+	xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
+	xfs_dir2_leaf_check(dp, lbp);
+	xfs_da_buf_done(lbp);
+	xfs_dir2_data_check(dp, dbp);
+	xfs_da_buf_done(dbp);
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
+void
+xfs_dir2_leaf_check(
+	xfs_inode_t		*dp,		/* incore directory inode */
+	xfs_dabuf_t		*bp)		/* leaf's buffer */
+{
+	int			i;		/* leaf index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail pointer */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			stale;		/* count of stale leaves */
+
+	leaf = bp->data;
+	mp = dp->i_mount;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
+	/*
+	 * This value is not restrictive enough.
+	 * Should factor in the size of the bests table as well.
+	 * We can deduce a value for that from di_size.
+	 */
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp));
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	/*
+	 * Leaves and bests don't overlap.
+	 */
+	ASSERT((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] <=
+	       (char *)XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT));
+	/*
+	 * Check hash value order, count stale entries.
+	 */
+	for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) {
+		if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+			ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <=
+			       INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT));
+		if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			stale++;
+	}
+	ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale);
+}
+#endif	/* DEBUG */
+
+/*
+ * Compact out any stale entries in the leaf.
+ * Log the header and changed leaf entries, if any.
+ */
+void
+xfs_dir2_leaf_compact(
+	xfs_da_args_t	*args,		/* operation arguments */
+	xfs_dabuf_t	*bp)		/* leaf buffer */
+{
+	int		from;		/* source leaf index */
+	xfs_dir2_leaf_t *leaf;		/* leaf structure */
+	int		loglow;		/* first leaf entry to log */
+	int		to;		/* target leaf index */
+
+	leaf = bp->data;
+	if (INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT)) {
+		return;
+	}
+	/*
+	 * Compress out the stale entries in place.
+	 */
+	for (from = to = 0, loglow = -1; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
+		if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Only actually copy the entries that are different.
+		 */
+		if (from > to) {
+			if (loglow == -1)
+				loglow = to;
+			leaf->ents[to] = leaf->ents[from];
+		}
+		to++;
+	}
+	/*
+	 * Update and log the header, log the leaf entries.
+	 */
+	ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == from - to);
+	INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(INT_GET(leaf->hdr.stale, ARCH_CONVERT)));
+	INT_ZERO(leaf->hdr.stale, ARCH_CONVERT);
+	xfs_dir2_leaf_log_header(args->trans, bp);
+	if (loglow != -1)
+		xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1);
+}
+
+/*
+ * Compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and the caller will shift that one to our insertion
+ * point later.
+ * Return new insertion index, where the remaining stale entry is,
+ * and leaf logging indices.
+ */
+void
+xfs_dir2_leaf_compact_x1(
+	xfs_dabuf_t	*bp,		/* leaf buffer */
+	int		*indexp,	/* insertion index */
+	int		*lowstalep,	/* out: stale entry before us */
+	int		*highstalep,	/* out: stale entry after us */
+	int		*lowlogp,	/* out: low log index */
+	int		*highlogp)	/* out: high log index */
+{
+	int		from;		/* source copy index */
+	int		highstale;	/* stale entry at/after index */
+	int		index;		/* insertion index */
+	int		keepstale;	/* source index of kept stale */
+	xfs_dir2_leaf_t *leaf;		/* leaf structure */
+	int		lowstale;	/* stale entry before index */
+	int		newindex = 0;	/* new insertion index */
+	int		to;		/* destination copy index */
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1);
+	index = *indexp;
+	/*
+	 * Find the first stale entry before our index, if any.
+	 */
+	for (lowstale = index - 1;
+	     lowstale >= 0 &&
+		INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR;
+	     lowstale--)
+		continue;
+	/*
+	 * Find the first stale entry at or after our index, if any.
+	 * Stop if the answer would be worse than lowstale.
+	 */
+	for (highstale = index;
+	     highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
+		INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR &&
+		(lowstale < 0 || index - lowstale > highstale - index);
+	     highstale++)
+		continue;
+	/*
+	 * Pick the better of lowstale and highstale.
+	 */
+	if (lowstale >= 0 &&
+	    (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
+	     index - lowstale <= highstale - index))
+		keepstale = lowstale;
+	else
+		keepstale = highstale;
+	/*
+	 * Copy the entries in place, removing all the stale entries
+	 * except keepstale.
+	 */
+	for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
+		/*
+		 * Notice the new value of index.
+		 */
+		if (index == from)
+			newindex = to;
+		if (from != keepstale &&
+		    INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) {
+			if (from == to)
+				*lowlogp = to;
+			continue;
+		}
+		/*
+		 * Record the new keepstale value for the insertion.
+		 */
+		if (from == keepstale)
+			lowstale = highstale = to;
+		/*
+		 * Copy only the entries that have moved.
+		 */
+		if (from > to)
+			leaf->ents[to] = leaf->ents[from];
+		to++;
+	}
+	ASSERT(from > to);
+	/*
+	 * If the insertion point was past the last entry,
+	 * set the new insertion point accordingly.
+	 */
+	if (index == from)
+		newindex = to;
+	*indexp = newindex;
+	/*
+	 * Adjust the leaf header values.
+	 */
+	INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(from - to));
+	INT_SET(leaf->hdr.stale, ARCH_CONVERT, 1);
+	/*
+	 * Remember the low/high stale value only in the "right"
+	 * direction.
+	 */
+	if (lowstale >= newindex)
+		lowstale = -1;
+	else
+		highstale = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	*highlogp = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1;
+	*lowstalep = lowstale;
+	*highstalep = highstale;
+}
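+
+/*
+ * Worked example: entries at indices 0-4 with stale entries at 1 and
+ * 3, insertion index 3.  keepstale is 3 (nearest the index), entry 1
+ * is squeezed out, entries 2-4 shift down one, and the new insertion
+ * index comes back as 2, with the kept stale entry right at it.
+ */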
+
+/*
+ * Getdents (readdir) for leaf and node directories.
+ * This reads the data blocks only, so is the same for both forms.
+ */
+int						/* error */
+xfs_dir2_leaf_getdents(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*dp,		/* incore directory inode */
+	uio_t			*uio,		/* I/O control & vectors */
+	int			*eofp,		/* out: reached end of dir */
+	xfs_dirent_t		*dbp,		/* caller's buffer */
+	xfs_dir2_put_t		put)		/* ABI formatting routine */
+{
+	xfs_dabuf_t		*bp;		/* data block buffer */
+	int			byteoff;	/* offset in current block */
+	xfs_dir2_db_t		curdb;		/* db for current block */
+	xfs_dir2_off_t		curoff;		/* current overall offset */
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry */
+	int			eof;		/* reached end of directory */
+	int			error = 0;	/* error return value */
+	int			i;		/* temporary loop index */
+	int			j;		/* temporary loop index */
+	int			length;		/* temporary length value */
+	xfs_bmbt_irec_t		*map;		/* map vector for blocks */
+	xfs_extlen_t		map_blocks;	/* number of fsbs in map */
+	xfs_dablk_t		map_off;	/* last mapped file offset */
+	int			map_size;	/* total entries in *map */
+	int			map_valid;	/* valid entries in *map */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
+	int			nmap;		/* mappings to ask xfs_bmapi */
+	xfs_dir2_put_args_t	p;		/* formatting arg bundle */
+	char			*ptr = NULL;	/* pointer to current data */
+	int			ra_current;	/* number of read-ahead blks */
+	int			ra_index;	/* *map index for read-ahead */
+	int			ra_offset;	/* map entry offset for ra */
+	int			ra_want;	/* readahead count wanted */
+
+	/*
+	 * If the offset is at or past the largest allowed value,
+	 * give up right away, return eof.
+	 */
+	if (uio->uio_offset >= XFS_DIR2_MAX_DATAPTR) {
+		*eofp = 1;
+		return 0;
+	}
+	mp = dp->i_mount;
+	/*
+	 * Set up formatting arguments.
+	 */
+	p.dbp = dbp;
+	p.put = put;
+	p.uio = uio;
+	/*
+	 * Set up to bmap a number of blocks based on the caller's
+	 * buffer size, the directory block size, and the filesystem
+	 * block size.
+	 */
+	map_size =
+		howmany(uio->uio_resid + mp->m_dirblksize,
+			mp->m_sb.sb_blocksize);
+	map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);
+	map_valid = ra_index = ra_offset = ra_current = map_blocks = 0;
+	bp = NULL;
+	eof = 1;
+	/*
+	 * Inside the loop we keep the main offset value as a byte offset
+	 * in the directory file.
+	 */
+	curoff = XFS_DIR2_DATAPTR_TO_BYTE(mp, uio->uio_offset);
+	/*
+	 * Force this conversion through db so we truncate the offset
+	 * down to get the start of the data block.
+	 */
+	map_off = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, curoff));
+	/*
+	 * Loop over directory entries until we reach the end offset.
+	 * Get more blocks and readahead as necessary.
+	 */
+	while (curoff < XFS_DIR2_LEAF_OFFSET) {
+		/*
+		 * If we have no buffer, or we're off the end of the
+		 * current buffer, need to get another one.
+		 */
+		if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) {
+			/*
+			 * If we have a buffer, we need to release it and
+			 * take it out of the mapping.
+			 */
+			if (bp) {
+				xfs_da_brelse(tp, bp);
+				bp = NULL;
+				map_blocks -= mp->m_dirblkfsbs;
+				/*
+				 * Loop to get rid of the extents for the
+				 * directory block.
+				 */
+				for (i = mp->m_dirblkfsbs; i > 0; ) {
+					j = MIN((int)map->br_blockcount, i);
+					map->br_blockcount -= j;
+					map->br_startblock += j;
+					map->br_startoff += j;
+					/*
+					 * If mapping is done, pitch it from
+					 * the table.
+					 */
+					if (!map->br_blockcount && --map_valid)
+						ovbcopy(&map[1], &map[0],
+							sizeof(map[0]) *
+							map_valid);
+					i -= j;
+				}
+			}
+			/*
+			 * Recalculate the readahead blocks wanted.
+			 */
+			ra_want = howmany(uio->uio_resid + mp->m_dirblksize,
+					  mp->m_sb.sb_blocksize) - 1;
+			/*
+			 * If we don't have as many as we want, and we haven't
+			 * run out of data blocks, get some more mappings.
+			 */
+			if (1 + ra_want > map_blocks &&
+			    map_off <
+			    XFS_DIR2_BYTE_TO_DA(mp, XFS_DIR2_LEAF_OFFSET)) {
+				/*
+				 * Get more bmaps, fill in after the ones
+				 * we already have in the table.
+				 */
+				nmap = map_size - map_valid;
+				error = xfs_bmapi(tp, dp,
+					map_off,
+					XFS_DIR2_BYTE_TO_DA(mp,
+						XFS_DIR2_LEAF_OFFSET) - map_off,
+					XFS_BMAPI_METADATA, NULL, 0,
+					&map[map_valid], &nmap, NULL);
+				/*
+				 * Don't know if we should ignore this or
+				 * try to return an error.
+				 * The trouble with returning errors
+				 * is that readdir will just stop without
+				 * actually passing the error through.
+				 */
+				if (error)
+					break;	/* XXX */
+				/*
+				 * If we got all the mappings we asked for,
+				 * set the final map offset based on the
+				 * last bmap value received.
+				 * Otherwise, we've reached the end.
+				 */
+				if (nmap == map_size - map_valid)
+					map_off =
+					map[map_valid + nmap - 1].br_startoff +
+					map[map_valid + nmap - 1].br_blockcount;
+				else
+					map_off =
+						XFS_DIR2_BYTE_TO_DA(mp,
+							XFS_DIR2_LEAF_OFFSET);
+				/*
+				 * Look for holes in the mapping, and
+				 * eliminate them.  Count up the valid blocks.
+				 */
+				for (i = map_valid; i < map_valid + nmap; ) {
+					if (map[i].br_startblock ==
+					    HOLESTARTBLOCK) {
+						nmap--;
+						length = map_valid + nmap - i;
+						if (length)
+							ovbcopy(&map[i + 1],
+								&map[i],
+								sizeof(map[i]) *
+								length);
+					} else {
+						map_blocks +=
+							map[i].br_blockcount;
+						i++;
+					}
+				}
+				map_valid += nmap;
+			}
+			/*
+			 * No valid mappings, so no more data blocks.
+			 */
+			if (!map_valid) {
+				curoff = XFS_DIR2_DA_TO_BYTE(mp, map_off);
+				break;
+			}
+			/*
+			 * Read the directory block starting at the first
+			 * mapping.
+			 */
+			curdb = XFS_DIR2_DA_TO_DB(mp, map->br_startoff);
+			error = xfs_da_read_buf(tp, dp, map->br_startoff,
+				map->br_blockcount >= mp->m_dirblkfsbs ?
+				    XFS_FSB_TO_DADDR(mp, map->br_startblock) :
+				    -1,
+				&bp, XFS_DATA_FORK);
+			/*
+			 * Should just skip over the data block instead
+			 * of giving up.
+			 */
+			if (error)
+				break;	/* XXX */
+			/*
+			 * Adjust the current amount of read-ahead: we just
+			 * read a block that was previously ra.
+			 */
+			if (ra_current)
+				ra_current -= mp->m_dirblkfsbs;
+			/*
+			 * Do we need more readahead?
+			 */
+			for (ra_index = ra_offset = i = 0;
+			     ra_want > ra_current && i < map_blocks;
+			     i += mp->m_dirblkfsbs) {
+				ASSERT(ra_index < map_valid);
+				/*
+				 * Read-ahead a contiguous directory block.
+				 */
+				if (i > ra_current &&
+				    map[ra_index].br_blockcount >=
+				    mp->m_dirblkfsbs) {
+					xfs_baread(mp->m_ddev_targp,
+						XFS_FSB_TO_DADDR(mp,
+						   map[ra_index].br_startblock +
+						   ra_offset),
+						(int)BTOBB(mp->m_dirblksize));
+					ra_current = i;
+				}
+				/*
+				 * Read-ahead a non-contiguous directory block.
+				 * This doesn't use our mapping, but this
+				 * is a very rare case.
+				 */
+				else if (i > ra_current) {
+					(void)xfs_da_reada_buf(tp, dp,
+						map[ra_index].br_startoff +
+						ra_offset, XFS_DATA_FORK);
+					ra_current = i;
+				}
+				/*
+				 * Advance offset through the mapping table.
+				 */
+				for (j = 0; j < mp->m_dirblkfsbs; j++) {
+					/*
+					 * The rest of this extent but not
+					 * more than a dir block.
+					 */
+					length = MIN(mp->m_dirblkfsbs,
+						(int)(map[ra_index].br_blockcount -
+						ra_offset));
+					j += length;
+					ra_offset += length;
+					/*
+					 * Advance to the next mapping if
+					 * this one is used up.
+					 */
+					if (ra_offset ==
+					    map[ra_index].br_blockcount) {
+						ra_offset = 0;
+						ra_index++;
+					}
+				}
+			}
+			/*
+			 * Having done a read, we need to set a new offset.
+			 */
+			newoff = XFS_DIR2_DB_OFF_TO_BYTE(mp, curdb, 0);
+			/*
+			 * Start of the current block.
+			 */
+			if (curoff < newoff)
+				curoff = newoff;
+			/*
+			 * Make sure we're in the right block.
+			 */
+			else if (curoff > newoff)
+				ASSERT(XFS_DIR2_BYTE_TO_DB(mp, curoff) ==
+				       curdb);
+			data = bp->data;
+			xfs_dir2_data_check(dp, bp);
+			/*
+			 * Find our position in the block.
+			 */
+			ptr = (char *)&data->u;
+			byteoff = XFS_DIR2_BYTE_TO_OFF(mp, curoff);
+			/*
+			 * Skip past the header.
+			 */
+			if (byteoff == 0)
+				curoff += (uint)sizeof(data->hdr);
+			/*
+			 * Skip past entries until we reach our offset.
+			 */
+			else {
+				while ((char *)ptr - (char *)data < byteoff) {
+					dup = (xfs_dir2_data_unused_t *)ptr;
+
+					if (INT_GET(dup->freetag, ARCH_CONVERT)
+						  == XFS_DIR2_DATA_FREE_TAG) {
+
+						length = INT_GET(dup->length,
+								 ARCH_CONVERT);
+						ptr += length;
+						continue;
+					}
+					dep = (xfs_dir2_data_entry_t *)ptr;
+					length =
+					   XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+					ptr += length;
+				}
+				/*
+				 * Now set our real offset.
+				 */
+				curoff =
+					XFS_DIR2_DB_OFF_TO_BYTE(mp,
+					    XFS_DIR2_BYTE_TO_DB(mp, curoff),
+					    (char *)ptr - (char *)data);
+			}
+		}
+		/*
+		 * We have a pointer to an entry.
+		 * Is it a live one?
+		 */
+		dup = (xfs_dir2_data_unused_t *)ptr;
+		/*
+		 * No, it's unused, skip over it.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT)
+						== XFS_DIR2_DATA_FREE_TAG) {
+			length = INT_GET(dup->length, ARCH_CONVERT);
+			ptr += length;
+			curoff += length;
+			continue;
+		}
+
+		/*
+		 * Copy the entry into the putargs, and try formatting it.
+		 */
+		dep = (xfs_dir2_data_entry_t *)ptr;
+
+		p.namelen = dep->namelen;
+
+		length = XFS_DIR2_DATA_ENTSIZE(p.namelen);
+
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		p.cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff);
+
+#if XFS_BIG_FILESYSTEMS
+		p.ino = INT_GET(dep->inumber, ARCH_CONVERT) + mp->m_inoadd;
+#else
+		p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
+#endif
+		p.name = (char *)dep->name;
+
+		error = p.put(&p);
+
+		/*
+		 * Won't fit.  Return to caller.
+		 */
+		if (!p.done) {
+			eof = 0;
+			break;
+		}
+		/*
+		 * Advance to next entry in the block.
+		 */
+		ptr += length;
+		curoff += length;
+	}
+
+	/*
+	 * All done.  Set output offset value to current offset.
+	 */
+	*eofp = eof;
+	if (curoff > XFS_DIR2_DATAPTR_TO_BYTE(mp, XFS_DIR2_MAX_DATAPTR))
+		uio->uio_offset = XFS_DIR2_MAX_DATAPTR;
+	else
+		uio->uio_offset = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff);
+	kmem_free(map, map_size * sizeof(*map));
+	if (bp)
+		xfs_da_brelse(tp, bp);
+	return error;
+}
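+
+/*
+ * Note on the offset cookie: uio_offset is kept as a "dataptr", the
+ * byte offset within the directory compressed by the 8-byte entry
+ * alignment (essentially byte >> XFS_DIR2_DATA_ALIGN_LOG), so every
+ * aligned entry gets a distinct, compact cookie value.
+ */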
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+int
+xfs_dir2_leaf_init(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dir2_db_t		bno,		/* directory block number */
+	xfs_dabuf_t		**bpp,		/* out: leaf buffer */
+	int			magic)		/* magic number for block */
+{
+	xfs_dabuf_t		*bp;		/* leaf buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	tp = args->trans;
+	mp = dp->i_mount;
+	ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
+	       bno < XFS_DIR2_FREE_FIRSTDB(mp));
+	/*
+	 * Get the buffer for the block.
+	 */
+	error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, bno), -1, &bp,
+		XFS_DATA_FORK);
+	if (error) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	/*
+	 * Initialize the header.
+	 */
+	INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, magic);
+	INT_ZERO(leaf->hdr.info.forw, ARCH_CONVERT);
+	INT_ZERO(leaf->hdr.info.back, ARCH_CONVERT);
+	INT_ZERO(leaf->hdr.count, ARCH_CONVERT);
+	INT_ZERO(leaf->hdr.stale, ARCH_CONVERT);
+	xfs_dir2_leaf_log_header(tp, bp);
+	/*
+	 * If it's a leaf-format directory initialize the tail.
+	 * In this case our caller has the real bests table to copy into
+	 * the block.
+	 */
+	if (magic == XFS_DIR2_LEAF1_MAGIC) {
+		ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+		INT_ZERO(ltp->bestcount, ARCH_CONVERT);
+		xfs_dir2_leaf_log_tail(tp, bp);
+	}
+	*bpp = bp;
+	return 0;
+}
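+
+/*
+ * The four log_* routines that follow share a pattern: convert a
+ * range of structures into a byte range within the leaf buffer,
+ * inclusive of the last byte, and hand that to xfs_da_log_buf so
+ * only the touched bytes are logged in the transaction.
+ */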
+
+/*
+ * Log the bests entries indicated from a leaf1 block.
+ */
+void
+xfs_dir2_leaf_log_bests(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	int			first,		/* first entry to log */
+	int			last)		/* last entry to log */
+{
+	xfs_dir2_data_off_t	*firstb;	/* pointer to first entry */
+	xfs_dir2_data_off_t	*lastb;		/* pointer to last entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
+	ltp = XFS_DIR2_LEAF_TAIL_P(tp->t_mountp, leaf);
+	firstb = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT) + first;
+	lastb = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT) + last;
+	xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
+		(uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
+}
+
+/*
+ * Log the leaf entries indicated from a leaf1 or leafn block.
+ */
+void
+xfs_dir2_leaf_log_ents(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	int			first,		/* first entry to log */
+	int			last)		/* last entry to log */
+{
+	xfs_dir2_leaf_entry_t	*firstlep;	/* pointer to first entry */
+	xfs_dir2_leaf_entry_t	*lastlep;	/* pointer to last entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC ||
+	       INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	firstlep = &leaf->ents[first];
+	lastlep = &leaf->ents[last];
+	xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
+		(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
+}
+
+/*
+ * Log the header of the leaf1 or leafn block.
+ */
+void
+xfs_dir2_leaf_log_header(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp)		/* leaf buffer */
+{
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC ||
+	       INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
+		(uint)(sizeof(leaf->hdr) - 1));
+}
+
+/*
+ * Log the tail of the leaf1 block.
+ */
+void
+xfs_dir2_leaf_log_tail(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp)		/* leaf buffer */
+{
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	mp = tp->t_mountp;
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
+		(uint)(mp->m_dirblksize - 1));
+}
+
+/*
+ * Look up the entry referred to by args in the leaf format directory.
+ * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
+ * is also used by the node-format code.
+ */
+int
+xfs_dir2_leaf_lookup(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* found entry index */
+	xfs_dabuf_t		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args("leaf_lookup", args);
+	/*
+	 * Look up name in the leaf block, returning both buffers and index.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	tp = args->trans;
+	dp = args->dp;
+	xfs_dir2_leaf_check(dp, lbp);
+	leaf = lbp->data;
+	/*
+	 * Get to the leaf entry and contained data entry address.
+	 */
+	lep = &leaf->ents[index];
+	/*
+	 * Point to the data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)dbp->data +
+	       XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT)));
+	/*
+	 * Return the found inode number.
+	 */
+	args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+	xfs_da_brelse(tp, dbp);
+	xfs_da_brelse(tp, lbp);
+	return XFS_ERROR(EEXIST);
+}
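+
+/*
+ * Note the return convention above: a successful lookup comes back
+ * as EEXIST with args->inumber filled in; the caller treats EEXIST
+ * as "found".
+ */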
+
+/*
+ * Look up name/hash in the leaf block.
+ * Fill in indexp with the found index, and dbpp with the data buffer.
+ * If not found dbpp will be NULL, and ENOENT comes back.
+ * lbpp will always be filled in with the leaf buffer unless there's an error.
+ */
+static int					/* error */
+xfs_dir2_leaf_lookup_int(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		**lbpp,		/* out: leaf buffer */
+	int			*indexp,	/* out: index in leaf block */
+	xfs_dabuf_t		**dbpp)		/* out: data buffer */
+{
+	xfs_dir2_db_t		curdb;		/* current data block number */
+	xfs_dabuf_t		*dbp;		/* data buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* index in leaf block */
+	xfs_dabuf_t		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_db_t		newdb;		/* new data block number */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Read the leaf block into the buffer.
+	 */
+	if ((error =
+	    xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
+		    XFS_DATA_FORK))) {
+		return error;
+	}
+	*lbpp = lbp;
+	leaf = lbp->data;
+	xfs_dir2_leaf_check(dp, lbp);
+	/*
+	 * Look for the first leaf entry with our hash value.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, lbp);
+	/*
+	 * Loop over all the entries with the right hash value
+	 * looking to match the name.
+	 */
+	for (lep = &leaf->ents[index], dbp = NULL, curdb = -1;
+	     index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
+	     lep++, index++) {
+		/*
+		 * Skip over stale leaf entries.
+		 */
+		if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Get the new data block number.
+		 */
+		newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+		/*
+		 * If it's not the same as the old data block number,
+		 * need to pitch the old one and read the new one.
+		 */
+		if (newdb != curdb) {
+			if (dbp)
+				xfs_da_brelse(tp, dbp);
+			if ((error =
+			    xfs_da_read_buf(tp, dp,
+				    XFS_DIR2_DB_TO_DA(mp, newdb), -1, &dbp,
+				    XFS_DATA_FORK))) {
+				xfs_da_brelse(tp, lbp);
+				return error;
+			}
+			xfs_dir2_data_check(dp, dbp);
+			curdb = newdb;
+		}
+		/*
+		 * Point to the data entry.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+		      ((char *)dbp->data +
+		       XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
+		/*
+		 * If it matches then return it.
+		 */
+		if (dep->namelen == args->namelen &&
+		    dep->name[0] == args->name[0] &&
+		    bcmp(dep->name, args->name, args->namelen) == 0) {
+			*dbpp = dbp;
+			*indexp = index;
+			return 0;
+		}
+	}
+	/*
+	 * No match found, return ENOENT.
+	 */
+	ASSERT(args->oknoent);
+	if (dbp)
+		xfs_da_brelse(tp, dbp);
+	xfs_da_brelse(tp, lbp);
+	return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Remove an entry from a leaf format directory.
+ */
+int						/* error */
+xfs_dir2_leaf_removename(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dir2_data_off_t	*bestsp;	/* leaf block best freespace */
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dir2_db_t		db;		/* data block number */
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry structure */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dir2_db_t		i;		/* temporary data block # */
+	int			index;		/* index into leaf entries */
+	xfs_dabuf_t		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	xfs_dir2_data_off_t	oldbest;	/* old value of best free */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args("leaf_removename", args);
+	/*
+	 * Lookup the leaf entry, get the leaf and data blocks read in.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = lbp->data;
+	data = dbp->data;
+	xfs_dir2_data_check(dp, dbp);
+	/*
+	 * Point to the leaf entry, use that to point to the data entry.
+	 */
+	lep = &leaf->ents[index];
+	db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)data + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
+	needscan = needlog = 0;
+	oldbest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	bestsp = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT);
+	ASSERT(INT_GET(bestsp[db], ARCH_CONVERT) == oldbest);
+	/*
+	 * Mark the former data entry unused.
+	 */
+	xfs_dir2_data_make_free(tp, dbp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)data),
+		XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
+	/*
+	 * We just mark the leaf entry stale by putting a null in it.
+	 */
+	INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1);
+	xfs_dir2_leaf_log_header(tp, lbp);
+	INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_leaf_log_ents(tp, lbp, index, index);
+	/*
+	 * Scan the freespace in the data block again if necessary,
+	 * log the data block header if necessary.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	/*
+	 * If the longest freespace in the data block has changed,
+	 * put the new value in the bests table and log that.
+	 */
+	if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) != oldbest) {
+		INT_COPY(bestsp[db], data->hdr.bestfree[0].length, ARCH_CONVERT);
+		xfs_dir2_leaf_log_bests(tp, lbp, db, db);
+	}
+	xfs_dir2_data_check(dp, dbp);
+	/*
+	 * If the data block is now empty then get rid of the data block.
+	 */
+	if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) ==
+	    mp->m_dirblksize - (uint)sizeof(data->hdr)) {
+		ASSERT(db != mp->m_dirdatablk);
+		if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+			/*
+			 * Can't get rid of it: doing so would have
+			 * required allocating a bmap btree block, and
+			 * there is no space reservation to cover that.
+			 * Just go on, returning success, leaving the
+			 * empty block in place.
+			if (error == ENOSPC && args->total == 0) {
+				xfs_da_buf_done(dbp);
+				error = 0;
+			}
+			xfs_dir2_leaf_check(dp, lbp);
+			xfs_da_buf_done(lbp);
+			return error;
+		}
+		dbp = NULL;
+		/*
+		 * If this is the last data block then compact the
+		 * bests table by getting rid of entries.
+		 */
+		if (db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1) {
+			/*
+			 * Look for the last active entry (i).
+			 */
+			for (i = db - 1; i > 0; i--) {
+				if (INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF)
+					break;
+			}
+			/*
+			 * Copy the table down so inactive entries at the
+			 * end are removed.
+			 */
+			ovbcopy(bestsp, &bestsp[db - i],
+				(INT_GET(ltp->bestcount, ARCH_CONVERT) - (db - i)) * sizeof(*bestsp));
+			INT_MOD(ltp->bestcount, ARCH_CONVERT, -(db - i));
+			xfs_dir2_leaf_log_tail(tp, lbp);
+			xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+		} else
+			INT_SET(bestsp[db], ARCH_CONVERT, NULLDATAOFF);
+	}
+	/*
+	 * If the data block was not the first one, drop it.
+	 */
+	else if (db != mp->m_dirdatablk && dbp != NULL) {
+		xfs_da_buf_done(dbp);
+		dbp = NULL;
+	}
+	xfs_dir2_leaf_check(dp, lbp);
+	/*
+	 * See if we can convert to block form.
+	 */
+	return xfs_dir2_leaf_to_block(args, lbp, dbp);
+}
+
+/*
+ * Replace the inode number in a leaf format directory entry.
+ */
+int						/* error */
+xfs_dir2_leaf_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* index of leaf entry */
+	xfs_dabuf_t		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args("leaf_replace", args);
+	/*
+	 * Look up the entry.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	dp = args->dp;
+	leaf = lbp->data;
+	/*
+	 * Point to the leaf entry, get data address from it.
+	 */
+	lep = &leaf->ents[index];
+	/*
+	 * Point to the data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)dbp->data +
+	       XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT)));
+	ASSERT(args->inumber != INT_GET(dep->inumber, ARCH_CONVERT));
+	/*
+	 * Put the new inode number in, log it.
+	 */
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	tp = args->trans;
+	xfs_dir2_data_log_entry(tp, dbp, dep);
+	xfs_da_buf_done(dbp);
+	xfs_dir2_leaf_check(dp, lbp);
+	xfs_da_brelse(tp, lbp);
+	return 0;
+}
+
+/*
+ * Return index in the leaf block (lbp) which is either the first
+ * one with this hash value, or if there are none, the insert point
+ * for that hash value.
+ */
+int						/* index value */
+xfs_dir2_leaf_search_hash(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*lbp)		/* leaf buffer */
+{
+	xfs_dahash_t		hash = 0;	/* hash from this entry */
+	xfs_dahash_t		hashwant;	/* hash value looking for */
+	int			high;		/* high leaf index */
+	int			low;		/* low leaf index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			mid = 0;	/* current leaf index */
+
+	leaf = lbp->data;
+#ifndef __KERNEL__
+	if (INT_ISZERO(leaf->hdr.count, ARCH_CONVERT))
+		return 0;
+#endif
+	/*
+	 * Note, the table cannot be empty, so we have to go through the loop.
+	 * Binary search the leaf entries looking for our hash value.
+	 */
+	for (lep = leaf->ents, low = 0, high = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1,
+		hashwant = args->hashval;
+	     low <= high; ) {
+		mid = (low + high) >> 1;
+		if ((hash = INT_GET(lep[mid].hashval, ARCH_CONVERT)) == hashwant)
+			break;
+		if (hash < hashwant)
+			low = mid + 1;
+		else
+			high = mid - 1;
+	}
+	/*
+	 * Found one, back up through all the equal hash values.
+	 */
+	if (hash == hashwant) {
+		while (mid > 0 && INT_GET(lep[mid - 1].hashval, ARCH_CONVERT) == hashwant) {
+			mid--;
+		}
+	}
+	/*
+	 * Need to point to an entry higher than ours.
+	 */
+	else if (hash < hashwant)
+		mid++;
+	return mid;
+}
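+
+/*
+ * Worked example: with hashvals { 10, 20, 20, 20, 30 } and hashwant
+ * 20, the binary search hits one of the 20s and the back-up loop
+ * returns index 1, the first duplicate; with hashwant 25 it returns
+ * 4, the insertion point in front of the 30.
+ */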
+
+/*
+ * Trim off a trailing data block.  We know it's empty since the leaf
+ * freespace table says so.
+ */
+int						/* error */
+xfs_dir2_leaf_trim_data(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*lbp,		/* leaf buffer */
+	xfs_dir2_db_t		db)		/* data block number */
+{
+	xfs_dir2_data_off_t	*bestsp;	/* leaf bests table */
+#ifdef DEBUG
+	xfs_dir2_data_t		*data;		/* data block structure */
+#endif
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Read the offending data block.  We need its buffer.
+	 */
+	if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, db), -1, &dbp,
+			XFS_DATA_FORK))) {
+		return error;
+	}
+#ifdef DEBUG
+	data = dbp->data;
+	ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
+#endif
+	/*
+	 * Note that data is only declared and set up under DEBUG, but
+	 * the ASSERTs below that reference it also compile away when
+	 * DEBUG is not defined, so this is consistent.
+	 */
+
+	leaf = lbp->data;
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) ==
+	       mp->m_dirblksize - (uint)sizeof(data->hdr));
+	ASSERT(db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+	/*
+	 * Get rid of the data block.
+	 */
+	if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+		ASSERT(error != ENOSPC);
+		xfs_da_brelse(tp, dbp);
+		return error;
+	}
+	/*
+	 * Eliminate the last bests entry from the table.
+	 */
+	bestsp = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT);
+	INT_MOD(ltp->bestcount, ARCH_CONVERT, -1);
+	ovbcopy(&bestsp[0], &bestsp[1], INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(*bestsp));
+	xfs_dir2_leaf_log_tail(tp, lbp);
+	xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+	return 0;
+}
+
+/*
+ * Convert node form directory to leaf form directory.
+ * The root of the node form dir needs to already be a LEAFN block.
+ * Just return if we can't do anything.
+ */
+int						/* error */
+xfs_dir2_node_to_leaf(
+	xfs_da_state_t		*state)		/* directory operation state */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dabuf_t		*fbp;		/* buffer for freespace block */
+	xfs_fileoff_t		fo;		/* freespace file offset */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	xfs_dabuf_t		*lbp;		/* buffer for leaf block */
+	xfs_dir2_leaf_tail_t	*ltp;		/* tail of leaf structure */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			rval;		/* successful free trim? */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	/*
+	 * There's more than a leaf level in the btree, so there must
+	 * be multiple leafn blocks.  Give up.
+	 */
+	if (state->path.active > 1)
+		return 0;
+	args = state->args;
+	xfs_dir2_trace_args("node_to_leaf", args);
+	mp = state->mp;
+	dp = args->dp;
+	tp = args->trans;
+	/*
+	 * Get the last offset in the file.
+	 */
+	if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	fo -= mp->m_dirblkfsbs;
+	/*
+	 * If there are freespace blocks other than the first one,
+	 * take this opportunity to remove trailing empty freespace blocks
+	 * that may have been left behind during no-space-reservation
+	 * operations.
+	 */
+	while (fo > mp->m_dirfreeblk) {
+		if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
+			return error;
+		}
+		if (rval)
+			fo -= mp->m_dirblkfsbs;
+		else
+			return 0;
+	}
+	/*
+	 * Now find the block just before the freespace block.
+	 */
+	if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	/*
+	 * If it's not the single leaf block, give up.
+	 */
+	if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
+		return 0;
+	lbp = state->path.blk[0].bp;
+	leaf = lbp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Read the freespace block.
+	 */
+	if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
+			XFS_DATA_FORK))) {
+		return error;
+	}
+	free = fbp->data;
+	ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	ASSERT(INT_ISZERO(free->hdr.firstdb, ARCH_CONVERT));
+	/*
+	 * Now see if the leafn and free data will fit in a leaf1.
+	 * If not, release the buffer and give up.
+	 */
+	if ((uint)sizeof(leaf->hdr) +
+	    (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT)) * (uint)sizeof(leaf->ents[0]) +
+	    INT_GET(free->hdr.nvalid, ARCH_CONVERT) * (uint)sizeof(leaf->bests[0]) +
+	    (uint)sizeof(leaf->tail) >
+	    mp->m_dirblksize) {
+		xfs_da_brelse(tp, fbp);
+		return 0;
+	}
+	/*
+	 * If the leaf has any stale entries in it, compress them out.
+	 * The compact routine will log the header.
+	 */
+	if (INT_GET(leaf->hdr.stale, ARCH_CONVERT))
+		xfs_dir2_leaf_compact(args, lbp);
+	else
+		xfs_dir2_leaf_log_header(tp, lbp);
+	INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAF1_MAGIC);
+	/*
+	 * Set up the leaf tail from the freespace block.
+	 */
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	INT_COPY(ltp->bestcount, free->hdr.nvalid, ARCH_CONVERT);
+	/*
+	 * Set up the leaf bests table.
+	 */
+	bcopy(free->bests, XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT),
+		INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(leaf->bests[0]));
+	xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+	xfs_dir2_leaf_log_tail(tp, lbp);
+	xfs_dir2_leaf_check(dp, lbp);
+	/*
+	 * Get rid of the freespace block.
+	 */
+	error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp);
+	if (error) {
+		/*
+		 * This can't fail here because it can only happen when
+		 * punching out the middle of an extent, and this is an
+		 * isolated block.
+		 */
+		ASSERT(error != ENOSPC);
+		return error;
+	}
+	fbp = NULL;
+	/*
+	 * Now see if we can convert the single-leaf directory
+	 * down to a block form directory.
+	 * This routine always kills the dabuf for the leaf, so
+	 * eliminate it from the path.
+	 */
+	error = xfs_dir2_leaf_to_block(args, lbp, NULL);
+	state->path.blk[0].bp = NULL;
+	return error;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_leaf.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_leaf.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_leaf.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_leaf.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_LEAF_H__
+#define __XFS_DIR2_LEAF_H__
+
+/*
+ * Directory version 2, leaf block structures.
+ */
+
+struct dirent;
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Constants.
+ */
+
+/*
+ * Offset of the leaf/node space.  First block in this space
+ * is the btree root.
+ */
+#define XFS_DIR2_LEAF_SPACE	1
+#define XFS_DIR2_LEAF_OFFSET	(XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
+#define XFS_DIR2_LEAF_FIRSTDB(mp)	\
+	XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_LEAF_OFFSET)
+
+/*
+ * Types.
+ */
+
+/*
+ * Offset in data space of a data entry.
+ */
+typedef __uint32_t	xfs_dir2_dataptr_t;
+#define XFS_DIR2_MAX_DATAPTR	((xfs_dir2_dataptr_t)0x7fffffff)
+#define XFS_DIR2_NULL_DATAPTR	((xfs_dir2_dataptr_t)0)
+
+/*
+ * Structures.
+ */
+
+/*
+ * Leaf block header.
+ */
+typedef struct xfs_dir2_leaf_hdr {
+	xfs_da_blkinfo_t	info;		/* header for da routines */
+	__uint16_t		count;		/* count of entries */
+	__uint16_t		stale;		/* count of stale entries */
+} xfs_dir2_leaf_hdr_t;
+
+/*
+ * Leaf block entry.
+ */
+typedef struct xfs_dir2_leaf_entry {
+	xfs_dahash_t		hashval;	/* hash value of name */
+	xfs_dir2_dataptr_t	address;	/* address of data entry */
+} xfs_dir2_leaf_entry_t;
+
+/*
+ * Leaf block tail.
+ */
+typedef struct xfs_dir2_leaf_tail {
+	__uint32_t		bestcount;
+} xfs_dir2_leaf_tail_t;
+
+/*
+ * Leaf block.
+ * bests and tail are at the end of the block for single-leaf only
+ * (magic = XFS_DIR2_LEAF1_MAGIC not XFS_DIR2_LEAFN_MAGIC).
+ */
+typedef struct xfs_dir2_leaf {
+	xfs_dir2_leaf_hdr_t	hdr;		/* leaf header */
+	xfs_dir2_leaf_entry_t	ents[1];	/* entries */
+						/* ... */
+	xfs_dir2_data_off_t	bests[1];	/* best free counts */
+	xfs_dir2_leaf_tail_t	tail;		/* leaf tail */
+} xfs_dir2_leaf_t;
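+
+/*
+ * Rough layout of a single-leaf (LEAF1) block, per the macros below:
+ *
+ *	+-----+----------------+--------+---------------------+------+
+ *	| hdr | ents[0..count) | unused | bests[0..bestcount) | tail |
+ *	+-----+----------------+--------+---------------------+------+
+ *	0						  m_dirblksize ^
+ *
+ * The tail sits at the very end of the block and the bests array
+ * grows backward from it, so both are located relative to the block
+ * size rather than to the header.
+ */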
+
+/*
+ * Macros.
+ * The DB blocks are logical directory block numbers, not filesystem blocks.
+ */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_MAX_LEAF_ENTS)
+int
+xfs_dir2_max_leaf_ents(struct xfs_mount *mp);
+#define XFS_DIR2_MAX_LEAF_ENTS(mp)	\
+	xfs_dir2_max_leaf_ents(mp)
+#else
+#define XFS_DIR2_MAX_LEAF_ENTS(mp)	\
+	((int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / \
+	       (uint)sizeof(xfs_dir2_leaf_entry_t)))
+#endif
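+
+/*
+ * For example, with 4k directory blocks (m_dirblksize == 4096) and,
+ * as assumed here, a 16-byte leaf header and 8-byte leaf entries,
+ * this works out to (4096 - 16) / 8 == 510 entries per leaf block.
+ */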
+
+/*
+ * Get address of the bestcount field in the single-leaf block.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_LEAF_TAIL_P)
+xfs_dir2_leaf_tail_t *
+xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp);
+#define XFS_DIR2_LEAF_TAIL_P(mp,lp)	\
+	xfs_dir2_leaf_tail_p(mp, lp)
+#else
+#define XFS_DIR2_LEAF_TAIL_P(mp,lp)	\
+	((xfs_dir2_leaf_tail_t *)\
+	 ((char *)(lp) + (mp)->m_dirblksize - \
+	  (uint)sizeof(xfs_dir2_leaf_tail_t)))
+#endif
+
+/*
+ * Get address of the bests array in the single-leaf block.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_LEAF_BESTS_P)
+xfs_dir2_data_off_t *
+xfs_dir2_leaf_bests_p_arch(xfs_dir2_leaf_tail_t *ltp, xfs_arch_t arch);
+#define XFS_DIR2_LEAF_BESTS_P_ARCH(ltp,arch)	xfs_dir2_leaf_bests_p_arch(ltp,arch)
+#else
+#define XFS_DIR2_LEAF_BESTS_P_ARCH(ltp,arch)	\
+	((xfs_dir2_data_off_t *)(ltp) - INT_GET((ltp)->bestcount, arch))
+#endif
+
+/*
+ * Convert dataptr to byte in file space
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_BYTE)
+xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
+#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) xfs_dir2_dataptr_to_byte(mp, dp)
+#else
+#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) \
+	((xfs_dir2_off_t)(dp) << XFS_DIR2_DATA_ALIGN_LOG)
+#endif
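+
+/*
+ * Data entries are 8-byte aligned (assuming XFS_DIR2_DATA_ALIGN_LOG
+ * is 3), so a dataptr is just the byte offset shifted down by three
+ * bits; e.g. dataptr 0x10 names byte offset 0x80.
+ */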
+
+/*
+ * Convert byte in file space to dataptr.  It had better be aligned.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DATAPTR)
+xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by);
+#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) xfs_dir2_byte_to_dataptr(mp,by)
+#else
+#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) \
+	((xfs_dir2_dataptr_t)((by) >> XFS_DIR2_DATA_ALIGN_LOG))
+#endif
+
+/*
+ * Convert dataptr to a block number
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_DB)
+xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
+#define XFS_DIR2_DATAPTR_TO_DB(mp,dp)	xfs_dir2_dataptr_to_db(mp, dp)
+#else
+#define XFS_DIR2_DATAPTR_TO_DB(mp,dp)	\
+	XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp))
+#endif
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_OFF)
+xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
+#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp)	xfs_dir2_dataptr_to_off(mp, dp)
+#else
+#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp)	\
+	XFS_DIR2_BYTE_TO_OFF(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp))
+#endif
+
+/*
+ * Convert block and offset to byte in space
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_OFF_TO_BYTE)
+xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
+			xfs_dir2_data_aoff_t o);
+#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o)	\
+	xfs_dir2_db_off_to_byte(mp, db, o)
+#else
+#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o)	\
+	(((xfs_dir2_off_t)(db) << \
+	 ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) + (o))
+#endif
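+
+/*
+ * For example, with 4k filesystem blocks (sb_blocklog == 12) and
+ * one-block directory blocks (sb_dirblklog == 0), db 2 at offset 64
+ * maps to byte (2 << 12) + 64 == 8256 in the data space.
+ */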
+
+/*
+ * Convert byte in space to (DB) block
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DB)
+xfs_dir2_db_t xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by);
+#define XFS_DIR2_BYTE_TO_DB(mp,by)	xfs_dir2_byte_to_db(mp, by)
+#else
+#define XFS_DIR2_BYTE_TO_DB(mp,by)	\
+	((xfs_dir2_db_t)((by) >> \
+			 ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)))
+#endif
+
+/*
+ * Convert byte in space to (DA) block
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DA)
+xfs_dablk_t xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by);
+#define XFS_DIR2_BYTE_TO_DA(mp,by)	xfs_dir2_byte_to_da(mp, by)
+#else
+#define XFS_DIR2_BYTE_TO_DA(mp,by)	\
+	XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, by))
+#endif
+
+/*
+ * Convert byte in space to offset in a block
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_OFF)
+xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by);
+#define XFS_DIR2_BYTE_TO_OFF(mp,by)	xfs_dir2_byte_to_off(mp, by)
+#else
+#define XFS_DIR2_BYTE_TO_OFF(mp,by)	\
+	((xfs_dir2_data_aoff_t)((by) & \
+				((1 << ((mp)->m_sb.sb_blocklog + \
+					(mp)->m_sb.sb_dirblklog)) - 1)))
+#endif
+
+/*
+ * Convert block and offset to dataptr
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_OFF_TO_DATAPTR)
+xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
+			   xfs_dir2_data_aoff_t o);
+#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o)	\
+	xfs_dir2_db_off_to_dataptr(mp, db, o)
+#else
+#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o)	\
+	XFS_DIR2_BYTE_TO_DATAPTR(mp, XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o))
+#endif
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_DA)
+xfs_dablk_t xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db);
+#define XFS_DIR2_DB_TO_DA(mp,db)	xfs_dir2_db_to_da(mp, db)
+#else
+#define XFS_DIR2_DB_TO_DA(mp,db)	\
+	((xfs_dablk_t)((db) << (mp)->m_sb.sb_dirblklog))
+#endif
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DA_TO_DB)
+xfs_dir2_db_t xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da);
+#define XFS_DIR2_DA_TO_DB(mp,da)	xfs_dir2_da_to_db(mp, da)
+#else
+#define XFS_DIR2_DA_TO_DB(mp,da)	\
+	((xfs_dir2_db_t)((da) >> (mp)->m_sb.sb_dirblklog))
+#endif
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DA_TO_BYTE)
+xfs_dir2_off_t xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da);
+#define XFS_DIR2_DA_TO_BYTE(mp,da)	xfs_dir2_da_to_byte(mp, da)
+#else
+#define XFS_DIR2_DA_TO_BYTE(mp,da)	\
+	XFS_DIR2_DB_OFF_TO_BYTE(mp, XFS_DIR2_DA_TO_DB(mp, da), 0)
+#endif
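+
+/*
+ * Taken together, the macros above convert among the four addressing
+ * forms for directory data: byte offset in the data space, dataptr
+ * (byte >> XFS_DIR2_DATA_ALIGN_LOG), directory block number DB
+ * (byte >> (sb_blocklog + sb_dirblklog)), and da block number
+ * (DB << sb_dirblklog).  Each step is a shift, or a shift plus an
+ * offset, so the conversions compose cheaply.
+ */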
+
+/*
+ * Function declarations.
+ */
+
+extern int
+	xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_dabuf *dbp);
+
+extern int
+	xfs_dir2_leaf_addname(struct xfs_da_args *args);
+
+extern void
+	xfs_dir2_leaf_compact(struct xfs_da_args *args, struct xfs_dabuf *bp);
+
+extern void
+	xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp,
+				 int *lowstalep, int *highstalep, int *lowlogp,
+				 int *highlogp);
+
+extern int
+	xfs_dir2_leaf_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
+			       struct uio *uio, int *eofp, struct xfs_dirent *dbp,
+			       xfs_dir2_put_t put);
+
+extern int
+	xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno,
+			   struct xfs_dabuf **bpp, int magic);
+
+extern void
+	xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp,
+			       int first, int last);
+
+extern void
+	xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
+				int first, int last);
+
+extern void
+	xfs_dir2_leaf_log_header(struct xfs_trans *tp, struct xfs_dabuf *bp);
+
+extern void
+	xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp);
+
+extern int
+	xfs_dir2_leaf_lookup(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_leaf_removename(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_leaf_replace(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
+				  struct xfs_dabuf *lbp);
+extern int
+	xfs_dir2_leaf_trim_data(struct xfs_da_args *args, struct xfs_dabuf *lbp,
+				xfs_dir2_db_t db);
+
+extern int
+	xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+
+#endif	/* __XFS_DIR2_LEAF_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_node.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_node.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_node.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_node.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1976 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_node.c
+ * XFS directory implementation, version 2, node form files
+ * See data structures in xfs_dir2_node.h and xfs_da_btree.h.
+ */
+
+#include <xfs.h>
+
+/*
+ * Function declarations.
+ */
+static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp);
+static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index);
+#ifdef DEBUG
+static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
+#else
+#define xfs_dir2_leafn_check(dp, bp)
+#endif
+static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s,
+				    int start_s, xfs_dabuf_t *bp_d, int start_d,
+				    int count);
+static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
+				     xfs_da_state_blk_t *blk1,
+				     xfs_da_state_blk_t *blk2);
+static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp,
+				 int index, xfs_da_state_blk_t *dblk,
+				 int *rval);
+static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
+				     xfs_da_state_blk_t *fblk);
+
+/*
+ * Log entries from a freespace block.
+ */
+void
+xfs_dir2_free_log_bests(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* freespace buffer */
+	int			first,		/* first entry to log */
+	int			last)		/* last entry to log */
+{
+	xfs_dir2_free_t		*free;		/* freespace structure */
+
+	free = bp->data;
+	ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	xfs_da_log_buf(tp, bp,
+		(uint)((char *)&free->bests[first] - (char *)free),
+		(uint)((char *)&free->bests[last] - (char *)free +
+		       sizeof(free->bests[0]) - 1));
+}
+
+/*
+ * Log header from a freespace block.
+ */
+static void
+xfs_dir2_free_log_header(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp)		/* freespace buffer */
+{
+	xfs_dir2_free_t		*free;		/* freespace structure */
+
+	free = bp->data;
+	ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free),
+		(uint)(sizeof(xfs_dir2_free_hdr_t) - 1));
+}
+
+/*
+ * Convert a leaf-format directory to a node-format directory.
+ * We need to change the magic number of the leaf block, and copy
+ * the freespace table out of the leaf block into its own block.
+ */
+int						/* error */
+xfs_dir2_leaf_to_node(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*lbp)		/* leaf buffer */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	xfs_dabuf_t		*fbp;		/* freespace buffer */
+	xfs_dir2_db_t		fdb;		/* freespace block number */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	xfs_dir2_data_off_t	*from;		/* pointer to freespace entry */
+	int			i;		/* leaf freespace index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			n;		/* count of live freespc ents */
+	xfs_dir2_data_off_t	off;		/* freespace entry value */
+	xfs_dir2_data_off_t	*to;		/* pointer to freespace entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_b("leaf_to_node", args, lbp);
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Add a freespace block to the directory.
+	 */
+	if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
+		return error;
+	}
+	ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp));
+	/*
+	 * Get the buffer for the new freespace block.
+	 */
+	if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), -1, &fbp,
+			XFS_DATA_FORK))) {
+		return error;
+	}
+	ASSERT(fbp != NULL);
+	free = fbp->data;
+	leaf = lbp->data;
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	/*
+	 * Initialize the freespace block header.
+	 */
+	INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC);
+	INT_ZERO(free->hdr.firstdb, ARCH_CONVERT);
+	ASSERT(INT_GET(ltp->bestcount, ARCH_CONVERT) <= (uint)dp->i_d.di_size / mp->m_dirblksize);
+	INT_COPY(free->hdr.nvalid, ltp->bestcount, ARCH_CONVERT);
+	/*
+	 * Copy freespace entries from the leaf block to the new block.
+	 * Count active entries.
+	 */
+	for (i = n = 0, from = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT), to = free->bests;
+	     i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++, from++, to++) {
+		if ((off = INT_GET(*from, ARCH_CONVERT)) != NULLDATAOFF)
+			n++;
+		INT_SET(*to, ARCH_CONVERT, off);
+	}
+	INT_SET(free->hdr.nused, ARCH_CONVERT, n);
+	INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Log everything.
+	 */
+	xfs_dir2_leaf_log_header(tp, lbp);
+	xfs_dir2_free_log_header(tp, fbp);
+	xfs_dir2_free_log_bests(tp, fbp, 0, INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1);
+	xfs_da_buf_done(fbp);
+	xfs_dir2_leafn_check(dp, lbp);
+	return 0;
+}
+
+/*
+ * Add a leaf entry to a leaf block in a node-form directory.
+ * The other work necessary is done from the caller.
+ */
+static int					/* error */
+xfs_dir2_leafn_add(
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			index)		/* insertion pt for new entry */
+{
+	int			compact;	/* compacting stale leaves */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			highstale;	/* next stale entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			lfloghigh;	/* high leaf entry logging */
+	int			lfloglow;	/* low leaf entry logging */
+	int			lowstale;	/* previous stale entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_sb("leafn_add", args, index, bp);
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	leaf = bp->data;
+	/*
+	 * If the block already holds the maximum number of leaf
+	 * entries and none of them are stale, the new entry won't fit
+	 * and the caller will do a split.  If some are stale, we'll
+	 * compact the block instead.
+	 */
+	if (INT_GET(leaf->hdr.count, ARCH_CONVERT) == XFS_DIR2_MAX_LEAF_ENTS(mp)) {
+		if (INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT))
+			return XFS_ERROR(ENOSPC);
+		compact = INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1;
+	} else
+		compact = 0;
+	ASSERT(index == 0 || INT_GET(leaf->ents[index - 1].hashval, ARCH_CONVERT) <= args->hashval);
+	ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
+	       INT_GET(leaf->ents[index].hashval, ARCH_CONVERT) >= args->hashval);
+
+	if (args->justcheck)
+		return 0;
+
+	/*
+	 * Compact out all but one stale leaf entry.  Leaves behind
+	 * the entry closest to index.
+	 */
+	if (compact) {
+		xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale,
+			&lfloglow, &lfloghigh);
+	}
+	/*
+	 * Set impossible logging indices for this case.
+	 */
+	else if (!INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT)) {
+		lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		lfloghigh = -1;
+	}
+	/*
+	 * No stale entries, just insert a space for the new entry.
+	 */
+	if (INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT)) {
+		lep = &leaf->ents[index];
+		if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+			ovbcopy(lep, lep + 1,
+				(INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep));
+		lfloglow = index;
+		lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1);
+	}
+	/*
+	 * There are stale entries.  We'll use one for the new entry.
+	 */
+	else {
+		/*
+		 * If we didn't do a compact then we need to figure out
+		 * which stale entry will be used.
+		 */
+		if (compact == 0) {
+			/*
+			 * Find first stale entry before our insertion point.
+			 */
+			for (lowstale = index - 1;
+			     lowstale >= 0 &&
+				INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) !=
+				XFS_DIR2_NULL_DATAPTR;
+			     lowstale--)
+				continue;
+			/*
+			 * Find next stale entry after insertion point.
+			 * Stop looking if the answer would be worse than
+			 * lowstale already found.
+			 */
+			for (highstale = index;
+			     highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
+				INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) !=
+				XFS_DIR2_NULL_DATAPTR &&
+				(lowstale < 0 ||
+				 index - lowstale - 1 >= highstale - index);
+			     highstale++)
+				continue;
+		}
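+		/*
+		 * Example: with index 5 and stale entries at slots 2
+		 * and 6, the low slot is two entries away (index -
+		 * lowstale - 1 == 2) and the high slot one away
+		 * (highstale - index == 1), so the high stale slot
+		 * is the one reused.
+		 */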
+		/*
+		 * Using the low stale entry.
+		 * Shift entries up toward the stale slot.
+		 */
+		if (lowstale >= 0 &&
+		    (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
+		     index - lowstale - 1 < highstale - index)) {
+			ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) ==
+			       XFS_DIR2_NULL_DATAPTR);
+			ASSERT(index - lowstale - 1 >= 0);
+			if (index - lowstale - 1 > 0)
+				ovbcopy(&leaf->ents[lowstale + 1],
+					&leaf->ents[lowstale],
+					(index - lowstale - 1) * sizeof(*lep));
+			lep = &leaf->ents[index - 1];
+			lfloglow = MIN(lowstale, lfloglow);
+			lfloghigh = MAX(index - 1, lfloghigh);
+		}
+		/*
+		 * Using the high stale entry.
+		 * Shift entries down toward the stale slot.
+		 */
+		else {
+			ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) ==
+			       XFS_DIR2_NULL_DATAPTR);
+			ASSERT(highstale - index >= 0);
+			if (highstale - index > 0)
+				ovbcopy(&leaf->ents[index],
+					&leaf->ents[index + 1],
+					(highstale - index) * sizeof(*lep));
+			lep = &leaf->ents[index];
+			lfloglow = MIN(index, lfloglow);
+			lfloghigh = MAX(highstale, lfloghigh);
+		}
+		INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1);
+	}
+	/*
+	 * Insert the new entry, log everything.
+	 */
+	INT_SET(lep->hashval, ARCH_CONVERT, args->hashval);
+	INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, args->blkno, args->index));
+	xfs_dir2_leaf_log_header(tp, bp);
+	xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
+	xfs_dir2_leafn_check(dp, bp);
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check internal consistency of a leafn block.
+ */
+void
+xfs_dir2_leafn_check(
+	xfs_inode_t	*dp,			/* incore directory inode */
+	xfs_dabuf_t	*bp)			/* leaf buffer */
+{
+	int		i;			/* leaf index */
+	xfs_dir2_leaf_t *leaf;			/* leaf structure */
+	xfs_mount_t	*mp;			/* filesystem mount point */
+	int		stale;			/* count of stale leaves */
+
+	leaf = bp->data;
+	mp = dp->i_mount;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp));
+	for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) {
+		if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
+			ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <=
+			       INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT));
+		}
+		if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			stale++;
+	}
+	ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale);
+}
+#endif	/* DEBUG */
+
+/*
+ * Return the last hash value in the leaf.
+ * Stale entries are ok.
+ */
+xfs_dahash_t					/* hash value */
+xfs_dir2_leafn_lasthash(
+	xfs_dabuf_t	*bp,			/* leaf buffer */
+	int		*count)			/* count of entries in leaf */
+{
+	xfs_dir2_leaf_t *leaf;			/* leaf structure */
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	if (count)
+		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	if (INT_ISZERO(leaf->hdr.count, ARCH_CONVERT))
+		return 0;
+	return INT_GET(leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * If this is an addname then the extrablk in state is a freespace block,
+ * otherwise it's a data block.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			*indexp,	/* out: leaf entry index */
+	xfs_da_state_t		*state)		/* state to fill in */
+{
+	xfs_dabuf_t		*curbp;		/* current data/free buffer */
+	xfs_dir2_db_t		curdb;		/* current data block number */
+	xfs_dir2_db_t		curfdb;		/* current free block number */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	int			fi;		/* free entry index */
+	xfs_dir2_free_t		*free=NULL;	/* free block structure */
+	int			index;		/* leaf entry index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			length=0;	/* length of new data entry */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_db_t		newdb;		/* new data block number */
+	xfs_dir2_db_t		newfdb;		/* new free block number */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+#ifdef __KERNEL__
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) > 0);
+#endif
+	xfs_dir2_leafn_check(dp, bp);
+	/*
+	 * Look up the hash value in the leaf entries.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, bp);
+	/*
+	 * Do we have a buffer coming in?
+	 */
+	if (state->extravalid)
+		curbp = state->extrablk.bp;
+	else
+		curbp = NULL;
+	/*
+	 * For addname, it's a free block buffer, get the block number.
+	 */
+	if (args->addname) {
+		curfdb = curbp ? state->extrablk.blkno : -1;
+		curdb = -1;
+		length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+		if ((free = (curbp ? curbp->data : NULL)))
+			ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	}
+	/*
+	 * For others, it's a data block buffer, get the block number.
+	 */
+	else {
+		curfdb = -1;
+		curdb = curbp ? state->extrablk.blkno : -1;
+	}
+	/*
+	 * Loop over leaf entries with the right hash value.
+	 */
+	for (lep = &leaf->ents[index];
+	     index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
+	     lep++, index++) {
+		/*
+		 * Skip stale leaf entries.
+		 */
+		if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Pull the data block number from the entry.
+		 */
+		newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+		/*
+		 * For addname, we're looking for a place to put the new entry.
+		 * We want to use a data block with an entry of equal
+		 * hash value to ours if there is one with room.
+		 */
+		if (args->addname) {
+			/*
+			 * If this block isn't the data block we already have
+			 * in hand, take a look at it.
+			 */
+			if (newdb != curdb) {
+				curdb = newdb;
+				/*
+				 * Convert the data block to the free block
+				 * holding its freespace information.
+				 */
+				newfdb = XFS_DIR2_DB_TO_FDB(mp, newdb);
+				/*
+				 * If it's not the one we have in hand,
+				 * read it in.
+				 */
+				if (newfdb != curfdb) {
+					/*
+					 * If we had one before, drop it.
+					 */
+					if (curbp)
+						xfs_da_brelse(tp, curbp);
+					/*
+					 * Read the free block.
+					 */
+					if ((error = xfs_da_read_buf(tp, dp,
+							XFS_DIR2_DB_TO_DA(mp,
+								newfdb),
+							-1, &curbp,
+							XFS_DATA_FORK))) {
+						return error;
+					}
+					curfdb = newfdb;
+					free = curbp->data;
+					ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) ==
+					       XFS_DIR2_FREE_MAGIC);
+					ASSERT((INT_GET(free->hdr.firstdb, ARCH_CONVERT) %
+						XFS_DIR2_MAX_FREE_BESTS(mp)) ==
+					       0);
+					ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) <= curdb);
+					ASSERT(curdb <
+					       INT_GET(free->hdr.firstdb, ARCH_CONVERT) +
+					       INT_GET(free->hdr.nvalid, ARCH_CONVERT));
+				}
+				/*
+				 * Get the index for our entry.
+				 */
+				fi = XFS_DIR2_DB_TO_FDINDEX(mp, curdb);
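+				/*
+				 * Each free block covers up to
+				 * XFS_DIR2_MAX_FREE_BESTS(mp) data
+				 * blocks; fi is curdb's slot within
+				 * the free block covering it.
+				 */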
+				/*
+				 * A null entry here means the free
+				 * table has no record of this data
+				 * block: corruption.  Otherwise, if
+				 * the block has room, return it.
+				 */
+				if (INT_GET(free->bests[fi], ARCH_CONVERT) == NULLDATAOFF) {
+					return XFS_ERROR(EFSCORRUPTED);
+				}
+				if (INT_GET(free->bests[fi], ARCH_CONVERT) >= length) {
+					*indexp = index;
+					state->extravalid = 1;
+					state->extrablk.bp = curbp;
+					state->extrablk.blkno = curfdb;
+					state->extrablk.index = fi;
+					state->extrablk.magic =
+						XFS_DIR2_FREE_MAGIC;
+					ASSERT(args->oknoent);
+					return XFS_ERROR(ENOENT);
+				}
+			}
+		}
+		/*
+		 * Not adding a new entry, so we really want to find
+		 * the name given to us.
+		 */
+		else {
+			/*
+			 * If it's a different data block, go get it.
+			 */
+			if (newdb != curdb) {
+				/*
+				 * If we had a block before, drop it.
+				 */
+				if (curbp)
+					xfs_da_brelse(tp, curbp);
+				/*
+				 * Read the data block.
+				 */
+				if ((error =
+				    xfs_da_read_buf(tp, dp,
+					    XFS_DIR2_DB_TO_DA(mp, newdb), -1,
+					    &curbp, XFS_DATA_FORK))) {
+					return error;
+				}
+				xfs_dir2_data_check(dp, curbp);
+				curdb = newdb;
+			}
+			/*
+			 * Point to the data entry.
+			 */
+			dep = (xfs_dir2_data_entry_t *)
+			      ((char *)curbp->data +
+			       XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
+			/*
+			 * Compare the entry, return it if it matches.
+			 */
+			if (dep->namelen == args->namelen &&
+			    dep->name[0] == args->name[0] &&
+			    bcmp(dep->name, args->name, args->namelen) == 0) {
+				args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+				*indexp = index;
+				state->extravalid = 1;
+				state->extrablk.bp = curbp;
+				state->extrablk.blkno = curdb;
+				state->extrablk.index =
+					(int)((char *)dep -
+					      (char *)curbp->data);
+				state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+				return XFS_ERROR(EEXIST);
+			}
+		}
+	}
+	/*
+	 * Didn't find a match.
+	 * If we are holding a buffer, give it back in case our caller
+	 * finds it useful.
+	 */
+	if ((state->extravalid = (curbp != NULL))) {
+		state->extrablk.bp = curbp;
+		state->extrablk.index = -1;
+		/*
+		 * For addname, giving back a free block.
+		 */
+		if (args->addname) {
+			state->extrablk.blkno = curfdb;
+			state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+		}
+		/*
+		 * For other callers, giving back a data block.
+		 */
+		else {
+			state->extrablk.blkno = curdb;
+			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+		}
+	}
+	/*
+	 * Return the final index, that will be the insertion point.
+	 */
+	*indexp = index;
+	ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
+	return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Move count leaf entries from source to destination leaf.
+ * Log entries and headers.  Stale entries are preserved.
+ */
+static void
+xfs_dir2_leafn_moveents(
+	xfs_da_args_t	*args,			/* operation arguments */
+	xfs_dabuf_t	*bp_s,			/* source leaf buffer */
+	int		start_s,		/* source leaf index */
+	xfs_dabuf_t	*bp_d,			/* destination leaf buffer */
+	int		start_d,		/* destination leaf index */
+	int		count)			/* count of leaves to copy */
+{
+	xfs_dir2_leaf_t *leaf_d;		/* destination leaf structure */
+	xfs_dir2_leaf_t *leaf_s;		/* source leaf structure */
+	int		stale;			/* count stale leaves copied */
+	xfs_trans_t	*tp;			/* transaction pointer */
+
+	xfs_dir2_trace_args_bibii("leafn_moveents", args, bp_s, start_s, bp_d,
+		start_d, count);
+	/*
+	 * Silently return if nothing to do.
+	 */
+	if (count == 0) {
+		return;
+	}
+	tp = args->trans;
+	leaf_s = bp_s->data;
+	leaf_d = bp_d->data;
+	/*
+	 * If the destination index is not the end of the current
+	 * destination leaf entries, open up a hole in the destination
+	 * to hold the new entries.
+	 */
+	if (start_d < INT_GET(leaf_d->hdr.count, ARCH_CONVERT)) {
+		ovbcopy(&leaf_d->ents[start_d], &leaf_d->ents[start_d + count],
+			(INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - start_d) *
+			sizeof(xfs_dir2_leaf_entry_t));
+		xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count,
+			count + INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - 1);
+	}
+	/*
+	 * If the source has stale leaves, count the ones in the copy range
+	 * so we can update the header correctly.
+	 */
+	if (!INT_ISZERO(leaf_s->hdr.stale, ARCH_CONVERT)) {
+		int	i;			/* temp leaf index */
+
+		for (i = start_s, stale = 0; i < start_s + count; i++) {
+			if (INT_GET(leaf_s->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+				stale++;
+		}
+	} else
+		stale = 0;
+	/*
+	 * Copy the leaf entries from source to destination.
+	 */
+	bcopy(&leaf_s->ents[start_s], &leaf_d->ents[start_d],
+		count * sizeof(xfs_dir2_leaf_entry_t));
+	xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
+	/*
+	 * If there are source entries after the ones we copied,
+	 * delete the ones we copied by sliding the next ones down.
+	 */
+	if (start_s + count < INT_GET(leaf_s->hdr.count, ARCH_CONVERT)) {
+		ovbcopy(&leaf_s->ents[start_s + count], &leaf_s->ents[start_s],
+			count * sizeof(xfs_dir2_leaf_entry_t));
+		xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
+	}
+	/*
+	 * Update the headers and log them.
+	 */
+	INT_MOD(leaf_s->hdr.count, ARCH_CONVERT, -(count));
+	INT_MOD(leaf_s->hdr.stale, ARCH_CONVERT, -(stale));
+	INT_MOD(leaf_d->hdr.count, ARCH_CONVERT, count);
+	INT_MOD(leaf_d->hdr.stale, ARCH_CONVERT, stale);
+	xfs_dir2_leaf_log_header(tp, bp_s);
+	xfs_dir2_leaf_log_header(tp, bp_d);
+	xfs_dir2_leafn_check(args->dp, bp_s);
+	xfs_dir2_leafn_check(args->dp, bp_d);
+}
+
+/*
+ * Determine the sort order of two leaf blocks.
+ * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
+ */
+int						/* sort order */
+xfs_dir2_leafn_order(
+	xfs_dabuf_t	*leaf1_bp,		/* leaf1 buffer */
+	xfs_dabuf_t	*leaf2_bp)		/* leaf2 buffer */
+{
+	xfs_dir2_leaf_t *leaf1;			/* leaf1 structure */
+	xfs_dir2_leaf_t *leaf2;			/* leaf2 structure */
+
+	leaf1 = leaf1_bp->data;
+	leaf2 = leaf2_bp->data;
+	ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0 &&
+	    INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0 &&
+	    (INT_GET(leaf2->ents[0].hashval, ARCH_CONVERT) < INT_GET(leaf1->ents[0].hashval, ARCH_CONVERT) ||
+	     INT_GET(leaf2->ents[INT_GET(leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT) <
+	     INT_GET(leaf1->ents[INT_GET(leaf1->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)))
+		return 1;
+	return 0;
+}
+
+/*
+ * Rebalance leaf entries between two leaf blocks.
+ * This is actually only called when the second block is new,
+ * though the code deals with the general case.
+ * A new entry will be inserted in one of the blocks, and that
+ * entry is taken into account when balancing.
+ */
+static void
+xfs_dir2_leafn_rebalance(
+	xfs_da_state_t		*state,		/* btree cursor */
+	xfs_da_state_blk_t	*blk1,		/* first btree block */
+	xfs_da_state_blk_t	*blk2)		/* second btree block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	int			count;		/* count (& direction) leaves */
+	int			isleft;		/* new goes in left leaf */
+	xfs_dir2_leaf_t		*leaf1;		/* first leaf structure */
+	xfs_dir2_leaf_t		*leaf2;		/* second leaf structure */
+	int			mid;		/* midpoint leaf index */
+#ifdef DEBUG
+	int			oldstale;	/* old count of stale leaves */
+#endif
+	int			oldsum;		/* old total leaf count */
+	int			swap;		/* swapped leaf blocks */
+
+	args = state->args;
+	/*
+	 * If the block order is wrong, swap the arguments.
+	 */
+	if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) {
+		xfs_da_state_blk_t	*tmp;	/* temp for block swap */
+
+		tmp = blk1;
+		blk1 = blk2;
+		blk2 = tmp;
+	}
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	oldsum = INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT);
+#ifdef DEBUG
+	oldstale = INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT);
+#endif
+	mid = oldsum >> 1;
+	/*
+	 * If the old leaf count was odd then the new one will be even,
+	 * so we need to divide the new count evenly.
+	 */
+	if (oldsum & 1) {
+		xfs_dahash_t	midhash;	/* middle entry hash value */
+
+		if (mid >= INT_GET(leaf1->hdr.count, ARCH_CONVERT))
+			midhash = INT_GET(leaf2->ents[mid - INT_GET(leaf1->hdr.count, ARCH_CONVERT)].hashval, ARCH_CONVERT);
+		else
+			midhash = INT_GET(leaf1->ents[mid].hashval, ARCH_CONVERT);
+		isleft = args->hashval <= midhash;
+	}
+	/*
+	 * If the old count is even then the new count is odd, so there's
+	 * no preferred side for the new entry.
+	 * Pick the left one.
+	 */
+	else
+		isleft = 1;
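+	/*
+	 * Example: leaf1 holds 4 entries and leaf2 holds 3, so oldsum
+	 * is 7 and mid is 3.  The new entry goes left iff its hashval
+	 * is <= the hashval of entry 3 (counting across both leaves),
+	 * and count below then shifts leaf1's surplus into leaf2.
+	 */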
+	/*
+	 * Calculate moved entry count.	 Positive means left-to-right,
+	 * negative means right-to-left.  Then move the entries.
+	 */
+	count = INT_GET(leaf1->hdr.count, ARCH_CONVERT) - mid + (isleft == 0);
+	if (count > 0)
+		xfs_dir2_leafn_moveents(args, blk1->bp,
+			INT_GET(leaf1->hdr.count, ARCH_CONVERT) - count, blk2->bp, 0, count);
+	else if (count < 0)
+		xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp,
+			INT_GET(leaf1->hdr.count, ARCH_CONVERT), count);
+	ASSERT(INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT) == oldsum);
+	ASSERT(INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT) == oldstale);
+	/*
+	 * Mark whether we're inserting into the old or new leaf.
+	 */
+	if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) < INT_GET(leaf2->hdr.count, ARCH_CONVERT))
+		state->inleaf = swap;
+	else if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > INT_GET(leaf2->hdr.count, ARCH_CONVERT))
+		state->inleaf = !swap;
+	else
+		state->inleaf =
+			swap ^ (args->hashval < INT_GET(leaf2->ents[0].hashval, ARCH_CONVERT));
+	/*
+	 * Adjust the expected index for insertion.
+	 */
+	if (!state->inleaf)
+		blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+}
+
+/*
+ * Remove an entry from a node directory.
+ * This removes the leaf entry and the data entry,
+ * and updates the free block if necessary.
+ */
+static int					/* error */
+xfs_dir2_leafn_remove(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	int			index,		/* leaf entry index */
+	xfs_da_state_blk_t	*dblk,		/* data block */
+	int			*rval)		/* resulting block needs join */
+{
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dir2_db_t		db;		/* data block number */
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			longest;	/* longest data free entry */
+	int			off;		/* data block entry offset */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_sb("leafn_remove", args, index, bp);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Point to the entry we're removing.
+	 */
+	lep = &leaf->ents[index];
+	/*
+	 * Extract the data block and offset from the entry.
+	 */
+	db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+	ASSERT(dblk->blkno == db);
+	off = XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT));
+	ASSERT(dblk->index == off);
+	/*
+	 * Kill the leaf entry by marking it stale.
+	 * Log the leaf block changes.
+	 */
+	INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1);
+	xfs_dir2_leaf_log_header(tp, bp);
+	INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_leaf_log_ents(tp, bp, index, index);
+	/*
+	 * Make the data entry free.  Keep track of the longest freespace
+	 * in the data block in case it changes.
+	 */
+	dbp = dblk->bp;
+	data = dbp->data;
+	dep = (xfs_dir2_data_entry_t *)((char *)data + off);
+	longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
+	needlog = needscan = 0;
+	xfs_dir2_data_make_free(tp, dbp, off,
+		XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
+	/*
+	 * Rescan the data block freespaces for bestfree.
+	 * Log the data block header if needed.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	xfs_dir2_data_check(dp, dbp);
+	/*
+	 * If the longest data block freespace changes, need to update
+	 * the corresponding freeblock entry.
+	 */
+	if (longest < INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
+		int		error;		/* error return value */
+		xfs_dabuf_t	*fbp;		/* freeblock buffer */
+		xfs_dir2_db_t	fdb;		/* freeblock block number */
+		int		findex;		/* index in freeblock entries */
+		xfs_dir2_free_t *free;		/* freeblock structure */
+		int		logfree;	/* need to log free entry */
+
+		/*
+		 * Convert the data block number to a free block,
+		 * read in the free block.
+		 */
+		fdb = XFS_DIR2_DB_TO_FDB(mp, db);
+		if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb),
+				-1, &fbp, XFS_DATA_FORK))) {
+			return error;
+		}
+		free = fbp->data;
+		ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+		ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) ==
+		       XFS_DIR2_MAX_FREE_BESTS(mp) *
+		       (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
+		/*
+		 * Calculate which entry we need to fix.
+		 */
+		findex = XFS_DIR2_DB_TO_FDINDEX(mp, db);
+		longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
+		/*
+		 * If the data block is now empty we can get rid of it
+		 * (usually).
+		 */
+		if (longest == mp->m_dirblksize - (uint)sizeof(data->hdr)) {
+			/*
+			 * Try to punch out the data block.
+			 */
+			error = xfs_dir2_shrink_inode(args, db, dbp);
+			if (error == 0) {
+				dblk->bp = NULL;
+				data = NULL;
+			}
+			/*
+			 * We can get ENOSPC if there's no space reservation.
+			 * In this case just drop the buffer and someone else
+			 * will eventually get rid of the empty block.
+			 */
+			else if (error == ENOSPC && args->total == 0)
+				xfs_da_buf_done(dbp);
+			else
+				return error;
+		}
+		/*
+		 * If we got rid of the data block, we can eliminate that entry
+		 * in the free block.
+		 */
+		if (data == NULL) {
+			/*
+			 * One less used entry in the free table.
+			 */
+			INT_MOD(free->hdr.nused, ARCH_CONVERT, -1);
+			xfs_dir2_free_log_header(tp, fbp);
+			/*
+			 * If this was the last entry in the table, we can
+			 * trim the table size back.  There might be other
+			 * entries at the end referring to non-existent
+			 * data blocks, get those too.
+			 */
+			if (findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1) {
+				int	i;		/* free entry index */
+
+				for (i = findex - 1;
+				     i >= 0 && INT_GET(free->bests[i], ARCH_CONVERT) == NULLDATAOFF;
+				     i--)
+					continue;
+				INT_SET(free->hdr.nvalid, ARCH_CONVERT, i + 1);
+				logfree = 0;
+			}
+			/*
+			 * Not the last entry, just punch it out.
+			 */
+			else {
+				INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF);
+				logfree = 1;
+			}
+			/*
+			 * If there are no useful entries left in the block,
+			 * get rid of the block if we can.
+			 */
+			if (INT_ISZERO(free->hdr.nused, ARCH_CONVERT)) {
+				error = xfs_dir2_shrink_inode(args, fdb, fbp);
+				if (error == 0) {
+					fbp = NULL;
+					logfree = 0;
+				} else if (error != ENOSPC || args->total != 0)
+					return error;
+				/*
+				 * It's possible to get ENOSPC if there is no
+				 * space reservation.  In this case someone
+				 * else will eventually get rid of this block.
+				 */
+			}
+		}
+		/*
+		 * Data block is not empty, just set the free entry to
+		 * the new value.
+		 */
+		else {
+			INT_SET(free->bests[findex], ARCH_CONVERT, longest);
+			logfree = 1;
+		}
+		/*
+		 * Log the free entry that changed, unless we got rid of it.
+		 */
+		if (logfree)
+			xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+		/*
+		 * Drop the buffer if we still have it.
+		 */
+		if (fbp)
+			xfs_da_buf_done(fbp);
+	}
+	xfs_dir2_leafn_check(dp, bp);
+	/*
+	 * Return an indication of whether this leaf block is empty enough
+	 * to justify trying to join it with a neighbor.
+	 */
+	*rval =
+		((uint)sizeof(leaf->hdr) +
+		 (uint)sizeof(leaf->ents[0]) *
+		 (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT))) <
+		mp->m_dir_magicpct;
+	return 0;
+}
+
+/*
+ * Split the leaf entries in the old block into old and new blocks.
+ */
+int						/* error */
+xfs_dir2_leafn_split(
+	xfs_da_state_t		*state,		/* btree cursor */
+	xfs_da_state_blk_t	*oldblk,	/* original block */
+	xfs_da_state_blk_t	*newblk)	/* newly created block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_dablk_t		blkno;		/* new leaf block number */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	args = state->args;
+	mp = args->dp->i_mount;
+	ASSERT(args != NULL);
+	ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Initialize the new leaf block.
+	 */
+	error = xfs_dir2_leaf_init(args, XFS_DIR2_DA_TO_DB(mp, blkno),
+		&newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+	if (error) {
+		return error;
+	}
+	newblk->blkno = blkno;
+	newblk->magic = XFS_DIR2_LEAFN_MAGIC;
+	/*
+	 * Rebalance the entries across the two leaves, link the new
+	 * block into the leaves.
+	 */
+	xfs_dir2_leafn_rebalance(state, oldblk, newblk);
+	error = xfs_da_blk_link(state, oldblk, newblk);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Insert the new entry in the correct block.
+	 */
+	if (state->inleaf)
+		error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
+	else
+		error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL);
+	newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL);
+	xfs_dir2_leafn_check(args->dp, oldblk->bp);
+	xfs_dir2_leafn_check(args->dp, newblk->bp);
+	return error;
+}
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+int						/* error */
+xfs_dir2_leafn_toosmall(
+	xfs_da_state_t		*state,		/* btree cursor */
+	int			*action)	/* resulting action to take */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block */
+	xfs_dablk_t		blkno;		/* leaf block number */
+	xfs_dabuf_t		*bp;		/* leaf buffer */
+	int			bytes;		/* bytes in use */
+	int			count;		/* leaf live entry count */
+	int			error;		/* error return value */
+	int			forward;	/* sibling block direction */
+	int			i;		/* sibling counter */
+	xfs_da_blkinfo_t	*info;		/* leaf block header */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			rval;		/* result from path_shift */
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[state->path.active - 1];
+	info = blk->bp->data;
+	ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	leaf = (xfs_dir2_leaf_t *)info;
+	count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
+	bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]);
+	if (bytes > (state->blocksize >> 1)) {
+		/*
+		 * Blk over 50%, don't try to join.
+		 */
+		*action = 0;
+		return 0;
+	}
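+	/*
+	 * E.g. with a 4k block, a join is only considered once the
+	 * live entries (header included) occupy 2048 bytes or less.
+	 */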
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = !INT_ISZERO(info->forw, ARCH_CONVERT);
+		bcopy(&state->path, &state->altpath, sizeof(state->path));
+		error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+			&rval);
+		if (error)
+			return error;
+		*action = rval ? 2 : 0;
+		return 0;
+	}
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink a directory over time.
+	 */
+	forward = INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT);
+	for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
+		blkno = forward ? INT_GET(info->forw, ARCH_CONVERT) : INT_GET(info->back, ARCH_CONVERT);
+		if (blkno == 0)
+			continue;
+		/*
+		 * Read the sibling leaf block.
+		 */
+		if ((error =
+		    xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
+			    -1, &bp, XFS_DATA_FORK))) {
+			return error;
+		}
+		ASSERT(bp != NULL);
+		/*
+		 * Count bytes in the two blocks combined.
+		 */
+		leaf = (xfs_dir2_leaf_t *)info;
+		count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
+		bytes = state->blocksize - (state->blocksize >> 2);
+		leaf = bp->data;
+		ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+		count += INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
+		bytes -= count * (uint)sizeof(leaf->ents[0]);
+		/*
+		 * Fits with at least 25% to spare.
+		 */
+		if (bytes >= 0)
+			break;
+		xfs_da_brelse(state->args->trans, bp);
+	}
+	/*
+	 * Didn't like either block, give up.
+	 */
+	if (i >= 2) {
+		*action = 0;
+		return 0;
+	}
+	/*
+	 * Done with the sibling leaf block here, drop the dabuf
+	 * so path_shift can get it.
+	 */
+	xfs_da_buf_done(bp);
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	bcopy(&state->path, &state->altpath, sizeof(state->path));
+	if (blkno < blk->blkno)
+		error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+			&rval);
+	else
+		error = xfs_da_path_shift(state, &state->path, forward, 0,
+			&rval);
+	if (error) {
+		return error;
+	}
+	*action = rval ? 0 : 1;
+	return 0;
+}
+
+/*
+ * Move all the leaf entries from drop_blk to save_blk.
+ * This is done as part of a join operation.
+ */
+void
+xfs_dir2_leafn_unbalance(
+	xfs_da_state_t		*state,		/* cursor */
+	xfs_da_state_blk_t	*drop_blk,	/* dead block */
+	xfs_da_state_blk_t	*save_blk)	/* surviving block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_dir2_leaf_t		*drop_leaf;	/* dead leaf structure */
+	xfs_dir2_leaf_t		*save_leaf;	/* surviving leaf structure */
+
+	args = state->args;
+	ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	drop_leaf = drop_blk->bp->data;
+	save_leaf = save_blk->bp->data;
+	ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * If there are any stale leaf entries, take this opportunity
+	 * to purge them.
+	 */
+	if (INT_GET(drop_leaf->hdr.stale, ARCH_CONVERT))
+		xfs_dir2_leaf_compact(args, drop_blk->bp);
+	if (INT_GET(save_leaf->hdr.stale, ARCH_CONVERT))
+		xfs_dir2_leaf_compact(args, save_blk->bp);
+	/*
+	 * Move the entries from drop to the appropriate end of save.
+	 */
+	drop_blk->hashval = INT_GET(drop_leaf->ents[INT_GET(drop_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp))
+		xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0,
+			INT_GET(drop_leaf->hdr.count, ARCH_CONVERT));
+	else
+		xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp,
+			INT_GET(save_leaf->hdr.count, ARCH_CONVERT), INT_GET(drop_leaf->hdr.count, ARCH_CONVERT));
+	save_blk->hashval = INT_GET(save_leaf->ents[INT_GET(save_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	xfs_dir2_leafn_check(args->dp, save_blk->bp);
+}
+
+/*
+ * Top-level node form directory addname routine.
+ */
+int						/* error */
+xfs_dir2_node_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block for insert */
+	int			error;		/* error return value */
+	int			rval;		/* sub-return value */
+	xfs_da_state_t		*state;		/* btree cursor */
+
+	xfs_dir2_trace_args("node_addname", args);
+	/*
+	 * Allocate and initialize the state (btree cursor).
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_dirblksize;
+	/*
+	 * Look up the name.  We're not supposed to find it, but
+	 * this gives us the insertion point.
+	 */
+	error = xfs_da_node_lookup_int(state, &rval);
+	if (error)
+		rval = error;
+	if (rval != ENOENT) {
+		goto done;
+	}
+	/*
+	 * Add the data entry to a data block.
+	 * Extravalid is set to a freeblock found by lookup.
+	 */
+	rval = xfs_dir2_node_addname_int(args,
+		state->extravalid ? &state->extrablk : NULL);
+	if (rval) {
+		goto done;
+	}
+	blk = &state->path.blk[state->path.active - 1];
+	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Add the new leaf entry.
+	 */
+	rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
+	if (rval == 0) {
+		/*
+		 * It worked, fix the hash values up the btree.
+		 */
+		if (!args->justcheck)
+			xfs_da_fixhashpath(state, &state->path);
+	} else {
+		/*
+		 * It didn't work, we need to split the leaf block.
+		 */
+		if (args->total == 0) {
+			ASSERT(rval == ENOSPC);
+			goto done;
+		}
+		/*
+		 * Split the leaf block and insert the new entry.
+		 */
+		rval = xfs_da_split(state);
+	}
+done:
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int					/* error */
+xfs_dir2_node_addname_int(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_da_state_blk_t	*fblk)		/* optional freespace block */
+{
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dir2_db_t		dbno;		/* data block number */
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* data unused entry pointer */
+	int			error;		/* error return value */
+	xfs_dir2_db_t		fbno;		/* freespace block number */
+	xfs_dabuf_t		*fbp;		/* freespace buffer */
+	int			findex;		/* freespace entry index */
+	xfs_dir2_db_t		foundbno=0;	/* found freespace block no */
+	int			foundindex=0;	/* found freespace entry idx */
+	xfs_dir2_free_t		*free=NULL;	/* freespace block structure */
+	xfs_dir2_db_t		ifbno;		/* initial freespace block no */
+	xfs_dir2_db_t		lastfbno=0;	/* highest freespace block no */
+	int			length;		/* length of the new entry */
+	int			logfree;	/* need to log free entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	xfs_dir2_data_off_t	*tagp;		/* data entry tag pointer */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
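+	/*
+	 * For example, with the v2 data entry layout (8-byte inumber,
+	 * 1-byte namelen, the name, and a 2-byte tag, rounded up to
+	 * 8-byte alignment), a 3-character name gives 8 + 1 + 3 + 2 = 14,
+	 * so length is 16 here.
+	 */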
+	/*
+	 * If we came in with a freespace block, that means lookup found
+	 * an entry with our hash value.  This is the freespace block
+	 * for that data entry.
+	 */
+	if (fblk) {
+		fbp = fblk->bp;
+		/*
+		 * Remember initial freespace block number.
+		 */
+		ifbno = fblk->blkno;
+		free = fbp->data;
+		ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+		findex = fblk->index;
+		/*
+		 * This means the free entry showed that the data block had
+		 * space for our entry, so we remembered it.
+		 * Use that data block.
+		 */
+		if (findex >= 0) {
+			ASSERT(findex < INT_GET(free->hdr.nvalid, ARCH_CONVERT));
+			ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF);
+			ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) >= length);
+			dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex;
+		}
+		/*
+		 * The data block looked at didn't have enough room.
+		 * We'll start at the beginning of the freespace entries.
+		 */
+		else {
+			dbno = -1;
+			findex = 0;
+		}
+	}
+	/*
+	 * Didn't come in with a freespace block, so don't have a data block.
+	 */
+	else {
+		ifbno = dbno = -1;
+		fbp = NULL;
+		findex = 0;
+	}
+	/*
+	 * If we don't have a data block yet, we're going to scan the
+	 * freespace blocks looking for one.  Figure out what the
+	 * highest freespace block number is.
+	 */
+	if (dbno == -1) {
+		xfs_fileoff_t	fo;		/* freespace block number */
+
+		if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK)))
+			return error;
+		lastfbno = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo);
+		fbno = ifbno;
+		foundindex = -1;
+	}
+	/*
+	 * While we haven't identified a data block, search the freeblock
+	 * data for a good data block.	If we find a null freeblock entry,
+	 * indicating a hole in the data blocks, remember that.
+	 */
+	while (dbno == -1) {
+		/*
+		 * If we don't have a freeblock in hand, get the next one.
+		 */
+		if (fbp == NULL) {
+			/*
+			 * Happens the first time through unless lookup gave
+			 * us a freespace block to start with.
+			 */
+			if (++fbno == 0)
+				fbno = XFS_DIR2_FREE_FIRSTDB(mp);
+			/*
+			 * If it's ifbno we already looked at it.
+			 */
+			if (fbno == ifbno)
+				fbno++;
+			/*
+			 * If it's off the end we're done.
+			 */
+			if (fbno >= lastfbno)
+				break;
+			/*
+			 * Read the block.  There can be holes in the
+			 * freespace blocks, so this might not succeed.
+			 * Holes should be really rare, so there's no
+			 * reason to go out of our way to avoid the read.
+			 */
+			if ((error = xfs_da_read_buf(tp, dp,
+					XFS_DIR2_DB_TO_DA(mp, fbno), -1, &fbp,
+					XFS_DATA_FORK))) {
+				return error;
+			}
+			if (fbp == NULL) {
+				continue;
+			}
+			free = fbp->data;
+			ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+			findex = 0;
+		}
+		/*
+		 * Look at the current free entry.  Is it good enough?
+		 */
+		if (INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF &&
+		    INT_GET(free->bests[findex], ARCH_CONVERT) >= length)
+			dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex;
+		else {
+			/*
+			 * If we haven't found an empty entry yet, and this
+			 * one is empty, remember this slot.
+			 */
+			if (foundindex == -1 &&
+			    INT_GET(free->bests[findex], ARCH_CONVERT) == NULLDATAOFF) {
+				foundindex = findex;
+				foundbno = fbno;
+			}
+			/*
+			 * Are we done with the freeblock?
+			 */
+			if (++findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT)) {
+				/*
+				 * If there is space left in this freeblock,
+				 * and we don't have an empty entry yet,
+				 * remember this slot.
+				 */
+				if (foundindex == -1 &&
+				    findex < XFS_DIR2_MAX_FREE_BESTS(mp)) {
+					foundindex = findex;
+					foundbno = fbno;
+				}
+				/*
+				 * Drop the block.
+				 */
+				xfs_da_brelse(tp, fbp);
+				fbp = NULL;
+				if (fblk && fblk->bp)
+					fblk->bp = NULL;
+			}
+		}
+	}
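+	/*
+	 * On loop exit, either dbno names a data block with enough room,
+	 * or foundindex/foundbno remember an empty freespace slot we can
+	 * use, or neither (foundindex == -1) and a new freespace block
+	 * must be added below.
+	 */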
+	/*
+	 * If we don't have a data block, and there's no free slot in a
+	 * freeblock, we need to add a new freeblock.
+	 */
+	if (dbno == -1 && foundindex == -1) {
+		/*
+		 * Not allowed to allocate, so return failure.
+		 */
+		if (args->justcheck || args->total == 0) {
+			return XFS_ERROR(ENOSPC);
+		}
+		/*
+		 * Add the new freeblock.
+		 */
+		if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
+				&fbno))) {
+			return error;
+		}
+		/*
+		 * Get a buffer for the new block.
+		 */
+		if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fbno),
+				-1, &fbp, XFS_DATA_FORK))) {
+			return error;
+		}
+		ASSERT(fbp != NULL);
+		/*
+		 * Initialize the new block to be empty, and remember
+		 * its first slot as our empty slot.
+		 */
+		free = fbp->data;
+		INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC);
+		INT_SET(free->hdr.firstdb, ARCH_CONVERT, (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
+			XFS_DIR2_MAX_FREE_BESTS(mp));
+		INT_ZERO(free->hdr.nused, ARCH_CONVERT);
+		INT_ZERO(free->hdr.nvalid, ARCH_CONVERT);
+		foundindex = 0;
+		foundbno = fbno;
+	}
+	/*
+	 * If we don't have a data block, and we don't have a freeblock buffer
+	 * in hand (we dropped the one with the free slot in it),
+	 * go read the freeblock again.
+	 */
+	if (dbno == -1 && fbp == NULL) {
+		/*
+		 * We're going to use the empty slot we found before.
+		 */
+		findex = foundindex;
+		fbno = foundbno;
+		if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fbno),
+				-1, &fbp, XFS_DATA_FORK))) {
+			return error;
+		}
+		ASSERT(fbp != NULL);
+		free = fbp->data;
+		ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	}
+	/*
+	 * If we don't have a data block, we need to allocate one and make
+	 * the freespace entries refer to it.
+	 */
+	if (dbno == -1) {
+		/*
+		 * Not allowed to allocate, return failure.
+		 */
+		if (args->justcheck || args->total == 0) {
+			/*
+			 * Drop the freespace buffer unless it came from our
+			 * caller.
+			 */
+			if (fblk == NULL || fblk->bp == NULL)
+				xfs_da_buf_done(fbp);
+			return XFS_ERROR(ENOSPC);
+		}
+		/*
+		 * Allocate and initialize the new data block.
+		 */
+		if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
+				&dbno)) ||
+		    (error = xfs_dir2_data_init(args, dbno, &dbp))) {
+			/*
+			 * Drop the freespace buffer unless it came from our
+			 * caller.
+			 */
+			if (fblk == NULL || fblk->bp == NULL)
+				xfs_da_buf_done(fbp);
+			return error;
+		}
+		/*
+		 * If the freespace entry for this data block is not in the
+		 * freespace block we have in hand, drop the one we have
+		 * and get the right one.
+		 */
+		if (XFS_DIR2_DB_TO_FDB(mp, dbno) != fbno) {
+			xfs_da_brelse(tp, fbp);
+			if (fblk && fblk->bp)
+				fblk->bp = NULL;
+			fbno = XFS_DIR2_DB_TO_FDB(mp, dbno);
+			if ((error = xfs_da_read_buf(tp, dp,
+					XFS_DIR2_DB_TO_DA(mp, fbno), -1, &fbp,
+					XFS_DATA_FORK))) {
+				xfs_da_buf_done(dbp);
+				return error;
+			}
+			ASSERT(fbp != NULL);
+			free = fbp->data;
+			ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+		}
+		/*
+		 * Set the freespace block index from the data block number.
+		 */
+		findex = XFS_DIR2_DB_TO_FDINDEX(mp, dbno);
+		/*
+		 * If it's after the end of the current entries in the
+		 * freespace block, extend that table.
+		 */
+		if (findex >= INT_GET(free->hdr.nvalid, ARCH_CONVERT)) {
+			ASSERT(findex < XFS_DIR2_MAX_FREE_BESTS(mp));
+			INT_SET(free->hdr.nvalid, ARCH_CONVERT, findex + 1);
+			/*
+			 * Tag new entry so nused will go up.
+			 */
+			INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF);
+		}
+		/*
+		 * If this entry was for an empty data block
+		 * (this should always be true) then update the header.
+		 */
+		if (INT_GET(free->bests[findex], ARCH_CONVERT) == NULLDATAOFF) {
+			INT_MOD(free->hdr.nused, ARCH_CONVERT, +1);
+			xfs_dir2_free_log_header(tp, fbp);
+		}
+		/*
+		 * Update the real value in the table.
+		 * We haven't allocated the data entry yet so this will
+		 * change again.
+		 */
+		data = dbp->data;
+		INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT);
+		logfree = 1;
+	}
+	/*
+	 * We had a data block so we don't have to make a new one.
+	 */
+	else {
+		/*
+		 * If just checking, we succeeded.
+		 */
+		if (args->justcheck) {
+			if (fblk == NULL || fblk->bp == NULL)
+				xfs_da_buf_done(fbp);
+			return 0;
+		}
+		/*
+		 * Read the data block in.
+		 */
+		if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, dbno),
+				-1, &dbp, XFS_DATA_FORK))) {
+			if (fblk == NULL || fblk->bp == NULL)
+				xfs_da_buf_done(fbp);
+			return error;
+		}
+		data = dbp->data;
+		logfree = 0;
+	}
+	ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) >= length);
+	/*
+	 * Point to the existing unused space.
+	 */
+	dup = (xfs_dir2_data_unused_t *)
+	      ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT));
+	needscan = needlog = 0;
+	/*
+	 * Mark the first part of the unused space as in use for our entry.
+	 */
+	xfs_dir2_data_use_free(tp, dbp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
+		&needlog, &needscan);
+	/*
+	 * Fill in the new entry and log it.
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->namelen = args->namelen;
+	bcopy(args->name, dep->name, dep->namelen);
+	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+	INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data));
+	xfs_dir2_data_log_entry(tp, dbp, dep);
+	/*
+	 * Rescan the block for bestfree if needed.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+	/*
+	 * Log the data block header if needed.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	/*
+	 * If the freespace entry is now wrong, update it.
+	 */
+	if (INT_GET(free->bests[findex], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
+		INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT);
+		logfree = 1;
+	}
+	/*
+	 * Log the freespace entry if needed.
+	 */
+	if (logfree)
+		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+	/*
+	 * If the caller didn't hand us the freespace block, drop it.
+	 */
+	if (fblk == NULL || fblk->bp == NULL)
+		xfs_da_buf_done(fbp);
+	/*
+	 * Return the data block and offset in args, then drop the data block.
+	 */
+	args->blkno = (xfs_dablk_t)dbno;
+	args->index = INT_GET(*tagp, ARCH_CONVERT);
+	xfs_da_buf_done(dbp);
+	return 0;
+}
+
+/*
+ * Lookup an entry in a node-format directory.
+ * All the real work happens in xfs_da_node_lookup_int.
+ * The only real output is the inode number of the entry.
+ */
+int						/* error */
+xfs_dir2_node_lookup(
+	xfs_da_args_t	*args)			/* operation arguments */
+{
+	int		error;			/* error return value */
+	int		i;			/* btree level */
+	int		rval;			/* operation return value */
+	xfs_da_state_t	*state;			/* btree cursor */
+
+	xfs_dir2_trace_args("node_lookup", args);
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_dirblksize;
+	/*
+	 * Fill in the path to the entry in the cursor.
+	 */
+	error = xfs_da_node_lookup_int(state, &rval);
+	if (error)
+		rval = error;
+	/*
+	 * Release the btree blocks and leaf block.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+	/*
+	 * Release the data block if we have it.
+	 */
+	if (state->extravalid && state->extrablk.bp) {
+		xfs_da_brelse(args->trans, state->extrablk.bp);
+		state->extrablk.bp = NULL;
+	}
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Remove an entry from a node-format directory.
+ */
+int						/* error */
+xfs_dir2_node_removename(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block */
+	int			error;		/* error return value */
+	int			rval;		/* operation return value */
+	xfs_da_state_t		*state;		/* btree cursor */
+
+	xfs_dir2_trace_args("node_removename", args);
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_dirblksize;
+	/*
+	 * Look up the entry we're deleting, set up the cursor.
+	 */
+	error = xfs_da_node_lookup_int(state, &rval);
+	if (error) {
+		rval = error;
+	}
+	/*
+	 * Didn't find it, upper layer screwed up.
+	 */
+	if (rval != EEXIST) {
+		xfs_da_state_free(state);
+		return rval;
+	}
+	blk = &state->path.blk[state->path.active - 1];
+	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(state->extravalid);
+	/*
+	 * Remove the leaf and data entries.
+	 * Extrablk refers to the data block.
+	 */
+	error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
+		&state->extrablk, &rval);
+	if (error) {
+		xfs_da_state_free(state);	/* don't leak the cursor */
+		return error;
+	}
+	/*
+	 * Fix the hash values up the btree.
+	 */
+	xfs_da_fixhashpath(state, &state->path);
+	/*
+	 * If we need to join leaf blocks, do it.
+	 */
+	if (rval && state->path.active > 1)
+		error = xfs_da_join(state);
+	/*
+	 * If no errors so far, try conversion to leaf format.
+	 */
+	if (!error)
+		error = xfs_dir2_node_to_leaf(state);
+	xfs_da_state_free(state);
+	return error;
+}
+
+/*
+ * Replace an entry's inode number in a node-format directory.
+ */
+int						/* error */
+xfs_dir2_node_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block */
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dir2_data_entry_t	*dep;		/* data entry changed */
+	int			error;		/* error return value */
+	int			i;		/* btree level */
+	xfs_ino_t		inum;		/* new inode number */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry being changed */
+	int			rval;		/* internal return value */
+	xfs_da_state_t		*state;		/* btree cursor */
+
+	xfs_dir2_trace_args("node_replace", args);
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_dirblksize;
+	inum = args->inumber;
+	/*
+	 * Lookup the entry to change in the btree.
+	 */
+	error = xfs_da_node_lookup_int(state, &rval);
+	if (error) {
+		rval = error;
+	}
+	/*
+	 * It should be found, since the vnodeops layer has looked it up
+	 * and locked it.  But paranoia is good.
+	 */
+	if (rval == EEXIST) {
+		/*
+		 * Find the leaf entry.
+		 */
+		blk = &state->path.blk[state->path.active - 1];
+		ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+		leaf = blk->bp->data;
+		lep = &leaf->ents[blk->index];
+		ASSERT(state->extravalid);
+		/*
+		 * Point to the data entry.
+		 */
+		data = state->extrablk.bp->data;
+		ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
+		dep = (xfs_dir2_data_entry_t *)
+		      ((char *)data +
+		       XFS_DIR2_DATAPTR_TO_OFF(state->mp, INT_GET(lep->address, ARCH_CONVERT)));
+		ASSERT(inum != INT_GET(dep->inumber, ARCH_CONVERT));
+		/*
+		 * Fill in the new inode number and log the entry.
+		 */
+		INT_SET(dep->inumber, ARCH_CONVERT, inum);
+		xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
+		rval = 0;
+	}
+	/*
+	 * Didn't find it, and we're holding a data block.  Drop it.
+	 */
+	else if (state->extravalid) {
+		xfs_da_brelse(args->trans, state->extrablk.bp);
+		state->extrablk.bp = NULL;
+	}
+	/*
+	 * Release all the buffers in the cursor.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Trim off a trailing empty freespace block.
+ * Return (in rvalp) 1 if we did it, 0 if not.
+ */
+int						/* error */
+xfs_dir2_node_trim_free(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_fileoff_t		fo,		/* free block number */
+	int			*rvalp)		/* out: did something */
+{
+	xfs_dabuf_t		*bp;		/* freespace buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Read the freespace block.
+	 */
+	if ((error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -1, &bp,
+			XFS_DATA_FORK))) {
+		return error;
+	}
+	free = bp->data;
+	ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	/*
+	 * If there are used entries, there's nothing to do.
+	 */
+	if (INT_GET(free->hdr.nused, ARCH_CONVERT) > 0) {
+		xfs_da_brelse(tp, bp);
+		*rvalp = 0;
+		return 0;
+	}
+	/*
+	 * Blow the block away.
+	 */
+	if ((error =
+	    xfs_dir2_shrink_inode(args, XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo),
+		    bp))) {
+		/*
+		 * Can't fail with ENOSPC since that only happens with no
+		 * space reservation, when breaking up an extent into two
+		 * pieces.  This is the last block of an extent.
+		 */
+		ASSERT(error != ENOSPC);
+		xfs_da_brelse(tp, bp);
+		return error;
+	}
+	/*
+	 * Return that we succeeded.
+	 */
+	*rvalp = 1;
+	return 0;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_node.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_node.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_node.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_node.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_NODE_H__
+#define __XFS_DIR2_NODE_H__
+
+/*
+ * Directory version 2, btree node format structures
+ */
+
+struct dirent;
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Constants.
+ */
+
+/*
+ * Offset of the freespace index.
+ */
+#define XFS_DIR2_FREE_SPACE	2
+#define XFS_DIR2_FREE_OFFSET	(XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
+#define XFS_DIR2_FREE_FIRSTDB(mp)	\
+	XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_FREE_OFFSET)
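+/*
+ * A v2 directory's logical file offsets are divided into fixed-size
+ * regions: data blocks first, then leaf blocks, then (region number 2,
+ * above) the freespace index blocks.  XFS_DIR2_FREE_FIRSTDB is thus the
+ * directory block number at which the freespace region begins.
+ */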
+
+#define XFS_DIR2_FREE_MAGIC	0x58443246	/* XD2F */
+
+/*
+ * Structures.
+ */
+typedef struct xfs_dir2_free_hdr {
+	__uint32_t		magic;		/* XFS_DIR2_FREE_MAGIC */
+	__int32_t		firstdb;	/* db of first entry */
+	__int32_t		nvalid;		/* count of valid entries */
+	__int32_t		nused;		/* count of used entries */
+} xfs_dir2_free_hdr_t;
+
+typedef struct xfs_dir2_free {
+	xfs_dir2_free_hdr_t	hdr;		/* block header */
+	xfs_dir2_data_off_t	bests[1];	/* best free counts */
+						/* unused entries are -1 */
+} xfs_dir2_free_t;
+#define XFS_DIR2_MAX_FREE_BESTS(mp)	\
+	(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_free_hdr_t)) / \
+	 (uint)sizeof(xfs_dir2_data_off_t))
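+/*
+ * Worked example: the header above is four 32-bit fields (16 bytes) and
+ * each best is a 16-bit xfs_dir2_data_off_t, so with a 4 KiB directory
+ * block XFS_DIR2_MAX_FREE_BESTS(mp) = (4096 - 16) / 2 = 2040 entries
+ * per freespace block.
+ */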
+
+/*
+ * Macros.
+ */
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_FDB)
+xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db);
+#define XFS_DIR2_DB_TO_FDB(mp,db)	xfs_dir2_db_to_fdb(mp, db)
+#else
+#define XFS_DIR2_DB_TO_FDB(mp,db)	\
+	(XFS_DIR2_FREE_FIRSTDB(mp) + (db) / XFS_DIR2_MAX_FREE_BESTS(mp))
+#endif
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_FDINDEX)
+int
+xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db);
+#define XFS_DIR2_DB_TO_FDINDEX(mp,db)	xfs_dir2_db_to_fdindex(mp, db)
+#else
+#define XFS_DIR2_DB_TO_FDINDEX(mp,db)	((db) % XFS_DIR2_MAX_FREE_BESTS(mp))
+#endif
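+/*
+ * Continuing the example above (2040 bests per freespace block), data
+ * block 5000 is tracked by freespace block
+ * XFS_DIR2_FREE_FIRSTDB(mp) + 5000 / 2040 = firstdb + 2, at index
+ * 5000 % 2040 = 920 in that block's bests array.
+ */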
+
+/*
+ * Functions.
+ */
+
+extern void
+	xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
+				int first, int last);
+
+extern int
+	xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_dabuf *lbp);
+
+extern xfs_dahash_t
+	xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
+
+extern int
+	xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp,
+				  struct xfs_da_args *args, int *indexp,
+				  struct xfs_da_state *state);
+
+extern int
+	xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp,
+			     struct xfs_dabuf *leaf2_bp);
+
+extern int
+	xfs_dir2_leafn_split(struct xfs_da_state *state,
+			     struct xfs_da_state_blk *oldblk,
+			     struct xfs_da_state_blk *newblk);
+
+extern int
+	xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
+
+extern void
+	xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
+				 struct xfs_da_state_blk *drop_blk,
+				 struct xfs_da_state_blk *save_blk);
+
+extern int
+	xfs_dir2_node_addname(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_node_lookup(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_node_removename(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_node_replace(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
+				int *rvalp);
+
+#endif	/* __XFS_DIR2_NODE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_sf.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_sf.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_sf.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_sf.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1320 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_sf.c
+ * Shortform directory implementation for v2 directories.
+ */
+
+#include <xfs.h>
+
+/*
+ * Prototypes for internal functions.
+ */
+static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
+				     xfs_dir2_sf_entry_t *sfep,
+				     xfs_dir2_data_aoff_t offset,
+				     int new_isize);
+static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
+				     int new_isize);
+static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
+				    xfs_dir2_sf_entry_t **sfepp,
+				    xfs_dir2_data_aoff_t *offsetp);
+#ifdef DEBUG
+static void xfs_dir2_sf_check(xfs_da_args_t *args);
+#else
+#define xfs_dir2_sf_check(args)
+#endif /* DEBUG */
+#if XFS_BIG_FILESYSTEMS
+static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
+static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+#endif /* XFS_BIG_FILESYSTEMS */
+
+/*
+ * Given a block directory (dp/block), calculate its size as a shortform (sf)
+ * directory and a header for the sf directory, if it will fit in the
+ * space currently present in the inode.  If it won't fit, the returned
+ * size exceeds the available space, but it is not an accurate total
+ * since the scan gives up as soon as it is too big.
+ */
+int						/* size for sf form */
+xfs_dir2_block_sfsize(
+	xfs_inode_t		*dp,		/* incore inode pointer */
+	xfs_dir2_block_t	*block,		/* block directory data */
+	xfs_dir2_sf_hdr_t	*sfhp)		/* output: header for sf form */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_leaf_entry_t	*blp;		/* leaf area of the block */
+	xfs_dir2_block_tail_t	*btp;		/* tail area of the block */
+	int			count;		/* shortform entry count */
+	xfs_dir2_data_entry_t	*dep;		/* data entry in the block */
+	int			i;		/* block entry index */
+	int			i8count;	/* count of big-inode entries */
+	int			isdot;		/* entry is "." */
+	int			isdotdot;	/* entry is ".." */
+	xfs_mount_t		*mp;		/* mount structure pointer */
+	int			namelen;	/* total name bytes */
+	xfs_ino_t		parent=0;	/* parent inode number */
+	int			size=0;		/* total computed size */
+
+	mp = dp->i_mount;
+
+	count = i8count = namelen = 0;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+
+	/*
+	 * Iterate over the block's data entries by using the leaf pointers.
+	 */
+	for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
+		if ((addr = INT_GET(blp[i].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Calculate the pointer to the entry at hand.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+		      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr));
+		/*
+		 * Detect . and .., so we can special-case them.
+		 * . is not included in sf directories.
+		 * .. is included by just the parent inode number.
+		 */
+		isdot = dep->namelen == 1 && dep->name[0] == '.';
+		isdotdot =
+			dep->namelen == 2 &&
+			dep->name[0] == '.' && dep->name[1] == '.';
+#if XFS_BIG_FILESYSTEMS
+		if (!isdot)
+			i8count += INT_GET(dep->inumber, ARCH_CONVERT) > XFS_DIR2_MAX_SHORT_INUM;
+#endif
+		if (!isdot && !isdotdot) {
+			count++;
+			namelen += dep->namelen;
+		} else if (isdotdot)
+			parent = INT_GET(dep->inumber, ARCH_CONVERT);
+		/*
+		 * Calculate the new size, see if we should give up yet.
+		 */
+		size = XFS_DIR2_SF_HDR_SIZE(i8count) +		/* header */
+		       count +					/* namelen */
+		       count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
+		       namelen +				/* name */
+		       (i8count ?				/* inumber */
+				(uint)sizeof(xfs_dir2_ino8_t) * count :
+				(uint)sizeof(xfs_dir2_ino4_t) * count);
+		if (size > XFS_IFORK_DSIZE(dp))
+			return size;		/* oversized: signals failure */
+	}
+	/*
+	 * Create the output header, if it worked.
+	 */
+	sfhp->count = count;
+	sfhp->i8count = i8count;
+	XFS_DIR2_SF_PUT_INUMBER_ARCH((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent, ARCH_CONVERT);
+	return size;
+}
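+/*
+ * Worked example of the size formula above, assuming 4-byte inode
+ * numbers (i8count == 0): two entries "foo" and "quux" give count = 2
+ * and namelen = 7, so size = 6 (header: count, i8count, 4-byte parent)
+ * + 2 (namelen bytes) + 4 (offsets) + 7 (names) + 8 (inumbers)
+ * = 27 bytes of shortform data.
+ */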
+
+/*
+ * Convert a block format directory to shortform.
+ * Caller has already checked that it will fit, and built us a header.
+ */
+int						/* error */
+xfs_dir2_block_to_sf(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*bp,		/* block buffer */
+	int			size,		/* shortform directory size */
+	xfs_dir2_sf_hdr_t	*sfhp)		/* shortform directory hdr */
+{
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused data pointer */
+	char			*endptr;	/* end of data entries */
+	int			error;		/* error return value */
+	int			logflags;	/* inode logging flags */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	char			*ptr;		/* current data pointer */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+	xfs_ino_t		temp;		/* temporary inode number */
+
+	xfs_dir2_trace_args_sb("block_to_sf", args, size, bp);
+	dp = args->dp;
+	mp = dp->i_mount;
+
+	/*
+	 * Make a copy of the block data, so we can shrink the inode
+	 * and add local data.
+	 */
+	block = kmem_alloc(mp->m_dirblksize, KM_SLEEP);
+	bcopy(bp->data, block, mp->m_dirblksize);
+	logflags = XFS_ILOG_CORE;
+	if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
+		ASSERT(error != ENOSPC);
+		goto out;
+	}
+	/*
+	 * The buffer is now unconditionally gone, whether
+	 * xfs_dir2_shrink_inode worked or not.
+	 *
+	 * Convert the inode to local format.
+	 */
+	dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+	dp->i_df.if_flags |= XFS_IFINLINE;
+	dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+	ASSERT(dp->i_df.if_bytes == 0);
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+	logflags |= XFS_ILOG_DDATA;
+	/*
+	 * Copy the header into the newly allocated local space.
+	 */
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	bcopy(sfhp, sfp, XFS_DIR2_SF_HDR_SIZE(sfhp->i8count));
+	dp->i_d.di_size = size;
+	/*
+	 * Set up to loop over the block's entries.
+	 */
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	ptr = (char *)block->u;
+	endptr = (char *)XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	/*
+	 * Loop over the active and unused entries.
+	 * Stop when we reach the leaf/tail portion of the block.
+	 */
+	while (ptr < endptr) {
+		/*
+		 * If it's unused, just skip over it.
+		 */
+		dup = (xfs_dir2_data_unused_t *)ptr;
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			ptr += INT_GET(dup->length, ARCH_CONVERT);
+			continue;
+		}
+		dep = (xfs_dir2_data_entry_t *)ptr;
+		/*
+		 * Skip .
+		 */
+		if (dep->namelen == 1 && dep->name[0] == '.')
+			ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) == dp->i_ino);
+		/*
+		 * Skip .., but make sure the inode number is right.
+		 */
+		else if (dep->namelen == 2 &&
+			 dep->name[0] == '.' && dep->name[1] == '.')
+			ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) ==
+			       XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT));
+		/*
+		 * Normal entry, copy it into shortform.
+		 */
+		else {
+			sfep->namelen = dep->namelen;
+			XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep,
+				(xfs_dir2_data_aoff_t)
+				((char *)dep - (char *)block), ARCH_CONVERT);
+			bcopy(dep->name, sfep->name, dep->namelen);
+			temp=INT_GET(dep->inumber, ARCH_CONVERT);
+			XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &temp,
+				XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+			sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+		}
+		ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+	}
+	ASSERT((char *)sfep - (char *)sfp == size);
+	xfs_dir2_sf_check(args);
+out:
+	xfs_trans_log_inode(args->trans, dp, logflags);
+	kmem_free(block, mp->m_dirblksize);
+	return error;
+}
+
+/*
+ * Add a name to a shortform directory.
+ * There are two algorithms, "easy" and "hard", which we decide on
+ * before changing anything.
+ * Convert to block form first if the new entry won't fit as shortform.
+ */
+int						/* error */
+xfs_dir2_sf_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	int			add_entsize;	/* size of the new entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	int			incr_isize;	/* total change in size */
+	int			new_isize;	/* di_size after adding name */
+	int			objchange;	/* changing to 8-byte inodes */
+	xfs_dir2_data_aoff_t	offset;		/* offset for new entry */
+	int			old_isize;	/* di_size before adding name */
+	int			pick;		/* which algorithm to use */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
+
+	xfs_dir2_trace_args("sf_addname", args);
+	ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Make sure the shortform value has some of its header.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+	/*
+	 * Compute entry (and change in) size.
+	 */
+	add_entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen);
+	incr_isize = add_entsize;
+#if XFS_BIG_FILESYSTEMS
+	/*
+	 * Do we have to change to 8 byte inodes?
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
+		/*
+		 * Yes, adjust the entry size and the total size.
+		 */
+		add_entsize +=
+			(uint)sizeof(xfs_dir2_ino8_t) -
+			(uint)sizeof(xfs_dir2_ino4_t);
+		incr_isize +=
+			(sfp->hdr.count + 2) *
+			((uint)sizeof(xfs_dir2_ino8_t) -
+			 (uint)sizeof(xfs_dir2_ino4_t));
+		objchange = 1;
+	} else
+		objchange = 0;
+#else
+	objchange = 0;
+#endif
+	old_isize = (int)dp->i_d.di_size;
+	new_isize = old_isize + incr_isize;
+	/*
+	 * Won't fit as shortform any more (due to size),
+	 * or the pick routine says it won't (due to offset values).
+	 */
+	if (new_isize > XFS_IFORK_DSIZE(dp) ||
+	    (pick =
+	     xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
+		/*
+		 * Just checking or no space reservation, it doesn't fit.
+		 */
+		if (args->justcheck || args->total == 0)
+			return XFS_ERROR(ENOSPC);
+		/*
+		 * Convert to block form then add the name.
+		 */
+		error = xfs_dir2_sf_to_block(args);
+		if (error)
+			return error;
+		return xfs_dir2_block_addname(args);
+	}
+	/*
+	 * Just checking, it fits.
+	 */
+	if (args->justcheck)
+		return 0;
+	/*
+	 * Do it the easy way - just add it at the end.
+	 */
+	if (pick == 1)
+		xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
+	/*
+	 * Do it the hard way - look for a place to insert the new entry.
+	 * Convert to 8 byte inode numbers first if necessary.
+	 */
+	else {
+		ASSERT(pick == 2);
+#if XFS_BIG_FILESYSTEMS
+		if (objchange)
+			xfs_dir2_sf_toino8(args);
+#endif
+		xfs_dir2_sf_addname_hard(args, objchange, new_isize);
+	}
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+/*
+ * Add the new entry the "easy" way.
+ * This is copying the old directory and adding the new entry at the end.
+ * Since it's sorted by "offset" we need room after the last offset
+ * that's already there, and then room to convert to a block directory.
+ * This is already checked by the pick routine.
+ */
+static void
+xfs_dir2_sf_addname_easy(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dir2_sf_entry_t	*sfep,		/* pointer to new entry */
+	xfs_dir2_data_aoff_t	offset,		/* offset to use for new ent */
+	int			new_isize)	/* new directory size */
+{
+	int			byteoff;	/* byte offset in sf dir */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+
+	dp = args->dp;
+
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	byteoff = (int)((char *)sfep - (char *)sfp);
+	/*
+	 * Grow the in-inode space.
+	 */
+	xfs_idata_realloc(dp, XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen),
+		XFS_DATA_FORK);
+	/*
+	 * Need to set up again due to realloc of the inode data.
+	 */
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
+	/*
+	 * Fill in the new entry.
+	 */
+	sfep->namelen = args->namelen;
+	XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep, offset, ARCH_CONVERT);
+	bcopy(args->name, sfep->name, sfep->namelen);
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &args->inumber,
+		XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+	/*
+	 * Update the header and inode.
+	 */
+	sfp->hdr.count++;
+#if XFS_BIG_FILESYSTEMS
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
+		sfp->hdr.i8count++;
+#endif
+	dp->i_d.di_size = new_isize;
+	xfs_dir2_sf_check(args);
+}
+
+/*
+ * Add the new entry the "hard" way.
+ * The caller has already converted to 8 byte inode numbers if necessary,
+ * in which case we need to leave the i8count at 1.
+ * Find a hole that the new entry will fit into, and copy
+ * the first part of the entries, the new entry, and the last part of
+ * the entries.
+ */
+/* ARGSUSED */
+static void
+xfs_dir2_sf_addname_hard(
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			objchange,	/* changing inode number size */
+	int			new_isize)	/* new directory size */
+{
+	int			add_datasize;	/* data size need for new ent */
+	char			*buf;		/* buffer for old */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			eof;		/* reached end of old dir */
+	int			nbytes;		/* temp for byte copies */
+	xfs_dir2_data_aoff_t	new_offset;	/* next offset value */
+	xfs_dir2_data_aoff_t	offset;		/* current offset value */
+	int			old_isize;	/* previous di_size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* entry in original dir */
+	xfs_dir2_sf_t		*oldsfp;	/* original shortform dir */
+	xfs_dir2_sf_entry_t	*sfep;		/* entry in new dir */
+	xfs_dir2_sf_t		*sfp;		/* new shortform dir */
+
+	/*
+	 * Copy the old directory to a temporary buffer.
+	 */
+	dp = args->dp;
+
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	old_isize = (int)dp->i_d.di_size;
+	buf = kmem_alloc(old_isize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_t *)buf;
+	bcopy(sfp, oldsfp, old_isize);
+	/*
+	 * Loop over the old directory finding the place we're going
+	 * to insert the new entry.
+	 * If it's going to end up at the end then oldsfep will point there.
+	 */
+	for (offset = XFS_DIR2_DATA_FIRST_OFFSET,
+	      oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp),
+	      add_datasize = XFS_DIR2_DATA_ENTSIZE(args->namelen),
+	      eof = (char *)oldsfep == &buf[old_isize];
+	     !eof;
+	     offset = new_offset + XFS_DIR2_DATA_ENTSIZE(oldsfep->namelen),
+	      oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep),
+	      eof = (char *)oldsfep == &buf[old_isize]) {
+		new_offset = XFS_DIR2_SF_GET_OFFSET_ARCH(oldsfep, ARCH_CONVERT);
+		if (offset + add_datasize <= new_offset)
+			break;
+	}
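+	/*
+	 * On loop exit, offset is where the new entry's data will go:
+	 * either the first hole large enough for it, or (at eof) just
+	 * past the last existing entry.
+	 */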
+	/*
+	 * Get rid of the old directory, then allocate space for
+	 * the new one.	 We do this so xfs_idata_realloc won't copy
+	 * the data.
+	 */
+	xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+	/*
+	 * Reset the pointer since the buffer was reallocated.
+	 */
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Copy the first part of the directory, including the header.
+	 */
+	nbytes = (int)((char *)oldsfep - (char *)oldsfp);
+	bcopy(oldsfp, sfp, nbytes);
+	sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
+	/*
+	 * Fill in the new entry, and update the header counts.
+	 */
+	sfep->namelen = args->namelen;
+	XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep, offset, ARCH_CONVERT);
+	bcopy(args->name, sfep->name, sfep->namelen);
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &args->inumber,
+		XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+	sfp->hdr.count++;
+#if XFS_BIG_FILESYSTEMS
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
+		sfp->hdr.i8count++;
+#endif
+	/*
+	 * If there's more left to copy, do that.
+	 */
+	if (!eof) {
+		sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+		bcopy(oldsfep, sfep, old_isize - nbytes);
+	}
+	kmem_free(buf, old_isize);
+	dp->i_d.di_size = new_isize;
+	xfs_dir2_sf_check(args);
+}
+
+/*
+ * Decide if the new entry will fit at all.
+ * If it will fit, pick between adding the new entry to the end (easy)
+ * or somewhere else (hard).
+ * Return 0 (won't fit), 1 (easy), 2 (hard).
+ */
+/*ARGSUSED*/
+static int					/* pick result */
+xfs_dir2_sf_addname_pick(
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			objchange,	/* inode # size changes */
+	xfs_dir2_sf_entry_t	**sfepp,	/* out(1): new entry ptr */
+	xfs_dir2_data_aoff_t	*offsetp)	/* out(1): new offset */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			holefit;	/* found hole it will fit in */
+	int			i;		/* entry number */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_data_aoff_t	offset;		/* data block offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+	int			size;		/* entry's data size */
+	int			used;		/* data bytes used */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	size = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+	offset = XFS_DIR2_DATA_FIRST_OFFSET;
+	sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	holefit = 0;
+	/*
+	 * Loop over sf entries.
+	 * Keep track of data offset and whether we've seen a place
+	 * to insert the new entry.
+	 */
+	for (i = 0; i < sfp->hdr.count; i++) {
+		if (!holefit)
+			holefit = offset + size <= XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT);
+		offset = XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT) +
+			 XFS_DIR2_DATA_ENTSIZE(sfep->namelen);
+		sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+	}
+	/*
+	 * Calculate the data bytes used, excluding the new entry's data,
+	 * as if this were a data block (block form directory).  The
+	 * count + 3 leaf entries cover the existing entries plus ".",
+	 * "..", and the new entry.
+	 */
+	used = offset +
+	       (sfp->hdr.count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+	       (uint)sizeof(xfs_dir2_block_tail_t);
+	/*
+	 * If it won't even fit in block form then we can't insert it;
+	 * we'll go back, convert to block form, do the insert there,
+	 * and convert to leaf form later as needed.
+	 */
+	if (used + (holefit ? 0 : size) > mp->m_dirblksize)
+		return 0;
+	/*
+	 * If changing the inode number size, do it the hard way.
+	 */
+#if XFS_BIG_FILESYSTEMS
+	if (objchange) {
+		return 2;
+	}
+#else
+	ASSERT(objchange == 0);
+#endif
+	/*
+	 * If it won't fit at the end then do it the hard way (use the hole).
+	 */
+	if (used + size > mp->m_dirblksize)
+		return 2;
+	/*
+	 * Do it the easy way.
+	 */
+	*sfepp = sfep;
+	*offsetp = offset;
+	return 1;
+}
+
+#ifdef DEBUG
+/*
+ * Check consistency of shortform directory, assert if bad.
+ */
+static void
+xfs_dir2_sf_check(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry number */
+	int			i8count;	/* number of big inode#s */
+	xfs_ino_t		ino;		/* entry inode number */
+	int			offset;		/* data offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform dir entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+
+	dp = args->dp;
+
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	offset = XFS_DIR2_DATA_FIRST_OFFSET;
+	ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT);
+	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
+
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	     i < sfp->hdr.count;
+	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
+		ASSERT(XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT) >= offset);
+		ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+		i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
+		offset =
+			XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT) +
+			XFS_DIR2_DATA_ENTSIZE(sfep->namelen);
+	}
+	ASSERT(i8count == sfp->hdr.i8count);
+#if !XFS_BIG_FILESYSTEMS
+	ASSERT(i8count == 0);
+#endif
+	ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
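+	/*
+	 * Everything must still fit in one directory block if we convert
+	 * to block form: the data bytes (offset), a leaf entry for each
+	 * entry plus "." and ".." (count + 2), and the block tail.
+	 */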
+	ASSERT(offset +
+	       (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+	       (uint)sizeof(xfs_dir2_block_tail_t) <=
+	       dp->i_mount->m_dirblksize);
+}
+#endif	/* DEBUG */
+
+/*
+ * Create a new (shortform) directory.
+ */
+int					/* error, always 0 */
+xfs_dir2_sf_create(
+	xfs_da_args_t	*args,		/* operation arguments */
+	xfs_ino_t	pino)		/* parent inode number */
+{
+	xfs_inode_t	*dp;		/* incore directory inode */
+	int		i8count;	/* parent inode is an 8-byte number */
+	xfs_dir2_sf_t	*sfp;		/* shortform structure */
+	int		size;		/* directory size */
+
+	xfs_dir2_trace_args_i("sf_create", args, pino);
+	dp = args->dp;
+
+	ASSERT(dp != NULL);
+	ASSERT(dp->i_d.di_size == 0);
+	/*
+	 * If it's currently a zero-length extent file,
+	 * convert it to local format.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
+		dp->i_df.if_flags &= ~XFS_IFEXTENTS;	/* just in case */
+		dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+		dp->i_df.if_flags |= XFS_IFINLINE;
+	}
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	ASSERT(dp->i_df.if_bytes == 0);
+	i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
+	size = XFS_DIR2_SF_HDR_SIZE(i8count);
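+	/*
+	 * For example, with a 4-byte parent the header is 6 bytes
+	 * (1-byte count, 1-byte i8count, 4-byte parent inode number);
+	 * with an 8-byte parent it is 10 bytes.
+	 */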
+	/*
+	 * Make a buffer for the data.
+	 */
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+	/*
+	 * Fill in the header.
+	 */
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	sfp->hdr.i8count = i8count;
+	/*
+	 * Now we can put in the parent inode number, since i8count is set.
+	 */
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &pino, &sfp->hdr.parent, ARCH_CONVERT);
+	sfp->hdr.count = 0;
+	dp->i_d.di_size = size;
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+int						/* error */
+xfs_dir2_sf_getdents(
+	xfs_inode_t		*dp,		/* incore directory inode */
+	uio_t			*uio,		/* caller's buffer control */
+	int			*eofp,		/* eof reached? (out) */
+	xfs_dirent_t		*dbp,		/* caller's buffer */
+	xfs_dir2_put_t		put)		/* abi's formatting function */
+{
+	int			error;		/* error return value */
+	int			i;		/* shortform entry number */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_dataptr_t	off;		/* current entry's offset */
+	xfs_dir2_put_args_t	p;		/* arg package for put rtn */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+	xfs_off_t		dir_offset;	/* starting directory offset */
+
+	mp = dp->i_mount;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Give up if the directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		return XFS_ERROR(EIO);
+	}
+
+	dir_offset = uio->uio_offset;
+
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+
+	ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+
+	/*
+	 * If the block number in the offset is out of range, we're done.
+	 */
+	if (XFS_DIR2_DATAPTR_TO_DB(mp, dir_offset) > mp->m_dirdatablk) {
+		*eofp = 1;
+		return 0;
+	}
+
+	/*
+	 * Set up putargs structure.
+	 */
+	p.dbp = dbp;
+	p.put = put;
+	p.uio = uio;
+	/*
+	 * Put . entry unless we're starting past it.
+	 */
+	if (dir_offset <=
+		    XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+					       XFS_DIR2_DATA_DOT_OFFSET)) {
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, 0,
+						XFS_DIR2_DATA_DOT_OFFSET);
+#if XFS_BIG_FILESYSTEMS
+		p.ino = dp->i_ino + mp->m_inoadd;
+#else
+		p.ino = dp->i_ino;
+#endif
+		p.name = ".";
+		p.namelen = 1;
+
+		error = p.put(&p);
+
+		if (!p.done) {
+			uio->uio_offset =
+				XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+						XFS_DIR2_DATA_DOT_OFFSET);
+			return error;
+		}
+	}
+
+	/*
+	 * Put .. entry unless we're starting past it.
+	 */
+	if (dir_offset <=
+		    XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+					       XFS_DIR2_DATA_DOTDOT_OFFSET)) {
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+						XFS_DIR2_DATA_DOTDOT_OFFSET);
+#if XFS_BIG_FILESYSTEMS
+		p.ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT) +
+			mp->m_inoadd;
+#else
+		p.ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT);
+#endif
+		p.name = "..";
+		p.namelen = 2;
+
+		error = p.put(&p);
+
+		if (!p.done) {
+			uio->uio_offset =
+				XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+					XFS_DIR2_DATA_DOTDOT_OFFSET);
+			return error;
+		}
+	}
+
+	/*
+	 * Loop while there are more entries and put'ing works.
+	 */
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	     i < sfp->hdr.count;
+	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
+
+		off = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+				XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT));
+
+		if (dir_offset > off)
+			continue;
+
+		p.namelen = sfep->namelen;
+
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+				XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT));
+
+#if XFS_BIG_FILESYSTEMS
+		p.ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+				XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT) +
+			mp->m_inoadd;
+#else
+		p.ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+				XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+#endif
+		p.name = (char *)sfep->name;
+
+		error = p.put(&p);
+
+		if (!p.done) {
+			uio->uio_offset = off;
+			return error;
+		}
+	}
+
+	/*
+	 * They all fit.
+	 */
+	*eofp = 1;
+
+	uio->uio_offset =
+		XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0);
+
+	return 0;
+}
+
+/*
+ * Lookup an entry in a shortform directory.
+ * Returns EEXIST if found, ENOENT if not found.
+ */
+int						/* error */
+xfs_dir2_sf_lookup(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+
+	xfs_dir2_trace_args("sf_lookup", args);
+	xfs_dir2_sf_check(args);
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Bail out if the directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+	/*
+	 * Special case for .
+	 */
+	if (args->namelen == 1 && args->name[0] == '.') {
+		args->inumber = dp->i_ino;
+		return XFS_ERROR(EEXIST);
+	}
+	/*
+	 * Special case for ..
+	 */
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+		args->inumber = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT);
+		return XFS_ERROR(EEXIST);
+	}
+	/*
+	 * Loop over all the entries trying to match ours.
+	 */
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	     i < sfp->hdr.count;
+	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
+		if (sfep->namelen == args->namelen &&
+		    sfep->name[0] == args->name[0] &&
+		    bcmp(args->name, sfep->name, args->namelen) == 0) {
+			args->inumber =
+				XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+					XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+			return XFS_ERROR(EEXIST);
+		}
+	}
+	/*
+	 * Didn't find it.
+	 */
+	ASSERT(args->oknoent);
+	return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Remove an entry from a shortform directory.
+ */
+int						/* error */
+xfs_dir2_sf_removename(
+	xfs_da_args_t		*args)
+{
+	int			byteoff;	/* offset of removed entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			entsize;	/* this entry's size */
+	int			i;		/* shortform entry index */
+	int			newsize;	/* new inode size */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+
+	xfs_dir2_trace_args("sf_removename", args);
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	oldsize = (int)dp->i_d.di_size;
+	/*
+	 * Bail out if the directory is way too short.
+	 */
+	if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == oldsize);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsize >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+	/*
+	 * Loop over the old directory entries.
+	 * Find the one we're deleting.
+	 */
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	     i < sfp->hdr.count;
+	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
+		if (sfep->namelen == args->namelen &&
+		    sfep->name[0] == args->name[0] &&
+		    bcmp(sfep->name, args->name, args->namelen) == 0) {
+			ASSERT(XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+					XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT) ==
+				args->inumber);
+			break;
+		}
+	}
+	/*
+	 * Didn't find it.
+	 */
+	if (i == sfp->hdr.count) {
+		return XFS_ERROR(ENOENT);
+	}
+	/*
+	 * Calculate sizes.
+	 */
+	byteoff = (int)((char *)sfep - (char *)sfp);
+	entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen);
+	newsize = oldsize - entsize;
+	/*
+	 * Copy the part, if any, after the removed entry, sliding it down.
+	 */
+	if (byteoff + entsize < oldsize)
+		ovbcopy((char *)sfp + byteoff + entsize, (char *)sfp + byteoff,
+			oldsize - (byteoff + entsize));
+	/*
+	 * Fix up the header and file size.
+	 */
+	sfp->hdr.count--;
+	dp->i_d.di_size = newsize;
+	/*
+	 * Reallocate, making it smaller.
+	 */
+	xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+#if XFS_BIG_FILESYSTEMS
+	/*
+	 * Are we changing inode number size?
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+		if (sfp->hdr.i8count == 1)
+			xfs_dir2_sf_toino4(args);
+		else
+			sfp->hdr.i8count--;
+	}
+#endif
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+/*
+ * Replace the inode number of an entry in a shortform directory.
+ */
+int						/* error */
+xfs_dir2_sf_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+#if XFS_BIG_FILESYSTEMS || defined(DEBUG)
+	xfs_ino_t		ino=0;		/* entry old inode number */
+#endif
+#if XFS_BIG_FILESYSTEMS
+	int			i8elevated;	/* sf_toino8 set i8count=1 */
+#endif
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+
+	xfs_dir2_trace_args("sf_replace", args);
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Bail out if the shortform directory is way too small.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+#if XFS_BIG_FILESYSTEMS
+	/*
+	 * The new inode number is large, so we need to convert to
+	 * 8-byte inode numbers.
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
+		int	error;			/* error return value */
+		int	newsize;		/* new inode size */
+
+		newsize =
+			dp->i_df.if_bytes +
+			(sfp->hdr.count + 1) *
+			((uint)sizeof(xfs_dir2_ino8_t) -
+			 (uint)sizeof(xfs_dir2_ino4_t));
+		/*
+		 * Won't fit as shortform, convert to block then do replace.
+		 */
+		if (newsize > XFS_IFORK_DSIZE(dp)) {
+			error = xfs_dir2_sf_to_block(args);
+			if (error) {
+				return error;
+			}
+			return xfs_dir2_block_replace(args);
+		}
+		/*
+		 * Still fits, convert to 8-byte now.
+		 */
+		xfs_dir2_sf_toino8(args);
+		i8elevated = 1;
+		sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	} else
+		i8elevated = 0;
+#endif
+	ASSERT(args->namelen != 1 || args->name[0] != '.');
+	/*
+	 * Replace ..'s entry.
+	 */
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+#if XFS_BIG_FILESYSTEMS || defined(DEBUG)
+		ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT);
+		ASSERT(args->inumber != ino);
+#endif
+		XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &args->inumber, &sfp->hdr.parent, ARCH_CONVERT);
+	}
+	/*
+	 * Normal entry, look for the name.
+	 */
+	else {
+		for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+		     i < sfp->hdr.count;
+		     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
+			if (sfep->namelen == args->namelen &&
+			    sfep->name[0] == args->name[0] &&
+			    bcmp(args->name, sfep->name, args->namelen) == 0) {
+#if XFS_BIG_FILESYSTEMS || defined(DEBUG)
+				ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+					XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+				ASSERT(args->inumber != ino);
+#endif
+				XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &args->inumber,
+					XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+				break;
+			}
+		}
+		/*
+		 * Didn't find it.
+		 */
+		if (i == sfp->hdr.count) {
+			ASSERT(args->oknoent);
+#if XFS_BIG_FILESYSTEMS
+			if (i8elevated)
+				xfs_dir2_sf_toino4(args);
+#endif
+			return XFS_ERROR(ENOENT);
+		}
+	}
+#if XFS_BIG_FILESYSTEMS
+	/*
+	 * See if the old number was large, the new number is small.
+	 */
+	if (ino > XFS_DIR2_MAX_SHORT_INUM &&
+	    args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
+		/*
+		 * And the old i8count was one, so convert back to 4-byte inodes.
+		 */
+		if (sfp->hdr.i8count == 1)
+			xfs_dir2_sf_toino4(args);
+		else
+			sfp->hdr.i8count--;
+	}
+	/*
+	 * See if the old number was small, the new number is large.
+	 */
+	if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
+	    args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+		/*
+		 * Add to the i8count unless we just converted to 8-byte
+		 * inodes (which did an implicit i8count = 1).
+		 */
+		ASSERT(sfp->hdr.i8count != 0);
+		if (!i8elevated)
+			sfp->hdr.i8count++;
+	}
+#endif
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+	return 0;
+}
+
+#if XFS_BIG_FILESYSTEMS
+/*
+ * Convert from 8-byte inode numbers to 4-byte inode numbers.
+ * The last 8-byte inode number is gone, but the count is still 1.
+ */
+static void
+xfs_dir2_sf_toino4(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	char			*buf;		/* old dir's buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	xfs_ino_t		ino;		/* entry inode number */
+	int			newsize;	/* new inode size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* old sf entry */
+	xfs_dir2_sf_t		*oldsfp;	/* old sf directory */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
+	xfs_dir2_sf_t		*sfp;		/* new sf directory */
+
+	xfs_dir2_trace_args("sf_toino4", args);
+	dp = args->dp;
+
+	/*
+	 * Copy the old directory to the buffer.
+	 * Then nuke it from the inode, and add the new buffer to the inode.
+	 * Don't want xfs_idata_realloc copying the data here.
+	 */
+	oldsize = dp->i_df.if_bytes;
+	buf = kmem_alloc(oldsize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsfp->hdr.i8count == 1);
+	bcopy(oldsfp, buf, oldsize);
+	/*
+	 * Compute the new inode size.
+	 */
+	newsize =
+		oldsize -
+		(oldsfp->hdr.count + 1) *
+		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+	/*
+	 * Reset our pointers, the data has moved.
+	 */
+	oldsfp = (xfs_dir2_sf_t *)buf;
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Fill in the new header.
+	 */
+	sfp->hdr.count = oldsfp->hdr.count;
+	sfp->hdr.i8count = 0;
+	ino = XFS_DIR2_SF_GET_INUMBER_ARCH(oldsfp, &oldsfp->hdr.parent, ARCH_CONVERT);
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &ino, &sfp->hdr.parent, ARCH_CONVERT);
+	/*
+	 * Copy the entries field by field.
+	 */
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp),
+		    oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp);
+	     i < sfp->hdr.count;
+	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep),
+		  oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) {
+		sfep->namelen = oldsfep->namelen;
+		sfep->offset = oldsfep->offset;
+		bcopy(oldsfep->name, sfep->name, sfep->namelen);
+		ino = XFS_DIR2_SF_GET_INUMBER_ARCH(oldsfp,
+			XFS_DIR2_SF_INUMBERP(oldsfep), ARCH_CONVERT);
+		XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+	}
+	/*
+	 * Clean up the inode.
+	 */
+	kmem_free(buf, oldsize);
+	dp->i_d.di_size = newsize;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
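+
+/*
+ * Worked example (editorial, following the arithmetic above): a
+ * shortform directory with hdr.count == 2 stores three inode numbers,
+ * one per entry plus the parent in the header, hence the (count + 1)
+ * factor.  Converting it from 8-byte to 4-byte inode numbers therefore
+ * shrinks the data fork by (2 + 1) * (8 - 4) = 12 bytes.
+ */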
+
+/*
+ * Convert from 4-byte inode numbers to 8-byte inode numbers.
+ * The new 8-byte inode number is not there yet; we leave with
+ * i8count set to 1 but no corresponding entry.
+ */
+static void
+xfs_dir2_sf_toino8(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	char			*buf;		/* old dir's buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	xfs_ino_t		ino;		/* entry inode number */
+	int			newsize;	/* new inode size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* old sf entry */
+	xfs_dir2_sf_t		*oldsfp;	/* old sf directory */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
+	xfs_dir2_sf_t		*sfp;		/* new sf directory */
+
+	xfs_dir2_trace_args("sf_toino8", args);
+	dp = args->dp;
+
+	/*
+	 * Copy the old directory to the buffer.
+	 * Then nuke it from the inode, and add the new buffer to the inode.
+	 * Don't want xfs_idata_realloc copying the data here.
+	 */
+	oldsize = dp->i_df.if_bytes;
+	buf = kmem_alloc(oldsize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsfp->hdr.i8count == 0);
+	bcopy(oldsfp, buf, oldsize);
+	/*
+	 * Compute the new inode size.
+	 */
+	newsize =
+		oldsize +
+		(oldsfp->hdr.count + 1) *
+		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+	/*
+	 * Reset our pointers, the data has moved.
+	 */
+	oldsfp = (xfs_dir2_sf_t *)buf;
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Fill in the new header.
+	 */
+	sfp->hdr.count = oldsfp->hdr.count;
+	sfp->hdr.i8count = 1;
+	ino = XFS_DIR2_SF_GET_INUMBER_ARCH(oldsfp, &oldsfp->hdr.parent, ARCH_CONVERT);
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &ino, &sfp->hdr.parent, ARCH_CONVERT);
+	/*
+	 * Copy the entries field by field.
+	 */
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp),
+		    oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp);
+	     i < sfp->hdr.count;
+	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep),
+		  oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) {
+		sfep->namelen = oldsfep->namelen;
+		sfep->offset = oldsfep->offset;
+		bcopy(oldsfep->name, sfep->name, sfep->namelen);
+		ino = XFS_DIR2_SF_GET_INUMBER_ARCH(oldsfp,
+			XFS_DIR2_SF_INUMBERP(oldsfep), ARCH_CONVERT);
+		XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+	}
+	/*
+	 * Clean up the inode.
+	 */
+	kmem_free(buf, oldsize);
+	dp->i_d.di_size = newsize;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+#endif	/* XFS_BIG_FILESYSTEMS */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_sf.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_sf.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_sf.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_sf.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_SF_H__
+#define __XFS_DIR2_SF_H__
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to
+ * fit into the literal area of the inode.
+ */
+
+struct dirent;
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_dir2_block;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Maximum size of a shortform directory.
+ */
+#define XFS_DIR2_SF_MAX_SIZE	\
+	(XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \
+	 (uint)sizeof(xfs_agino_t))
+
+/*
+ * Inode number stored as 8 8-bit values.
+ */
+typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
+
+#define XFS_DIR2_SF_GET_INO8_ARCH(di,arch)	\
+	(xfs_ino_t)(DIRINO_GET_ARCH(&di,arch))
+#define XFS_DIR2_SF_GET_INO8(di)		\
+	XFS_DIR2_SF_GET_INO8_ARCH(di,ARCH_NOCONVERT)
+
+/*
+ * Inode number stored as 4 8-bit values.
+ * Works a lot of the time, when all the inode numbers in a directory
+ * fit in 32 bits.
+ */
+typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+#define XFS_DIR2_SF_GET_INO4_ARCH(di,arch)	\
+	(xfs_ino_t)(DIRINO4_GET_ARCH(&di,arch))
+#define XFS_DIR2_SF_GET_INO4(di)		\
+	XFS_DIR2_SF_GET_INO4_ARCH(di,ARCH_NOCONVERT)
+
+typedef union {
+	xfs_dir2_ino8_t i8;
+	xfs_dir2_ino4_t i4;
+} xfs_dir2_inou_t;
+#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
+
+/*
+ * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
+ * Only need 16 bits, this is the byte offset into the single block form.
+ */
+typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t;
+
+/*
+ * The parent directory has a dedicated field, and the self-pointer must
+ * be calculated on the fly.
+ *
+ * Entries are packed toward the top as tightly as possible.  The header
+ * and the elements must be bcopy()'d out into a work area to get correct
+ * alignment for the inode number fields.
+ */
+typedef struct xfs_dir2_sf_hdr {
+	__uint8_t		count;		/* count of entries */
+	__uint8_t		i8count;	/* count of 8-byte inode #s */
+	xfs_dir2_inou_t		parent;		/* parent dir inode number */
+} xfs_dir2_sf_hdr_t;
+
+typedef struct xfs_dir2_sf_entry {
+	__uint8_t		namelen;	/* actual name length */
+	xfs_dir2_sf_off_t	offset;		/* saved offset */
+	__uint8_t		name[1];	/* name, variable size */
+	xfs_dir2_inou_t		inumber;	/* inode number, var. offset */
+} xfs_dir2_sf_entry_t;
+
+typedef struct xfs_dir2_sf {
+	xfs_dir2_sf_hdr_t	hdr;		/* shortform header */
+	xfs_dir2_sf_entry_t	list[1];	/* shortform entries */
+} xfs_dir2_sf_t;
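+
+/*
+ * Illustrative layout (editorial; sizes assume the natural byte packing
+ * of the arrays above): in a directory using 4-byte inode numbers, the
+ * entry for the name "foo" occupies 10 bytes:
+ *
+ *	offset	bytes	field
+ *	0	1	namelen = 3
+ *	1	2	offset (xfs_dir2_sf_off_t)
+ *	3	3	name "foo"
+ *	6	4	inumber (xfs_dir2_ino4_t)
+ *
+ * The inode number starts at &name[namelen] (XFS_DIR2_SF_INUMBERP below)
+ * and is in general misaligned, which is why these on-disk fields are
+ * byte arrays accessed through the GET/PUT macros rather than typed
+ * integer loads.
+ */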
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_HDR_SIZE)
+int xfs_dir2_sf_hdr_size(int i8count);
+#define XFS_DIR2_SF_HDR_SIZE(i8count)	xfs_dir2_sf_hdr_size(i8count)
+#else
+#define XFS_DIR2_SF_HDR_SIZE(i8count)	\
+	((uint)sizeof(xfs_dir2_sf_hdr_t) - \
+	 ((i8count) == 0) * \
+	 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
+#endif
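+
+/*
+ * Worked example (editorial): with no padding (all header fields are
+ * byte arrays), sizeof(xfs_dir2_sf_hdr_t) is 1 + 1 + 8 = 10 bytes, so
+ * XFS_DIR2_SF_HDR_SIZE(0) = 10 - (8 - 4) = 6 bytes with a 4-byte
+ * parent, and any nonzero i8count yields the full 10 bytes.
+ */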
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_INUMBERP)
+xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep);
+#define XFS_DIR2_SF_INUMBERP(sfep)	xfs_dir2_sf_inumberp(sfep)
+#else
+#define XFS_DIR2_SF_INUMBERP(sfep)	\
+	((xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen])
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_GET_INUMBER)
+xfs_intino_t xfs_dir2_sf_get_inumber_arch(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from,
+					    xfs_arch_t arch);
+#define XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, from, arch)	\
+	xfs_dir2_sf_get_inumber_arch(sfp, from, arch)
+
+#else
+#define XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, from, arch)	\
+	((sfp)->hdr.i8count == 0 ? \
+		(xfs_intino_t)XFS_DIR2_SF_GET_INO4_ARCH(*(from), arch) : \
+		(xfs_intino_t)XFS_DIR2_SF_GET_INO8_ARCH(*(from), arch))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_PUT_INUMBER)
+void xfs_dir2_sf_put_inumber_arch(xfs_dir2_sf_t *sfp, xfs_ino_t *from,
+				     xfs_dir2_inou_t *to, xfs_arch_t arch);
+#define XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp,from,to,arch)	\
+	xfs_dir2_sf_put_inumber_arch(sfp,from,to,arch)
+#else
+#define XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp,from,to,arch)	\
+	if ((sfp)->hdr.i8count == 0) { \
+	    DIRINO4_COPY_ARCH(from,to,arch); \
+	} else { \
+	    DIRINO_COPY_ARCH(from,to,arch); \
+	}
+#endif
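+
+/*
+ * Usage note (editorial): unlike the other macros here, the macro form
+ * above expands to a bare if/else, so it cannot be the un-braced body
+ * of an outer if.  A hypothetical
+ *
+ *	if (cond)
+ *		XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &ino, to, arch);
+ *	else
+ *		other();
+ *
+ * would not compile: the semicolon ends the outer if before its else.
+ * As a standalone statement the expansion is harmless.
+ */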
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_GET_OFFSET)
+xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset_arch(xfs_dir2_sf_entry_t *sfep,
+						    xfs_arch_t arch);
+xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep);
+#define XFS_DIR2_SF_GET_OFFSET_ARCH(sfep,arch)	\
+	xfs_dir2_sf_get_offset_arch(sfep,arch)
+#else
+#define XFS_DIR2_SF_GET_OFFSET_ARCH(sfep,arch)	\
+	INT_GET_UNALIGNED_16_ARCH(&(sfep)->offset.i,arch)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_PUT_OFFSET)
+void xfs_dir2_sf_put_offset_arch(xfs_dir2_sf_entry_t *sfep,
+				    xfs_dir2_data_aoff_t off, xfs_arch_t arch);
+#define XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep,off,arch) \
+	xfs_dir2_sf_put_offset_arch(sfep,off,arch)
+#else
+#define XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep,off,arch)	\
+	INT_SET_UNALIGNED_16_ARCH(&(sfep)->offset.i,off,arch)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_ENTSIZE_BYNAME)
+int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len);
+#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len)	\
+	xfs_dir2_sf_entsize_byname(sfp,len)
+#else
+#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len)	/* space a name uses */ \
+	((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \
+	 ((sfp)->hdr.i8count == 0) * \
+	 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_ENTSIZE_BYENTRY)
+int xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep);
+#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep)	\
+	xfs_dir2_sf_entsize_byentry(sfp,sfep)
+#else
+#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep)	/* space an entry uses */ \
+	((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (sfep)->namelen - \
+	 ((sfp)->hdr.i8count == 0) * \
+	 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
+#endif
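+
+/*
+ * Worked example (editorial): sizeof(xfs_dir2_sf_entry_t) is
+ * 1 + 2 + 1 + 8 = 12 bytes including the one-byte name placeholder, so
+ * the entry for the name "foo" is 12 - 1 + 3 - (8 - 4) = 10 bytes in a
+ * 4-byte-inode directory, and 14 bytes once i8count is nonzero.
+ */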
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_FIRSTENTRY)
+xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp);
+#define XFS_DIR2_SF_FIRSTENTRY(sfp)	xfs_dir2_sf_firstentry(sfp)
+#else
+#define XFS_DIR2_SF_FIRSTENTRY(sfp)	/* first entry in struct */ \
+	((xfs_dir2_sf_entry_t *) \
+	 ((char *)(sfp) + XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_NEXTENTRY)
+xfs_dir2_sf_entry_t *xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp,
+					   xfs_dir2_sf_entry_t *sfep);
+#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep)		xfs_dir2_sf_nextentry(sfp,sfep)
+#else
+#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep)		/* next entry in struct */ \
+	((xfs_dir2_sf_entry_t *) \
+		((char *)(sfep) + XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep)))
+#endif
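+
+/*
+ * Typical iteration (editorial sketch, mirroring the callers in
+ * xfs_dir2_sf.c): since the entries are variable length, walk them by
+ * stepping each entry by its own size:
+ *
+ *	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+ *	     i < sfp->hdr.count;
+ *	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
+ *		...
+ *	}
+ */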
+
+/*
+ * Functions.
+ */
+
+extern int
+	xfs_dir2_block_sfsize(struct xfs_inode *dp,
+			      struct xfs_dir2_block *block,
+			      xfs_dir2_sf_hdr_t *sfhp);
+
+extern int
+	xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp,
+			     int size, xfs_dir2_sf_hdr_t *sfhp);
+
+extern int
+	xfs_dir2_sf_addname(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
+
+extern int
+	xfs_dir2_sf_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
+			     struct xfs_dirent *dbp, xfs_dir2_put_t put);
+
+extern int
+	xfs_dir2_sf_lookup(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_sf_removename(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_sf_replace(struct xfs_da_args *args);
+
+#endif	/* __XFS_DIR2_SF_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_trace.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_trace.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_trace.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_trace.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_trace.c
+ * Tracing for xfs v2 directories.
+ */
+#include <xfs.h>
+
+
+#ifdef DEBUG
+ktrace_t	*xfs_dir2_trace_buf;
+#endif /* DEBUG */
+
+#ifdef XFS_DIR2_TRACE
+/*
+ * Enter something in the trace buffers.
+ */
+static void
+xfs_dir2_trace_enter(
+	xfs_inode_t	*dp,
+	int		type,
+	char		*where,
+	char		*name,
+	int		namelen,
+	__psunsigned_t	a0,
+	__psunsigned_t	a1,
+	__psunsigned_t	a2,
+	__psunsigned_t	a3,
+	__psunsigned_t	a4,
+	__psunsigned_t	a5,
+	__psunsigned_t	a6)
+{
+	__psunsigned_t	n[6];
+
+	ASSERT(xfs_dir2_trace_buf);
+	ASSERT(dp->i_dir_trace);
+	if (name)
+		bcopy(name, n, min(sizeof(n), namelen));
+	else
+		bzero((char *)n, sizeof(n));
+	ktrace_enter(xfs_dir2_trace_buf,
+		(void *)(__psunsigned_t)type, (void *)where,
+		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
+		(void *)a4, (void *)a5, (void *)a6,
+		(void *)(__psunsigned_t)namelen,
+		(void *)n[0], (void *)n[1], (void *)n[2],
+		(void *)n[3], (void *)n[4], (void *)n[5]);
+	ktrace_enter(dp->i_dir_trace,
+		(void *)(__psunsigned_t)type, (void *)where,
+		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
+		(void *)a4, (void *)a5, (void *)a6,
+		(void *)(__psunsigned_t)namelen,
+		(void *)n[0], (void *)n[1], (void *)n[2],
+		(void *)n[3], (void *)n[4], (void *)n[5]);
+}
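+
+/*
+ * Editorial note: each record above carries 16 values; slots 0-9 hold
+ * the trace type, caller, the seven argument words and the name length,
+ * and slots 10-15 hold the first min(sizeof(n), namelen) bytes of the
+ * name, copied raw into n[].
+ */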
+
+void
+xfs_dir2_trace_args(
+	char		*where,
+	xfs_da_args_t	*args)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck, 0, 0);
+}
+
+void
+xfs_dir2_trace_args_b(
+	char		*where,
+	xfs_da_args_t	*args,
+	xfs_dabuf_t	*bp)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_B, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck,
+		(__psunsigned_t)(bp ? bp->bps[0] : NULL), 0);
+}
+
+void
+xfs_dir2_trace_args_bb(
+	char		*where,
+	xfs_da_args_t	*args,
+	xfs_dabuf_t	*lbp,
+	xfs_dabuf_t	*dbp)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BB, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck,
+		(__psunsigned_t)(lbp ? lbp->bps[0] : NULL),
+		(__psunsigned_t)(dbp ? dbp->bps[0] : NULL));
+}
+
+void
+xfs_dir2_trace_args_bibii(
+	char		*where,
+	xfs_da_args_t	*args,
+	xfs_dabuf_t	*bs,
+	int		ss,
+	xfs_dabuf_t	*bd,
+	int		sd,
+	int		c)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BIBII, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)(bs ? bs->bps[0] : NULL), (__psunsigned_t)ss,
+		(__psunsigned_t)(bd ? bd->bps[0] : NULL), (__psunsigned_t)sd,
+		(__psunsigned_t)c);
+}
+
+void
+xfs_dir2_trace_args_db(
+	char		*where,
+	xfs_da_args_t	*args,
+	xfs_dir2_db_t	db,
+	xfs_dabuf_t	*bp)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_DB, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck, (__psunsigned_t)db,
+		(__psunsigned_t)(bp ? bp->bps[0] : NULL));
+}
+
+void
+xfs_dir2_trace_args_i(
+	char		*where,
+	xfs_da_args_t	*args,
+	xfs_ino_t	i)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_I, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck, (__psunsigned_t)i, 0);
+}
+
+void
+xfs_dir2_trace_args_s(
+	char		*where,
+	xfs_da_args_t	*args,
+	int		s)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_S, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck, (__psunsigned_t)s, 0);
+}
+
+void
+xfs_dir2_trace_args_sb(
+	char		*where,
+	xfs_da_args_t	*args,
+	int		s,
+	xfs_dabuf_t	*bp)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_SB, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck, (__psunsigned_t)s,
+		(__psunsigned_t)(bp ? bp->bps[0] : NULL));
+}
+#endif	/* XFS_DIR2_TRACE */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_trace.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_trace.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir2_trace.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir2_trace.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_TRACE_H__
+#define __XFS_DIR2_TRACE_H__
+
+/*
+ * Tracing for xfs v2 directories.
+ */
+
+struct ktrace;
+struct xfs_dabuf;
+struct xfs_da_args;
+
+#ifdef XFS_ALL_TRACE
+#define XFS_DIR2_TRACE
+#endif	/* XFS_ALL_TRACE */
+
+#if !defined(DEBUG)
+#undef XFS_DIR2_TRACE
+#endif	/* !DEBUG */
+
+#define XFS_DIR2_GTRACE_SIZE		4096	/* global buffer */
+#define XFS_DIR2_KTRACE_SIZE		32	/* per-inode buffer */
+
+#define XFS_DIR2_KTRACE_ARGS		1	/* args only */
+#define XFS_DIR2_KTRACE_ARGS_B		2	/* args + buffer */
+#define XFS_DIR2_KTRACE_ARGS_BB		3	/* args + 2 buffers */
+#define XFS_DIR2_KTRACE_ARGS_DB		4	/* args, db, buffer */
+#define XFS_DIR2_KTRACE_ARGS_I		5	/* args, inum */
+#define XFS_DIR2_KTRACE_ARGS_S		6	/* args, int */
+#define XFS_DIR2_KTRACE_ARGS_SB		7	/* args, int, buffer */
+#define XFS_DIR2_KTRACE_ARGS_BIBII	8	/* args, buf/int/buf/int/int */
+
+#ifdef XFS_DIR2_TRACE
+
+void xfs_dir2_trace_args(char *where, struct xfs_da_args *args);
+void xfs_dir2_trace_args_b(char *where, struct xfs_da_args *args,
+			   struct xfs_dabuf *bp);
+void xfs_dir2_trace_args_bb(char *where, struct xfs_da_args *args,
+			    struct xfs_dabuf *lbp, struct xfs_dabuf *dbp);
+void xfs_dir2_trace_args_bibii(char *where, struct xfs_da_args *args,
+			       struct xfs_dabuf *bs, int ss,
+			       struct xfs_dabuf *bd, int sd, int c);
+void xfs_dir2_trace_args_db(char *where, struct xfs_da_args *args,
+			    xfs_dir2_db_t db, struct xfs_dabuf *bp);
+void xfs_dir2_trace_args_i(char *where, struct xfs_da_args *args, xfs_ino_t i);
+void xfs_dir2_trace_args_s(char *where, struct xfs_da_args *args, int s);
+void xfs_dir2_trace_args_sb(char *where, struct xfs_da_args *args, int s,
+			    struct xfs_dabuf *bp);
+
+#else	/* XFS_DIR2_TRACE */
+
+#define xfs_dir2_trace_args(where, args)
+#define xfs_dir2_trace_args_b(where, args, bp)
+#define xfs_dir2_trace_args_bb(where, args, lbp, dbp)
+#define xfs_dir2_trace_args_bibii(where, args, bs, ss, bd, sd, c)
+#define xfs_dir2_trace_args_db(where, args, db, bp)
+#define xfs_dir2_trace_args_i(where, args, i)
+#define xfs_dir2_trace_args_s(where, args, s)
+#define xfs_dir2_trace_args_sb(where, args, s, bp)
+
+#endif	/* XFS_DIR2_TRACE */
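+
+/*
+ * Editorial caution: the no-trace stubs above expand to nothing at all,
+ * so the arguments of a trace call disappear from a non-DEBUG build.
+ * A hypothetical call like
+ *
+ *	xfs_dir2_trace_args_s("grow", args, grow_count());
+ *
+ * would silently skip grow_count() when XFS_DIR2_TRACE is undefined;
+ * trace arguments must be free of side effects.
+ */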
+
+extern struct ktrace *xfs_dir2_trace_buf;
+
+#endif	/* __XFS_DIR2_TRACE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir_leaf.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir_leaf.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir_leaf.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir_leaf.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,2224 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir_leaf.c
+ *
+ * GROT: figure out how to recover gracefully when bmap returns ENOSPC.
+ */
+
+#include <xfs.h>
+
+
+/*
+ * xfs_dir_leaf.c
+ *
+ * Routines to implement leaf blocks of directories as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC void xfs_dir_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
+					      int insertion_index,
+					      int freemap_index);
+STATIC int xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer,
+					    int musthave, int justcheck);
+STATIC void xfs_dir_leaf_rebalance(xfs_da_state_t *state,
+						  xfs_da_state_blk_t *blk1,
+						  xfs_da_state_blk_t *blk2);
+STATIC int xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
+					  xfs_da_state_blk_t *leaf_blk_1,
+					  xfs_da_state_blk_t *leaf_blk_2,
+					  int *number_entries_in_blk1,
+					  int *number_namebytes_in_blk1);
+
+/*
+ * Utility routines.
+ */
+STATIC void xfs_dir_leaf_moveents(xfs_dir_leafblock_t *src_leaf,
+					      int src_start,
+					      xfs_dir_leafblock_t *dst_leaf,
+					      int dst_start, int move_count,
+					      xfs_mount_t *mp);
+
+
+/*========================================================================
+ * External routines when dirsize < XFS_IFORK_DSIZE(dp).
+ *========================================================================*/
+
+
+/*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(xfs_mount_t *mp, xfs_ino_t ino)
+{
+	xfs_agblock_t	agblkno;
+	xfs_agino_t	agino;
+	xfs_agnumber_t	agno;
+	int		ino_ok;
+	int		ioff;
+
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agblkno = XFS_INO_TO_AGBNO(mp, ino);
+	ioff = XFS_INO_TO_OFFSET(mp, ino);
+	agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
+	ino_ok =
+		agno < mp->m_sb.sb_agcount &&
+		agblkno < mp->m_sb.sb_agblocks &&
+		agblkno != 0 &&
+		ioff < (1 << mp->m_sb.sb_inopblog) &&
+		XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+	if (XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
+			XFS_RANDOM_DIR_INO_VALIDATE)) {
+		xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx\n",
+				(unsigned long long) ino);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	return 0;
+}
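+
+/*
+ * Illustrative decomposition (editorial; the log values are made up):
+ * an inode number packs, from high bits to low,
+ *
+ *	agno | block-in-ag | offset-in-block
+ *
+ * With sb_inopblog = 6 (64 inodes per block) and sb_agblklog = 16,
+ * XFS_INO_TO_AGNO is ino >> 22, XFS_INO_TO_AGBNO is (ino >> 6) & 0xffff
+ * and XFS_INO_TO_OFFSET is ino & 0x3f.  Rebuilding the number with
+ * XFS_AGINO_TO_INO and comparing against the original catches values
+ * whose bits do not survive the split.
+ */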
+
+/*
+ * Create the initial contents of a shortform directory.
+ */
+int
+xfs_dir_shortform_create(xfs_da_args_t *args, xfs_ino_t parent)
+{
+	xfs_dir_sf_hdr_t *hdr;
+	xfs_inode_t *dp;
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	ASSERT(dp->i_d.di_size == 0);
+	if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
+		dp->i_df.if_flags &= ~XFS_IFEXTENTS;	/* just in case */
+		dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+		dp->i_df.if_flags |= XFS_IFINLINE;
+	}
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	ASSERT(dp->i_df.if_bytes == 0);
+	xfs_idata_realloc(dp, sizeof(*hdr), XFS_DATA_FORK);
+	hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	XFS_DIR_SF_PUT_DIRINO_ARCH(&parent, &hdr->parent, ARCH_CONVERT);
+
+	INT_ZERO(hdr->count, ARCH_CONVERT);
+	dp->i_d.di_size = sizeof(*hdr);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return(0);
+}
+
+/*
+ * Add a name to the shortform directory structure.
+ * Overflow from the inode has already been checked for.
+ */
+int
+xfs_dir_shortform_addname(xfs_da_args_t *args)
+{
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	int i, offset, size;
+	xfs_inode_t *dp;
+
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Catch the case where the conversion from shortform to leaf
+	 * failed part way through.
+	 */
+	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
+		if (sfe->namelen == args->namelen &&
+		    args->name[0] == sfe->name[0] &&
+		    bcmp(args->name, sfe->name, args->namelen) == 0)
+			return(XFS_ERROR(EEXIST));
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+
+	offset = (int)((char *)sfe - (char *)sf);
+	size = XFS_DIR_SF_ENTSIZE_BYNAME(args->namelen);
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	sfe = (xfs_dir_sf_entry_t *)((char *)sf + offset);
+
+	XFS_DIR_SF_PUT_DIRINO_ARCH(&args->inumber, &sfe->inumber, ARCH_CONVERT);
+	sfe->namelen = args->namelen;
+	bcopy(args->name, sfe->name, sfe->namelen);
+	INT_MOD(sf->hdr.count, ARCH_CONVERT, +1);
+
+	dp->i_d.di_size += size;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+
+	return(0);
+}
+
+/*
+ * Remove a name from the shortform directory structure.
+ */
+int
+xfs_dir_shortform_removename(xfs_da_args_t *args)
+{
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	int base, size = 0, i;
+	xfs_inode_t *dp;
+
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Catch the case where the conversion from shortform to leaf
+	 * failed part way through.
+	 */
+	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	base = sizeof(xfs_dir_sf_hdr_t);
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
+		size = XFS_DIR_SF_ENTSIZE_BYENTRY(sfe);
+		if (sfe->namelen == args->namelen &&
+		    sfe->name[0] == args->name[0] &&
+		    bcmp(sfe->name, args->name, args->namelen) == 0)
+			break;
+		base += size;
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+	if (i < 0) {
+		ASSERT(args->oknoent);
+		return(XFS_ERROR(ENOENT));
+	}
+
+	if ((base + size) != dp->i_d.di_size) {
+		ovbcopy(&((char *)sf)[base+size], &((char *)sf)[base],
+					      dp->i_d.di_size - (base+size));
+	}
+	INT_MOD(sf->hdr.count, ARCH_CONVERT, -1);
+
+	xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
+	dp->i_d.di_size -= size;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+
+	return(0);
+}
+
+/*
+ * Look up a name in a shortform directory structure.
+ */
+int
+xfs_dir_shortform_lookup(xfs_da_args_t *args)
+{
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	int i;
+	xfs_inode_t *dp;
+
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Catch the case where the conversion from shortform to leaf
+	 * failed part way through.
+	 */
+	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+		XFS_DIR_SF_GET_DIRINO_ARCH(&sf->hdr.parent, &args->inumber, ARCH_CONVERT);
+		return(XFS_ERROR(EEXIST));
+	}
+	if (args->namelen == 1 && args->name[0] == '.') {
+		args->inumber = dp->i_ino;
+		return(XFS_ERROR(EEXIST));
+	}
+	sfe = &sf->list[0];
+	for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
+		if (sfe->namelen == args->namelen &&
+		    sfe->name[0] == args->name[0] &&
+		    bcmp(args->name, sfe->name, args->namelen) == 0) {
+			XFS_DIR_SF_GET_DIRINO_ARCH(&sfe->inumber, &args->inumber, ARCH_CONVERT);
+			return(XFS_ERROR(EEXIST));
+		}
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+	ASSERT(args->oknoent);
+	return(XFS_ERROR(ENOENT));
+}
+
+/*
+ * Convert from using the shortform to the leaf.
+ */
+int
+xfs_dir_shortform_to_leaf(xfs_da_args_t *iargs)
+{
+	xfs_inode_t *dp;
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	xfs_da_args_t args;
+	xfs_ino_t inumber;
+	char *tmpbuffer;
+	int retval, i, size;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+
+	dp = iargs->dp;
+	/*
+	 * Catch the case where the conversion from shortform to leaf
+	 * failed part way through.
+	 */
+	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	size = dp->i_df.if_bytes;
+	tmpbuffer = kmem_alloc(size, KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+
+	bcopy(dp->i_df.if_u1.if_data, tmpbuffer, size);
+
+	sf = (xfs_dir_shortform_t *)tmpbuffer;
+	XFS_DIR_SF_GET_DIRINO_ARCH(&sf->hdr.parent, &inumber, ARCH_CONVERT);
+
+	xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
+	dp->i_d.di_size = 0;
+	xfs_trans_log_inode(iargs->trans, dp, XFS_ILOG_CORE);
+	retval = xfs_da_grow_inode(iargs, &blkno);
+	if (retval)
+		goto out;
+
+	ASSERT(blkno == 0);
+	retval = xfs_dir_leaf_create(iargs, blkno, &bp);
+	if (retval)
+		goto out;
+	xfs_da_buf_done(bp);
+
+	args.name = ".";
+	args.namelen = 1;
+	args.hashval = xfs_dir_hash_dot;
+	args.inumber = dp->i_ino;
+	args.dp = dp;
+	args.firstblock = iargs->firstblock;
+	args.flist = iargs->flist;
+	args.total = iargs->total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = iargs->trans;
+	args.justcheck = 0;
+	args.addname = args.oknoent = 1;
+	retval = xfs_dir_leaf_addname(&args);
+	if (retval)
+		goto out;
+
+	args.name = "..";
+	args.namelen = 2;
+	args.hashval = xfs_dir_hash_dotdot;
+	args.inumber = inumber;
+	retval = xfs_dir_leaf_addname(&args);
+	if (retval)
+		goto out;
+
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
+		args.name = (char *)(sfe->name);
+		args.namelen = sfe->namelen;
+		args.hashval = xfs_da_hashname((char *)(sfe->name),
+					       sfe->namelen);
+		XFS_DIR_SF_GET_DIRINO_ARCH(&sfe->inumber, &args.inumber, ARCH_CONVERT);
+		retval = xfs_dir_leaf_addname(&args);
+		if (retval)
+			goto out;
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+	retval = 0;
+
+out:
+	kmem_free(tmpbuffer, size);
+	return(retval);
+}
+
+STATIC int
+xfs_dir_shortform_compare(const void *a, const void *b)
+{
+	xfs_dir_sf_sort_t *sa, *sb;
+
+	sa = (xfs_dir_sf_sort_t *)a;
+	sb = (xfs_dir_sf_sort_t *)b;
+	if (sa->hash < sb->hash)
+		return -1;
+	else if (sa->hash > sb->hash)
+		return 1;
+	else
+		return sa->entno - sb->entno;
+}
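+
+/*
+ * Editorial note: qsort() is not a stable sort, so the comparator
+ * breaks hash ties on entno, giving equal-hash entries a repeatable
+ * order for the seqno numbering done after the sort.
+ */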
+
+/*
+ * Copy out directory entries for getdents(), for shortform directories.
+ */
+/*ARGSUSED*/
+int
+xfs_dir_shortform_getdents(xfs_inode_t *dp, uio_t *uio, int *eofp,
+				       xfs_dirent_t *dbp, xfs_dir_put_t put)
+{
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	int retval, i, sbsize, nsbuf, lastresid=0, want_entno;
+	xfs_mount_t *mp;
+	xfs_dahash_t cookhash, hash;
+	xfs_dir_put_args_t p;
+	xfs_dir_sf_sort_t *sbuf, *sbp;
+
+	mp = dp->i_mount;
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
+	want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
+	nsbuf = INT_GET(sf->hdr.count, ARCH_CONVERT) + 2;
+	sbsize = (nsbuf + 1) * sizeof(*sbuf);
+	sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
+
+	xfs_dir_trace_g_du("sf: start", dp, uio);
+
+	/*
+	 * Collect all the entries into the buffer.
+	 * Entry 0 is .
+	 */
+	sbp->entno = 0;
+	sbp->seqno = 0;
+	sbp->hash = xfs_dir_hash_dot;
+	sbp->ino = dp->i_ino;
+	sbp->name = ".";
+	sbp->namelen = 1;
+	sbp++;
+
+	/*
+	 * Entry 1 is ..
+	 */
+	sbp->entno = 1;
+	sbp->seqno = 0;
+	sbp->hash = xfs_dir_hash_dotdot;
+	sbp->ino = XFS_GET_DIR_INO_ARCH(mp, sf->hdr.parent, ARCH_CONVERT);
+	sbp->name = "..";
+	sbp->namelen = 2;
+	sbp++;
+
+	/*
+	 * Scan the directory data for the rest of the entries.
+	 */
+	for (i = 0, sfe = &sf->list[0];
+			i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
+
+		if (((char *)sfe < (char *)sf) ||
+		    ((char *)sfe >= ((char *)sf + dp->i_df.if_bytes)) ||
+		    (sfe->namelen >= MAXNAMELEN)) {
+			xfs_dir_trace_g_du("sf: corrupted", dp, uio);
+			kmem_free(sbuf, sbsize);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+
+		sbp->entno = i + 2;
+		sbp->seqno = 0;
+		sbp->hash = xfs_da_hashname((char *)sfe->name, sfe->namelen);
+		sbp->ino = XFS_GET_DIR_INO_ARCH(mp, sfe->inumber, ARCH_CONVERT);
+		sbp->name = (char *)sfe->name;
+		sbp->namelen = sfe->namelen;
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+		sbp++;
+	}
+
+	/*
+	 * Sort the entries on hash then entno.
+	 */
+	qsort(sbuf, nsbuf, sizeof(*sbuf), xfs_dir_shortform_compare);
+	/*
+	 * Stuff in last entry.
+	 */
+	sbp->entno = nsbuf;
+	sbp->hash = XFS_DA_MAXHASH;
+	sbp->seqno = 0;
+	/*
+	 * Figure out the sequence numbers in case there's a hash duplicate.
+	 */
+	for (hash = sbuf->hash, sbp = sbuf + 1;
+				sbp < &sbuf[nsbuf + 1]; sbp++) {
+		if (sbp->hash == hash)
+			sbp->seqno = sbp[-1].seqno + 1;
+		else
+			hash = sbp->hash;
+	}
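+
+	/*
+	 * Illustrative: sorted hashes 100, 200, 200, 200, 300 come out
+	 * with seqnos 0, 0, 1, 2, 0, so entries sharing a hash value
+	 * remain distinguishable in the readdir cookie.
+	 */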
+
+	/*
+	 * Set up put routine.
+	 */
+	p.dbp = dbp;
+	p.put = put;
+	p.uio = uio;
+
+	/*
+	 * Find our place.
+	 */
+	for (sbp = sbuf; sbp < &sbuf[nsbuf + 1]; sbp++) {
+		if (sbp->hash > cookhash ||
+		    (sbp->hash == cookhash && sbp->seqno >= want_entno))
+			break;
+	}
+
+	/*
+	 * Did we fail to find anything?  We stop at the last entry,
+	 * the one we put maxhash into.
+	 */
+	if (sbp == &sbuf[nsbuf]) {
+		kmem_free(sbuf, sbsize);
+		xfs_dir_trace_g_du("sf: hash beyond end", dp, uio);
+		uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
+		*eofp = 1;
+		return 0;
+	}
+
+	/*
+	 * Loop putting entries into the user buffer.
+	 */
+	while (sbp < &sbuf[nsbuf]) {
+		/*
+		 * Save the first resid in a run of equal-hashval entries
+		 * so that we can back them out if they don't all fit.
+		 */
+		if (sbp->seqno == 0 || sbp == sbuf)
+			lastresid = uio->uio_resid;
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		XFS_PUT_COOKIE(p.cook, mp, 0, sbp->seqno, sbp->hash);
+
+#if XFS_BIG_FILESYSTEMS
+		p.ino = sbp->ino + mp->m_inoadd;
+#else
+		p.ino = sbp->ino;
+#endif
+		p.name = sbp->name;
+		p.namelen = sbp->namelen;
+
+		retval = p.put(&p);
+
+		if (!p.done) {
+			uio->uio_offset =
+				XFS_DA_MAKE_COOKIE(mp, 0, 0, sbp->hash);
+			kmem_free(sbuf, sbsize);
+			uio->uio_resid = lastresid;
+			xfs_dir_trace_g_du("sf: E-O-B", dp, uio);
+			return retval;
+		}
+
+		sbp++;
+	}
+
+	kmem_free(sbuf, sbsize);
+
+	XFS_PUT_COOKIE(p.cook, mp, 0, 0, XFS_DA_MAXHASH);
+
+	uio->uio_offset = p.cook.o;
+
+	*eofp = 1;
+
+	xfs_dir_trace_g_du("sf: E-O-F", dp, uio);
+
+	return 0;
+}
+
+/*
+ * Look up a name in a shortform directory structure and replace the
+ * inode number.
+ */
+int
+xfs_dir_shortform_replace(xfs_da_args_t *args)
+{
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	xfs_inode_t *dp;
+	int i;
+
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Catch the case where the conversion from shortform to leaf
+	 * failed part way through.
+	 */
+	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+		/* XXX - replace assert? */
+		XFS_DIR_SF_PUT_DIRINO_ARCH(&args->inumber, &sf->hdr.parent, ARCH_CONVERT);
+		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+		return(0);
+	}
+	ASSERT(args->namelen != 1 || args->name[0] != '.');
+	sfe = &sf->list[0];
+	for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
+		if (sfe->namelen == args->namelen &&
+		    sfe->name[0] == args->name[0] &&
+		    bcmp(args->name, sfe->name, args->namelen) == 0) {
+			ASSERT(bcmp((char *)&args->inumber,
+				(char *)&sfe->inumber, sizeof(xfs_ino_t)));
+			XFS_DIR_SF_PUT_DIRINO_ARCH(&args->inumber, &sfe->inumber, ARCH_CONVERT);
+			xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+			return(0);
+		}
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+	ASSERT(args->oknoent);
+	return(XFS_ERROR(ENOENT));
+}
+
+/*
+ * Convert a leaf directory to shortform structure
+ */
+int
+xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_hdr_t *hdr;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+	xfs_da_args_t args;
+	xfs_inode_t *dp;
+	xfs_ino_t parent;
+	char *tmpbuffer;
+	int retval, i;
+	xfs_dabuf_t *bp;
+
+	dp = iargs->dp;
+	tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+
+	retval = xfs_da_read_buf(iargs->trans, iargs->dp, 0, -1, &bp,
+					       XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	bcopy(bp->data, tmpbuffer, XFS_LBSIZE(dp->i_mount));
+	leaf = (xfs_dir_leafblock_t *)tmpbuffer;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	bzero(bp->data, XFS_LBSIZE(dp->i_mount));
+
+	/*
+	 * Find and special-case the parent inode number.
+	 */
+	hdr = &leaf->hdr;
+	entry = &leaf->entries[0];
+	for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		if ((entry->namelen == 2) &&
+		    (namest->name[0] == '.') &&
+		    (namest->name[1] == '.')) {
+			XFS_DIR_SF_GET_DIRINO_ARCH(&namest->inumber, &parent, ARCH_CONVERT);
+			INT_ZERO(entry->nameidx, ARCH_CONVERT);
+		} else if ((entry->namelen == 1) && (namest->name[0] == '.')) {
+			INT_ZERO(entry->nameidx, ARCH_CONVERT);
+		}
+	}
+	retval = xfs_da_shrink_inode(iargs, 0, bp);
+	if (retval)
+		goto out;
+	retval = xfs_dir_shortform_create(iargs, parent);
+	if (retval)
+		goto out;
+
+	/*
+	 * Copy the rest of the filenames
+	 */
+	entry = &leaf->entries[0];
+	args.dp = dp;
+	args.firstblock = iargs->firstblock;
+	args.flist = iargs->flist;
+	args.total = iargs->total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = iargs->trans;
+	args.justcheck = 0;
+	args.addname = args.oknoent = 1;
+	for (i = 0; i < INT_GET(hdr->count, ARCH_CONVERT); entry++, i++) {
+		if (INT_ISZERO(entry->nameidx, ARCH_CONVERT))
+			continue;
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		args.name = (char *)(namest->name);
+		args.namelen = entry->namelen;
+		args.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
+		XFS_DIR_SF_GET_DIRINO_ARCH(&namest->inumber, &args.inumber, ARCH_CONVERT);
+		xfs_dir_shortform_addname(&args);
+	}
+
+out:
+	kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
+	return(retval);
+}
+
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ */
+int
+xfs_dir_leaf_to_node(xfs_da_args_t *args)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_da_intnode_t *node;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp1, *bp2;
+	xfs_dablk_t blkno;
+	int retval;
+
+	dp = args->dp;
+	retval = xfs_da_grow_inode(args, &blkno);
+	ASSERT(blkno == 1);
+	if (retval)
+		return(retval);
+	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
+					      XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp1 != NULL);
+	retval = xfs_da_get_buf(args->trans, args->dp, 1, -1, &bp2,
+					     XFS_DATA_FORK);
+	if (retval) {
+		xfs_da_buf_done(bp1);
+		return(retval);
+	}
+	ASSERT(bp2 != NULL);
+	bcopy(bp1->data, bp2->data, XFS_LBSIZE(dp->i_mount));
+	xfs_da_buf_done(bp1);
+	xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+	/*
+	 * Set up the new root node.
+	 */
+	retval = xfs_da_node_create(args, 0, 1, &bp1, XFS_DATA_FORK);
+	if (retval) {
+		xfs_da_buf_done(bp2);
+		return(retval);
+	}
+	node = bp1->data;
+	leaf = bp2->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	INT_SET(node->btree[0].hashval, ARCH_CONVERT, INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
+	xfs_da_buf_done(bp2);
+	INT_SET(node->btree[0].before, ARCH_CONVERT, blkno);
+	INT_SET(node->hdr.count, ARCH_CONVERT, 1);
+	xfs_da_log_buf(args->trans, bp1,
+		XFS_DA_LOGRANGE(node, &node->btree[0], sizeof(node->btree[0])));
+	xfs_da_buf_done(bp1);
+
+	return(retval);
+}
+
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf directory
+ * or a leaf in a node directory.
+ */
+int
+xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_hdr_t *hdr;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int retval;
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	retval = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp, XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	bzero((char *)leaf, XFS_LBSIZE(dp->i_mount));
+	hdr = &leaf->hdr;
+	INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_DIR_LEAF_MAGIC);
+	INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
+	if (INT_ISZERO(hdr->firstused, ARCH_CONVERT))
+		INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount) - 1);
+	INT_SET(hdr->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
+	INT_SET(hdr->freemap[0].size, ARCH_CONVERT, INT_GET(hdr->firstused, ARCH_CONVERT) - INT_GET(hdr->freemap[0].base, ARCH_CONVERT));
+
+	xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+	*bpp = bp;
+	return(0);
+}
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ */
+int
+xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+				  xfs_da_state_blk_t *newblk)
+{
+	xfs_dablk_t blkno;
+	xfs_da_args_t *args;
+	int error;
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	ASSERT(oldblk->magic == XFS_DIR_LEAF_MAGIC);
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error)
+		return(error);
+	error = xfs_dir_leaf_create(args, blkno, &newblk->bp);
+	if (error)
+		return(error);
+	newblk->blkno = blkno;
+	newblk->magic = XFS_DIR_LEAF_MAGIC;
+
+	/*
+	 * Rebalance the entries across the two leaves.
+	 */
+	xfs_dir_leaf_rebalance(state, oldblk, newblk);
+	error = xfs_da_blk_link(state, oldblk, newblk);
+	if (error)
+		return(error);
+
+	/*
+	 * Insert the new entry in the correct block.
+	 */
+	if (state->inleaf) {
+		error = xfs_dir_leaf_add(oldblk->bp, args, oldblk->index);
+	} else {
+		error = xfs_dir_leaf_add(newblk->bp, args, newblk->index);
+	}
+
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_dir_leaf_lasthash(oldblk->bp, NULL);
+	newblk->hashval = xfs_dir_leaf_lasthash(newblk->bp, NULL);
+	return(error);
+}
+
+/*
+ * Add a name to the leaf directory structure.
+ *
+ * Must take into account fragmented leaves and leaves where the freemap
+ * has lost some freespace information (i.e., holes).
+ */
+int
+xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_hdr_t *hdr;
+	xfs_dir_leaf_map_t *map;
+	int tablesize, entsize, sum, i, tmp, error;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	ASSERT((index >= 0) && (index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
+	hdr = &leaf->hdr;
+	entsize = XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen);
+
+	/*
+	 * Search through freemap for first-fit on new name length.
+	 * (may need to figure in size of entry struct too)
+	 */
+	tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1) * (uint)sizeof(xfs_dir_leaf_entry_t)
+			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
+	map = &hdr->freemap[XFS_DIR_LEAF_MAPSIZE-1];
+	for (sum = 0, i = XFS_DIR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
+		if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
+			sum += INT_GET(map->size, ARCH_CONVERT);
+			continue;
+		}
+		if (INT_ISZERO(map->size, ARCH_CONVERT))
+			continue;	/* no space in this map */
+		tmp = entsize;
+		if (INT_GET(map->base, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
+			tmp += (uint)sizeof(xfs_dir_leaf_entry_t);
+		if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
+			if (!args->justcheck)
+				xfs_dir_leaf_add_work(bp, args, index, i);
+			return(0);
+		}
+		sum += INT_GET(map->size, ARCH_CONVERT);
+	}
+
+	/*
+	 * If there are no holes in the address space of the block,
+	 * and we don't have enough freespace, then compaction will do us
+	 * no good and we should just give up.
+	 */
+	if (!hdr->holes && (sum < entsize))
+		return(XFS_ERROR(ENOSPC));
+
+	/*
+	 * Compact the entries to coalesce free space.
+	 * Pass the justcheck flag so the checking pass can return
+	 * an error, without changing anything, if it won't fit.
+	 */
+	error = xfs_dir_leaf_compact(args->trans, bp,
+			args->total == 0 ?
+				entsize +
+				(uint)sizeof(xfs_dir_leaf_entry_t) : 0,
+			args->justcheck);
+	if (error)
+		return(error);
+	/*
+	 * After compaction, the block is guaranteed to have only one
+	 * free region, in freemap[0].	If it is not big enough, give up.
+	 */
+	if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) <
+	    (entsize + (uint)sizeof(xfs_dir_leaf_entry_t)))
+		return(XFS_ERROR(ENOSPC));
+
+	if (!args->justcheck)
+		xfs_dir_leaf_add_work(bp, args, index, 0);
+	return(0);
+}
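+
+/*
+ * Editorial summary: the loop above is a first-fit search over the
+ * freemap.  When no region fits, the block is compacted so that all
+ * free space lands in freemap[0] and the add is retried there; but if
+ * the block has no holes and the summed free space is already smaller
+ * than the entry, ENOSPC is returned at once and the caller ends up
+ * splitting the leaf.
+ */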
+
+/*
+ * Add a name to a leaf directory structure.
+ */
+STATIC void
+xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
+		      int mapindex)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_hdr_t *hdr;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+	xfs_dir_leaf_map_t *map;
+	/* REFERENCED */
+	xfs_mount_t *mp;
+	int tmp, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	hdr = &leaf->hdr;
+	ASSERT((mapindex >= 0) && (mapindex < XFS_DIR_LEAF_MAPSIZE));
+	ASSERT((index >= 0) && (index <= INT_GET(hdr->count, ARCH_CONVERT)));
+
+	/*
+	 * Force open some space in the entry array and fill it in.
+	 */
+	entry = &leaf->entries[index];
+	if (index < INT_GET(hdr->count, ARCH_CONVERT)) {
+		tmp  = INT_GET(hdr->count, ARCH_CONVERT) - index;
+		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
+		ovbcopy(entry, entry + 1, tmp);
+		xfs_da_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
+	}
+	INT_MOD(hdr->count, ARCH_CONVERT, +1);
+
+	/*
+	 * Allocate space for the new string (at the end of the run).
+	 */
+	map = &hdr->freemap[mapindex];
+	mp = args->trans->t_mountp;
+	ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
+	ASSERT(INT_GET(map->size, ARCH_CONVERT) >= XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen));
+	ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
+	INT_MOD(map->size, ARCH_CONVERT, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)));
+	INT_SET(entry->nameidx, ARCH_CONVERT, INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT));
+	INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
+	entry->namelen = args->namelen;
+	xfs_da_log_buf(args->trans, bp,
+	    XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+
+	/*
+	 * Copy the string and inode number into the new space.
+	 */
+	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+	XFS_DIR_SF_PUT_DIRINO_ARCH(&args->inumber, &namest->inumber, ARCH_CONVERT);
+	bcopy(args->name, namest->name, args->namelen);
+	xfs_da_log_buf(args->trans, bp,
+	    XFS_DA_LOGRANGE(leaf, namest, XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)));
+
+	/*
+	 * Update the control info for this leaf node
+	 */
+	if (INT_GET(entry->nameidx, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
+		INT_COPY(hdr->firstused, entry->nameidx, ARCH_CONVERT);
+	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
+	tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1) * (uint)sizeof(xfs_dir_leaf_entry_t)
+			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
+	map = &hdr->freemap[0];
+	for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
+		if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
+			INT_MOD(map->base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
+			INT_MOD(map->size, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
+		}
+	}
+	INT_MOD(hdr->namebytes, ARCH_CONVERT, args->namelen);
+	xfs_da_log_buf(args->trans, bp,
+		XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+}
+
+/*
+ * Garbage collect a leaf directory block by copying it to a new buffer.
+ */
+STATIC int
+xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp, int musthave,
+		     int justcheck)
+{
+	xfs_dir_leafblock_t *leaf_s, *leaf_d;
+	xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
+	xfs_mount_t *mp;
+	char *tmpbuffer;
+	char *tmpbuffer2 = NULL;
+	int rval;
+	int lbsize;
+
+	mp = trans->t_mountp;
+	lbsize = XFS_LBSIZE(mp);
+	tmpbuffer = kmem_alloc(lbsize, KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+	bcopy(bp->data, tmpbuffer, lbsize);
+
+	/*
+	 * Make a second copy in case xfs_dir_leaf_moveents()
+	 * below destroys the original.
+	 */
+	if (musthave || justcheck) {
+		tmpbuffer2 = kmem_alloc(lbsize, KM_SLEEP);
+		bcopy(bp->data, tmpbuffer2, lbsize);
+	}
+	bzero(bp->data, lbsize);
+
+	/*
+	 * Copy basic information
+	 */
+	leaf_s = (xfs_dir_leafblock_t *)tmpbuffer;
+	leaf_d = bp->data;
+	hdr_s = &leaf_s->hdr;
+	hdr_d = &leaf_d->hdr;
+	hdr_d->info = hdr_s->info;	/* struct copy */
+	INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize);
+	if (INT_ISZERO(hdr_d->firstused, ARCH_CONVERT))
+		INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize - 1);
+	INT_ZERO(hdr_d->namebytes, ARCH_CONVERT);
+	INT_ZERO(hdr_d->count, ARCH_CONVERT);
+	hdr_d->holes = 0;
+	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
+	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+
+	/*
+	 * Copy all entries in the same (sorted) order,
+	 * but allocate filenames packed and in sequence.
+	 * This changes the source (leaf_s) as well.
+	 */
+	xfs_dir_leaf_moveents(leaf_s, 0, leaf_d, 0, (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
+
+	if (musthave && INT_GET(hdr_d->freemap[0].size, ARCH_CONVERT) < musthave)
+		rval = XFS_ERROR(ENOSPC);
+	else
+		rval = 0;
+
+	if (justcheck || rval == ENOSPC) {
+		ASSERT(tmpbuffer2);
+		bcopy(tmpbuffer2, bp->data, lbsize);
+	} else {
+		xfs_da_log_buf(trans, bp, 0, lbsize - 1);
+	}
+
+	kmem_free(tmpbuffer, lbsize);
+	if (musthave || justcheck)
+		kmem_free(tmpbuffer2, lbsize);
+	return(rval);
+}
+
+/*
+ * Redistribute the directory entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if new block is empty, then it will get the upper half of old block.
+ */
+STATIC void
+xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+				      xfs_da_state_blk_t *blk2)
+{
+	xfs_da_state_blk_t *tmp_blk;
+	xfs_dir_leafblock_t *leaf1, *leaf2;
+	xfs_dir_leaf_hdr_t *hdr1, *hdr2;
+	int count, totallen, max, space, swap;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(blk1->magic == XFS_DIR_LEAF_MAGIC);
+	ASSERT(blk2->magic == XFS_DIR_LEAF_MAGIC);
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+
+	/*
+	 * Check ordering of blocks, reverse if it makes things simpler.
+	 */
+	swap = 0;
+	if (xfs_dir_leaf_order(blk1->bp, blk2->bp)) {
+		tmp_blk = blk1;
+		blk1 = blk2;
+		blk2 = tmp_blk;
+		leaf1 = blk1->bp->data;
+		leaf2 = blk2->bp->data;
+		swap = 1;
+	}
+	hdr1 = &leaf1->hdr;
+	hdr2 = &leaf2->hdr;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.  Then get
+	 * the direction to copy and the number of elements to move.
+	 */
+	state->inleaf = xfs_dir_leaf_figure_balance(state, blk1, blk2,
+							   &count, &totallen);
+	if (swap)
+		state->inleaf = !state->inleaf;
+
+	/*
+	 * Move any entries required from leaf to leaf:
+	 */
+	if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		count = INT_GET(hdr1->count, ARCH_CONVERT) - count;	/* number entries being moved */
+		space  = INT_GET(hdr1->namebytes, ARCH_CONVERT) - totallen;
+		space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
+		space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
+
+		/*
+		 * leaf2 is the destination, compact it if it looks tight.
+		 */
+		max  = INT_GET(hdr2->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
+		max -= INT_GET(hdr2->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
+		if (space > max) {
+			xfs_dir_leaf_compact(state->args->trans, blk2->bp,
+								 0, 0);
+		}
+
+		/*
+		 * Move high entries from leaf1 to low end of leaf2.
+		 */
+		xfs_dir_leaf_moveents(leaf1, INT_GET(hdr1->count, ARCH_CONVERT) - count,
+					     leaf2, 0, count, state->mp);
+
+		xfs_da_log_buf(state->args->trans, blk1->bp, 0,
+						   state->blocksize-1);
+		xfs_da_log_buf(state->args->trans, blk2->bp, 0,
+						   state->blocksize-1);
+
+	} else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		count -= INT_GET(hdr1->count, ARCH_CONVERT);		/* number entries being moved */
+		space  = totallen - INT_GET(hdr1->namebytes, ARCH_CONVERT);
+		space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
+		space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
+
+		/*
+		 * leaf1 is the destination, compact it if it looks tight.
+		 */
+		max  = INT_GET(hdr1->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
+		max -= INT_GET(hdr1->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
+		if (space > max) {
+			xfs_dir_leaf_compact(state->args->trans, blk1->bp,
+								 0, 0);
+		}
+
+		/*
+		 * Move low entries from leaf2 to high end of leaf1.
+		 */
+		xfs_dir_leaf_moveents(leaf2, 0, leaf1, (int)INT_GET(hdr1->count, ARCH_CONVERT),
+					     count, state->mp);
+
+		xfs_da_log_buf(state->args->trans, blk1->bp, 0,
+						   state->blocksize-1);
+		xfs_da_log_buf(state->args->trans, blk2->bp, 0,
+						   state->blocksize-1);
+	}
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	blk1->hashval = INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+	blk2->hashval = INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+
+	/*
+	 * Adjust the expected index for insertion.
+	 * GROT: this doesn't work unless blk2 was originally empty.
+	 */
+	if (!state->inleaf) {
+		blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+	}
+}
+
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ * GROT: Is this really necessary?  With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ */
+STATIC int
+xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
+					   xfs_da_state_blk_t *blk1,
+					   xfs_da_state_blk_t *blk2,
+					   int *countarg, int *namebytesarg)
+{
+	xfs_dir_leafblock_t *leaf1, *leaf2;
+	xfs_dir_leaf_hdr_t *hdr1, *hdr2;
+	xfs_dir_leaf_entry_t *entry;
+	int count, max, totallen, half;
+	int lastdelta, foundit, tmp;
+
+	/*
+	 * Set up environment.
+	 */
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	hdr1 = &leaf1->hdr;
+	hdr2 = &leaf2->hdr;
+	foundit = 0;
+	totallen = 0;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.
+	 */
+	max = INT_GET(hdr1->count, ARCH_CONVERT) + INT_GET(hdr2->count, ARCH_CONVERT);
+	half  = (max+1) * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_name_t)-1);
+	half += INT_GET(hdr1->namebytes, ARCH_CONVERT) + INT_GET(hdr2->namebytes, ARCH_CONVERT) + state->args->namelen;
+	half /= 2;
+	lastdelta = state->blocksize;
+	entry = &leaf1->entries[0];
+	for (count = 0; count < max; entry++, count++) {
+
+#define XFS_DIR_ABS(A)	(((A) < 0) ? -(A) : (A))
+		/*
+		 * The new entry is in the first block, account for it.
+		 */
+		if (count == blk1->index) {
+			tmp = totallen + (uint)sizeof(*entry)
+				+ XFS_DIR_LEAF_ENTSIZE_BYNAME(state->args->namelen);
+			if (XFS_DIR_ABS(half - tmp) > lastdelta)
+				break;
+			lastdelta = XFS_DIR_ABS(half - tmp);
+			totallen = tmp;
+			foundit = 1;
+		}
+
+		/*
+		 * Wrap around into the second block if necessary.
+		 */
+		if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
+			leaf1 = leaf2;
+			entry = &leaf1->entries[0];
+		}
+
+		/*
+		 * Figure out if next leaf entry would be too much.
+		 */
+		tmp = totallen + (uint)sizeof(*entry)
+				+ XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
+		if (XFS_DIR_ABS(half - tmp) > lastdelta)
+			break;
+		lastdelta = XFS_DIR_ABS(half - tmp);
+		totallen = tmp;
+#undef XFS_DIR_ABS
+	}
+
+	/*
+	 * Calculate the number of namebytes that will end up in lower block.
+	 * If new entry not in lower block, fix up the count.
+	 */
+	totallen -=
+		count * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_name_t)-1);
+	if (foundit) {
+		totallen -= (sizeof(*entry)+sizeof(xfs_dir_leaf_name_t)-1) +
+			    state->args->namelen;
+	}
+
+	*countarg = count;
+	*namebytesarg = totallen;
+	return(foundit);
+}
+
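The greedy criterion above is easier to see in isolation. A self-contained toy version follows (hypothetical names; the real code also accounts for the header, the entry structs, and the pending new entry):

#include <stdio.h>
#include <stdlib.h>

/*
 * Toy model of the figure_balance walk: sizes[] holds per-entry byte
 * costs; return how many entries belong in the first (lower) block.
 */
static int
toy_figure_balance(const int *sizes, int n)
{
	int total = 0, half, run = 0, count, lastdelta;

	for (count = 0; count < n; count++)
		total += sizes[count];
	half = total / 2;

	lastdelta = total;		/* worst possible delta */
	for (count = 0; count < n; count++) {
		int tmp = run + sizes[count];

		if (abs(half - tmp) > lastdelta)
			break;		/* next entry overshoots the midpoint */
		lastdelta = abs(half - tmp);
		run = tmp;
	}
	return count;
}

int
main(void)
{
	int sizes[6] = { 40, 12, 33, 25, 18, 52 };

	printf("%d entries in lower block\n", toy_figure_balance(sizes, 6));
	return 0;
}

The walk accepts entries while each one moves the running total closer to the halfway byte count and stops at the first entry that would move it further away, which is exactly the |half - tmp| test against lastdelta above.
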
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+int
+xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *info;
+	int count, bytes, forward, error, retval, i;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	info = blk->bp->data;
+	ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	leaf = (xfs_dir_leafblock_t *)info;
+	count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	bytes = (uint)sizeof(xfs_dir_leaf_hdr_t) +
+		count * (uint)sizeof(xfs_dir_leaf_entry_t) +
+		count * ((uint)sizeof(xfs_dir_leaf_name_t)-1) +
+		INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+	if (bytes > (state->blocksize >> 1)) {
+		*action = 0;	/* blk over 50%, don't try to join */
+		return(0);
+	}
+
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = !INT_ISZERO(info->forw, ARCH_CONVERT);
+		bcopy(&state->path, &state->altpath, sizeof(state->path));
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error)
+			return(error);
+		if (retval) {
+			*action = 0;
+		} else {
+			*action = 2;
+		}
+		return(0);
+	}
+
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink a directory over time.
+	 */
+	forward = (INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT));	/* start with smaller blk num */
+	for (i = 0; i < 2; forward = !forward, i++) {
+		if (forward)
+			blkno = INT_GET(info->forw, ARCH_CONVERT);
+		else
+			blkno = INT_GET(info->back, ARCH_CONVERT);
+		if (blkno == 0)
+			continue;
+		error = xfs_da_read_buf(state->args->trans, state->args->dp,
+							    blkno, -1, &bp,
+							    XFS_DATA_FORK);
+		if (error)
+			return(error);
+		ASSERT(bp != NULL);
+
+		leaf = (xfs_dir_leafblock_t *)info;
+		count  = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		bytes  = state->blocksize - (state->blocksize>>2);
+		bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+		leaf = bp->data;
+		ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+		count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+		bytes -= count * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
+		bytes -= count * (uint)sizeof(xfs_dir_leaf_entry_t);
+		bytes -= (uint)sizeof(xfs_dir_leaf_hdr_t);
+		if (bytes >= 0)
+			break;	/* fits with at least 25% to spare */
+
+		xfs_da_brelse(state->args->trans, bp);
+	}
+	if (i >= 2) {
+		*action = 0;
+		return(0);
+	}
+	xfs_da_buf_done(bp);
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	bcopy(&state->path, &state->altpath, sizeof(state->path));
+	if (blkno < blk->blkno) {
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+	} else {
+		error = xfs_da_path_shift(state, &state->path, forward,
+						 0, &retval);
+	}
+	if (error)
+		return(error);
+	if (retval) {
+		*action = 0;
+	} else {
+		*action = 1;
+	}
+	return(0);
+}
+
+/*
+ * Remove a name from the leaf directory structure.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are each 37% full, their combined contents fill about
+ * 75% of one block, leaving roughly 25% free.
+ */
+int
+xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_hdr_t *hdr;
+	xfs_dir_leaf_map_t *map;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+	int before, after, smallest, entsize;
+	int tablesize, tmp, i;
+	xfs_mount_t *mp;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	hdr = &leaf->hdr;
+	mp = trans->t_mountp;
+	ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0) && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
+	ASSERT((index >= 0) && (index < INT_GET(hdr->count, ARCH_CONVERT)));
+	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
+	entry = &leaf->entries[index];
+	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
+	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
+
+	/*
+	 * Scan through free region table:
+	 *    check for adjacency of free'd entry with an existing one,
+	 *    find smallest free region in case we need to replace it,
+	 *    adjust any map that borders the entry table,
+	 */
+	tablesize = INT_GET(hdr->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
+			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
+	map = &hdr->freemap[0];
+	tmp = INT_GET(map->size, ARCH_CONVERT);
+	before = after = -1;
+	smallest = XFS_DIR_LEAF_MAPSIZE - 1;
+	entsize = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
+	for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
+		ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
+		ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
+		if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
+			INT_MOD(map->base, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
+			INT_MOD(map->size, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
+		}
+
+		if ((INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT)) == INT_GET(entry->nameidx, ARCH_CONVERT)) {
+			before = i;
+		} else if (INT_GET(map->base, ARCH_CONVERT) == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
+			after = i;
+		} else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
+			tmp = INT_GET(map->size, ARCH_CONVERT);
+			smallest = i;
+		}
+	}
+
+	/*
+	 * Coalesce adjacent freemap regions,
+	 * or replace the smallest region.
+	 */
+	if ((before >= 0) || (after >= 0)) {
+		if ((before >= 0) && (after >= 0)) {
+			map = &hdr->freemap[before];
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+			INT_MOD(map->size, ARCH_CONVERT, INT_GET(hdr->freemap[after].size, ARCH_CONVERT));
+			INT_ZERO(hdr->freemap[after].base, ARCH_CONVERT);
+			INT_ZERO(hdr->freemap[after].size, ARCH_CONVERT);
+		} else if (before >= 0) {
+			map = &hdr->freemap[before];
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+		} else {
+			map = &hdr->freemap[after];
+			INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+		}
+	} else {
+		/*
+		 * Replace smallest region (if it is smaller than free'd entry)
+		 */
+		map = &hdr->freemap[smallest];
+		if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
+			INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
+			INT_SET(map->size, ARCH_CONVERT, entsize);
+		}
+	}
+
+	/*
+	 * Did we remove the first entry?
+	 */
+	if (INT_GET(entry->nameidx, ARCH_CONVERT) == INT_GET(hdr->firstused, ARCH_CONVERT))
+		smallest = 1;
+	else
+		smallest = 0;
+
+	/*
+	 * Compress the remaining entries and zero out the removed stuff.
+	 */
+	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+	bzero((char *)namest, entsize);
+	xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, namest, entsize));
+
+	INT_MOD(hdr->namebytes, ARCH_CONVERT, -(entry->namelen));
+	tmp = (INT_GET(hdr->count, ARCH_CONVERT) - index) * (uint)sizeof(xfs_dir_leaf_entry_t);
+	ovbcopy(entry + 1, entry, tmp);
+	INT_MOD(hdr->count, ARCH_CONVERT, -1);
+	xfs_da_log_buf(trans, bp,
+	    XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
+	entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
+	bzero((char *)entry, sizeof(xfs_dir_leaf_entry_t));
+
+	/*
+	 * If we removed the first entry, re-find the first used byte
+	 * in the name area.  Note that if the entry was the "firstused",
+	 * then we don't have a "hole" in our block resulting from
+	 * removing the name.
+	 */
+	if (smallest) {
+		tmp = XFS_LBSIZE(mp);
+		entry = &leaf->entries[0];
+		for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
+			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
+			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
+			if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
+				tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
+		}
+		INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
+		if (INT_ISZERO(hdr->firstused, ARCH_CONVERT))
+			INT_SET(hdr->firstused, ARCH_CONVERT, tmp - 1);
+	} else {
+		hdr->holes = 1;		/* mark as needing compaction */
+	}
+
+	xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+
+	/*
+	 * Check if leaf is less than 50% full, caller may want to
+	 * "join" the leaf with a sibling if so.
+	 */
+	tmp  = (uint)sizeof(xfs_dir_leaf_hdr_t);
+	tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
+	tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
+	tmp += INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+	if (tmp < mp->m_dir_magicpct)
+		return(1);			/* leaf is < 37% full */
+	return(0);
+}
+
+/*
+ * Move all the directory entries from drop_leaf into save_leaf.
+ */
+void
+xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+				      xfs_da_state_blk_t *save_blk)
+{
+	xfs_dir_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
+	xfs_dir_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
+	xfs_mount_t *mp;
+	char *tmpbuffer;
+
+	/*
+	 * Set up environment.
+	 */
+	mp = state->mp;
+	ASSERT(drop_blk->magic == XFS_DIR_LEAF_MAGIC);
+	ASSERT(save_blk->magic == XFS_DIR_LEAF_MAGIC);
+	drop_leaf = drop_blk->bp->data;
+	save_leaf = save_blk->bp->data;
+	ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	drop_hdr = &drop_leaf->hdr;
+	save_hdr = &save_leaf->hdr;
+
+	/*
+	 * Save last hashval from dying block for later Btree fixup.
+	 */
+	drop_blk->hashval = INT_GET(drop_leaf->entries[ INT_GET(drop_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+
+	/*
+	 * Check if we need a temp buffer, or can we do it in place.
+	 * Note that we don't check "leaf" for holes because we will
+	 * always be dropping it; toosmall() decided that for us already.
+	 */
+	if (save_hdr->holes == 0) {
+		/*
+		 * dest leaf has no holes, so we add there.  May need
+		 * to make some room in the entry array.
+		 */
+		if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
+			xfs_dir_leaf_moveents(drop_leaf, 0, save_leaf, 0,
+						 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+		} else {
+			xfs_dir_leaf_moveents(drop_leaf, 0,
+					      save_leaf, INT_GET(save_hdr->count, ARCH_CONVERT),
+					      (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+		}
+	} else {
+		/*
+		 * Destination has holes, so we make a temporary copy
+		 * of the leaf and add them both to that.
+		 */
+		tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
+		ASSERT(tmpbuffer != NULL);
+		bzero(tmpbuffer, state->blocksize);
+		tmp_leaf = (xfs_dir_leafblock_t *)tmpbuffer;
+		tmp_hdr = &tmp_leaf->hdr;
+		tmp_hdr->info = save_hdr->info; /* struct copy */
+		INT_ZERO(tmp_hdr->count, ARCH_CONVERT);
+		INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
+		if (INT_ISZERO(tmp_hdr->firstused, ARCH_CONVERT))
+			INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize - 1);
+		INT_ZERO(tmp_hdr->namebytes, ARCH_CONVERT);
+		if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
+			xfs_dir_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
+						 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+			xfs_dir_leaf_moveents(save_leaf, 0,
+					      tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
+					      (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
+		} else {
+			xfs_dir_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
+						 (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
+			xfs_dir_leaf_moveents(drop_leaf, 0,
+					      tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
+					      (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+		}
+		bcopy(tmp_leaf, save_leaf, state->blocksize);
+		kmem_free(tmpbuffer, state->blocksize);
+	}
+
+	xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
+					   state->blocksize - 1);
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	save_blk->hashval = INT_GET(save_leaf->entries[ INT_GET(save_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Look up a name in a leaf directory structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but only check within the
+ * current leaf node.  The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in *index the index into the entry[] array of either the found
+ * entry, or where the entry should have been (insert before that entry).
+ *
+ * Don't change the args->inumber unless we find the filename.
+ */
+int
+xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+	int probe, span;
+	xfs_dahash_t hashval;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) < (XFS_LBSIZE(args->dp->i_mount)/8));
+
+	/*
+	 * Binary search.  (note: small blocks will skip this loop)
+	 */
+	hashval = args->hashval;
+	probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
+	for (entry = &leaf->entries[probe]; span > 4;
+		   entry = &leaf->entries[probe]) {
+		span /= 2;
+		if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
+			probe += span;
+		else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
+			probe -= span;
+		else
+			break;
+	}
+	ASSERT((probe >= 0) && \
+	       ((INT_ISZERO(leaf->hdr.count, ARCH_CONVERT)) || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
+	ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) == hashval));
+
+	/*
+	 * Since we may have duplicate hashval's, find the first matching
+	 * hashval in the leaf.
+	 */
+	while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT) >= hashval)) {
+		entry--;
+		probe--;
+	}
+	while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
+		entry++;
+		probe++;
+	}
+	if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
+		*index = probe;
+		ASSERT(args->oknoent);
+		return(XFS_ERROR(ENOENT));
+	}
+
+	/*
+	 * Duplicate keys may be present, so search all of them for a match.
+	 */
+	while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval)) {
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		if (entry->namelen == args->namelen &&
+		    namest->name[0] == args->name[0] &&
+		    bcmp(args->name, namest->name, args->namelen) == 0) {
+			XFS_DIR_SF_GET_DIRINO_ARCH(&namest->inumber, &args->inumber, ARCH_CONVERT);
+			*index = probe;
+			return(XFS_ERROR(EEXIST));
+		}
+		entry++;
+		probe++;
+	}
+	*index = probe;
+	ASSERT(probe == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
+	return(XFS_ERROR(ENOENT));
+}
+
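Because hash collisions are allowed, the lookup cannot stop at an arbitrary matching slot; it has to rewind to the first entry carrying that hashval and then scan forward comparing names. A standalone sketch of the same probe/span pattern over plain integers (find_first is a hypothetical helper, not part of the patch):

#include <stdio.h>

/*
 * Toy version of the lookup pattern: binary-search a sorted array that
 * may contain duplicates, then rewind to the first occurrence, exactly
 * as the leaf lookup backs up over equal hashvals before comparing
 * names.  Returns the first index of key, or -1.
 */
static int
find_first(const int *a, int n, int key)
{
	int probe = n / 2, span = n / 2;

	while (span > 4) {		/* small arrays skip this loop */
		span /= 2;
		if (a[probe] < key)
			probe += span;
		else if (a[probe] > key)
			probe -= span;
		else
			break;
	}
	while (probe > 0 && a[probe] >= key)
		probe--;
	while (probe < n && a[probe] < key)
		probe++;
	return (probe < n && a[probe] == key) ? probe : -1;
}

int
main(void)
{
	int a[8] = { 1, 3, 3, 3, 7, 9, 9, 12 };

	printf("first 3 at index %d\n", find_first(a, 8, 3));
	return 0;
}
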
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Move the indicated entries from one leaf to another.
+ * NOTE: this routine modifies both source and destination leaves.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
+		      xfs_dir_leafblock_t *leaf_d, int start_d,
+		      int count, xfs_mount_t *mp)
+{
+	xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
+	xfs_dir_leaf_entry_t *entry_s, *entry_d;
+	int tmp, i;
+
+	/*
+	 * Check for nothing to do.
+	 */
+	if (count == 0)
+		return;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	hdr_s = &leaf_s->hdr;
+	hdr_d = &leaf_d->hdr;
+	ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0) && (INT_GET(hdr_s->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
+	ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
+		((INT_GET(hdr_s->count, ARCH_CONVERT)*sizeof(*entry_s))+sizeof(*hdr_s)));
+	ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
+	ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
+		((INT_GET(hdr_d->count, ARCH_CONVERT)*sizeof(*entry_d))+sizeof(*hdr_d)));
+
+	ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
+	ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
+	ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
+
+	/*
+	 * Move the entries in the destination leaf up to make a hole?
+	 */
+	if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
+		tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
+		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
+		entry_s = &leaf_d->entries[start_d];
+		entry_d = &leaf_d->entries[start_d + count];
+		bcopy(entry_s, entry_d, tmp);
+	}
+
+	/*
+	 * Copy all entries in the same (sorted) order,
+	 * but allocate filenames packed and in sequence.
+	 */
+	entry_s = &leaf_s->entries[start_s];
+	entry_d = &leaf_d->entries[start_d];
+	for (i = 0; i < count; entry_s++, entry_d++, i++) {
+		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) >= INT_GET(hdr_s->firstused, ARCH_CONVERT));
+		ASSERT(entry_s->namelen < MAXNAMELEN);
+		tmp = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry_s);
+		INT_MOD(hdr_d->firstused, ARCH_CONVERT, -(tmp));
+		entry_d->hashval = entry_s->hashval; /* INT_: direct copy */
+		INT_COPY(entry_d->nameidx, hdr_d->firstused, ARCH_CONVERT);
+		entry_d->namelen = entry_s->namelen;
+		ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
+		bcopy(XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)),
+		      XFS_DIR_LEAF_NAMESTRUCT(leaf_d, INT_GET(entry_d->nameidx, ARCH_CONVERT)), tmp);
+		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
+		bzero((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)),
+		      tmp);
+		INT_MOD(hdr_s->namebytes, ARCH_CONVERT, -(entry_d->namelen));
+		INT_MOD(hdr_d->namebytes, ARCH_CONVERT, entry_d->namelen);
+		INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
+		INT_MOD(hdr_d->count, ARCH_CONVERT, +1);
+		tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
+				+ (uint)sizeof(xfs_dir_leaf_hdr_t);
+		ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
+
+	}
+
+	/*
+	 * Zero out the entries we just copied.
+	 */
+	if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
+		tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
+		entry_s = &leaf_s->entries[start_s];
+		ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
+		bzero((char *)entry_s, tmp);
+	} else {
+		/*
+		 * Move the remaining entries down to fill the hole,
+		 * then zero the entries at the top.
+		 */
+		tmp  = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
+		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
+		entry_s = &leaf_s->entries[start_s + count];
+		entry_d = &leaf_s->entries[start_s];
+		bcopy(entry_s, entry_d, tmp);
+
+		tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
+		entry_s = &leaf_s->entries[INT_GET(hdr_s->count, ARCH_CONVERT)];
+		ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
+		bzero((char *)entry_s, tmp);
+	}
+
+	/*
+	 * Fill in the freemap information
+	 */
+	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_hdr_t));
+	INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT, INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t));
+	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+	INT_SET(hdr_d->freemap[1].base, ARCH_CONVERT, INT_ZERO(hdr_d->freemap[2].base, ARCH_CONVERT));
+	INT_SET(hdr_d->freemap[1].size, ARCH_CONVERT, INT_ZERO(hdr_d->freemap[2].size, ARCH_CONVERT));
+	hdr_s->holes = 1;	/* leaf may not be compact */
+}
+
+/*
+ * Compare the hashval "order" of two leaf blocks.
+ */
+int
+xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
+{
+	xfs_dir_leafblock_t *leaf1, *leaf2;
+
+	leaf1 = leaf1_bp->data;
+	leaf2 = leaf2_bp->data;
+	ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) &&
+	       (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC));
+	if ((INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0) &&
+	    ((INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
+	      INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) ||
+	     (INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
+	      INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
+		return(1);
+	}
+	return(0);
+}
+
+/*
+ * Pick up the last hashvalue from a leaf block.
+ */
+xfs_dahash_t
+xfs_dir_leaf_lasthash(xfs_dabuf_t *bp, int *count)
+{
+	xfs_dir_leafblock_t *leaf;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	if (count)
+		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	if (INT_ISZERO(leaf->hdr.count, ARCH_CONVERT))
+		return(0);
+	return(INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
+}
+
+/*
+ * Copy out directory entries for getdents(), for leaf directories.
+ */
+int
+xfs_dir_leaf_getdents_int(
+	xfs_dabuf_t	*bp,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	uio_t		*uio,
+	int		*eobp,
+	xfs_dirent_t	*dbp,
+	xfs_dir_put_t	put,
+	xfs_daddr_t		nextda)
+{
+	xfs_dir_leafblock_t	*leaf;
+	xfs_dir_leaf_entry_t	*entry;
+	xfs_dir_leaf_name_t	*namest;
+	int			entno, want_entno, i, nextentno;
+	xfs_mount_t		*mp;
+	xfs_dahash_t		cookhash;
+	xfs_dahash_t		nexthash = 0;
+#if (BITS_PER_LONG == 32)
+	xfs_dahash_t		lasthash = XFS_DA_MAXHASH;
+#endif
+	xfs_dir_put_args_t	p;
+
+	mp = dp->i_mount;
+	leaf = bp->data;
+	if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
+		*eobp = 1;
+		return(XFS_ERROR(ENOENT));	/* XXX wrong code */
+	}
+
+	want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
+
+	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
+
+	xfs_dir_trace_g_dul("leaf: start", dp, uio, leaf);
+
+	/*
+	 * Re-find our place.
+	 */
+	for (i = entno = 0, entry = &leaf->entries[0];
+		     i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
+			     entry++, i++) {
+
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
+				    INT_GET(entry->nameidx, ARCH_CONVERT));
+
+		if (((char *)namest < (char *)leaf) ||
+		    ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)) ||
+		    (entry->namelen >= MAXNAMELEN)) {
+			xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		if (INT_GET(entry->hashval, ARCH_CONVERT) >= cookhash) {
+			if (   entno < want_entno
+			    && INT_GET(entry->hashval, ARCH_CONVERT)
+							== cookhash) {
+				/*
+				 * Trying to get to a particular offset in a
+				 * run of equal-hashval entries.
+				 */
+				entno++;
+			} else if (   want_entno > 0
+				   && entno == want_entno
+				   && INT_GET(entry->hashval, ARCH_CONVERT)
+							== cookhash) {
+				break;
+			} else {
+				entno = 0;
+				break;
+			}
+		}
+	}
+
+	if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
+		xfs_dir_trace_g_du("leaf: hash not found", dp, uio);
+		if (!INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))
+			uio->uio_offset =
+				XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
+		/*
+		 * Don't set uio_offset if there's another block:
+		 * the node code will be setting uio_offset anyway.
+		 */
+		*eobp = 0;
+		return(0);
+	}
+	xfs_dir_trace_g_due("leaf: hash found", dp, uio, entry);
+
+	p.dbp = dbp;
+	p.put = put;
+	p.uio = uio;
+
+	/*
+	 * We're synchronized, start copying entries out to the user.
+	 */
+	for (; entno >= 0 && i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
+			     entry++, i++, (entno = nextentno)) {
+		int lastresid = 0, retval;
+		xfs_dircook_t lastoffset;
+		xfs_dahash_t thishash;
+
+		/*
+		 * Check for a damaged directory leaf block and pick up
+		 * the inode number from this entry.
+		 */
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
+				    INT_GET(entry->nameidx, ARCH_CONVERT));
+
+		if (((char *)namest < (char *)leaf) ||
+		    ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)) ||
+		    (entry->namelen >= MAXNAMELEN)) {
+			xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+
+		thishash = INT_GET(entry->hashval, ARCH_CONVERT);
+
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		XFS_PUT_COOKIE(p.cook, mp, bno, entno, thishash);
+
+		xfs_dir_trace_g_duc("leaf: middle cookie  ",
+						   dp, uio, p.cook.o);
+
+		if (i < (INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1)) {
+			nexthash = INT_GET(entry[1].hashval, ARCH_CONVERT);
+
+			if (nexthash == INT_GET(entry->hashval, ARCH_CONVERT))
+				nextentno = entno + 1;
+			else
+				nextentno = 0;
+
+		} else if (INT_GET(leaf->hdr.info.forw, ARCH_CONVERT)) {
+			xfs_dabuf_t *bp2;
+			xfs_dir_leafblock_t *leaf2;
+
+			ASSERT(nextda != -1);
+
+			retval = xfs_da_read_buf(dp->i_transp, dp,
+						 INT_GET(leaf->hdr.info.forw,
+						 ARCH_CONVERT), nextda,
+						 &bp2, XFS_DATA_FORK);
+			if (retval)
+				return(retval);
+
+			ASSERT(bp2 != NULL);
+
+			leaf2 = bp2->data;
+
+			if (   (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
+						!= XFS_DIR_LEAF_MAGIC)
+			    || (INT_GET(leaf2->hdr.info.back, ARCH_CONVERT)
+						!= bno)) {	/* GROT */
+
+				xfs_da_brelse(dp->i_transp, bp2);
+
+				return(XFS_ERROR(EFSCORRUPTED));
+			}
+
+			nexthash = INT_GET(leaf2->entries[0].hashval,
+								ARCH_CONVERT);
+			nextentno = -1;
+
+			xfs_da_brelse(dp->i_transp, bp2);
+			xfs_dir_trace_g_duc("leaf: next blk cookie",
+						   dp, uio, p.cook.o);
+		} else {
+			nextentno = -1;
+			nexthash  = XFS_DA_MAXHASH;
+		}
+
+		/*
+		 * Save off the cookie so we can fall back should the
+	 * 'put' into the outgoing buffer fail.  To handle a run
+		 * of equal-hashvals, the off_t structure on 64bit
+		 * builds has entno built into the cookie to ID the
+		 * entry.  On 32bit builds, we only have space for the
+		 * hashval so we can't ID specific entries within a group
+		 * of same hashval entries.   For this, lastoffset is set
+		 * to the first in the run of equal hashvals so we don't
+		 * include any entries unless we can include all entries
+		 * that share the same hashval.	 Hopefully the buffer
+		 * provided is big enough to handle it (see pv763517).
+		 */
+#if (BITS_PER_LONG == 32)
+		if (INT_GET(entry->hashval, ARCH_CONVERT) != lasthash) {
+			XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
+			lastresid = uio->uio_resid;
+			lasthash = thishash;
+		} else {
+			xfs_dir_trace_g_duc("leaf: DUP COOKIES, skipped",
+						   dp, uio, p.cook.o);
+		}
+#else
+		XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
+		lastresid = uio->uio_resid;
+#endif /* BITS_PER_LONG == 32 */
+
+		/*
+		 * Put the current entry into the outgoing buffer.  If we fail
+		 * then restore the UIO to the first entry in the current
+		 * run of equal-hashval entries (probably only 1 entry long).
+		 */
+#if XFS_BIG_FILESYSTEMS
+		p.ino = XFS_GET_DIR_INO_ARCH(mp, namest->inumber, ARCH_CONVERT) + mp->m_inoadd;
+#else
+		p.ino = XFS_GET_DIR_INO_ARCH(mp, namest->inumber, ARCH_CONVERT);
+#endif
+		p.name = (char *)namest->name;
+		p.namelen = entry->namelen;
+
+		retval = p.put(&p);
+
+		if (!p.done) {
+			uio->uio_offset = lastoffset.o;
+			uio->uio_resid = lastresid;
+
+			*eobp = 1;
+
+			xfs_dir_trace_g_du("leaf: E-O-B", dp, uio);
+
+			return(retval);
+		}
+	}
+
+	XFS_PUT_COOKIE(p.cook, mp, 0, 0, nexthash);
+
+	uio->uio_offset = p.cook.o;
+
+	*eobp = 0;
+
+	xfs_dir_trace_g_du("leaf: E-O-F", dp, uio);
+
+	return(0);
+}
+
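On 32-bit builds the resume cookie only records the hashval, so a run of equal-hash entries must be emitted all-or-nothing, rolling the uio back to the start of the run when the buffer fills. A self-contained model of that rollback discipline (toy names only; each record here simply costs its name length in bytes):

#include <stdio.h>
#include <string.h>

#define BUFSZ 10

/*
 * Toy model of the 32-bit fallback: the output state is snapshotted
 * at the start of each run of equal hashes and rolled back if the
 * buffer fills mid-run, so runs are emitted all-or-nothing.  Returns
 * the entry index at which the next call must resume.
 */
static int
emit_runs(const int *hash, const char *name[], int n)
{
	char buf[BUFSZ];
	int i, len, used = 0, run_used = 0, run_first = 0;

	for (i = 0; i < n; i++) {
		if (i == 0 || hash[i] != hash[i - 1]) {
			run_used = used;	/* rollback point */
			run_first = i;
		}
		len = (int)strlen(name[i]);
		if (used + len > BUFSZ) {
			used = run_used;	/* drop the partial run */
			return run_first;	/* resume here next call */
		}
		memcpy(buf + used, name[i], len);
		used += len;
	}
	return n;
}

int
main(void)
{
	int hash[] = { 1, 2, 2, 3 };
	const char *name[] = { "aaaa", "bbbb", "cccc", "dddd" };

	printf("next call resumes at entry %d\n",
	       emit_runs(hash, name, 4));
	return 0;
}

With a 10-byte buffer the single-entry run of hash 1 fits, but the two-entry run of hash 2 does not fit as a whole, so the sketch resumes at entry 1 even though "bbbb" alone would have fit.
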
+/*
+ * Format a dirent64 structure and copy it out to the user's buffer.
+ */
+int
+xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa)
+{
+	iovec_t *iovp;
+	int reclen, namelen;
+	xfs_dirent_t *idbp;
+	uio_t *uio;
+
+	namelen = pa->namelen;
+	reclen = DIRENTSIZE(namelen);
+	uio = pa->uio;
+	if (reclen > uio->uio_resid) {
+		pa->done = 0;
+		return 0;
+	}
+	iovp = uio->uio_iov;
+	idbp = (xfs_dirent_t *)iovp->iov_base;
+	iovp->iov_base = (char *)idbp + reclen;
+	iovp->iov_len -= reclen;
+	uio->uio_resid -= reclen;
+	idbp->d_reclen = reclen;
+	idbp->d_ino = pa->ino;
+	idbp->d_off = pa->cook.o;
+	idbp->d_name[namelen] = '\0';
+	pa->done = 1;
+	bcopy(pa->name, idbp->d_name, namelen);
+	return 0;
+}
+
+/*
+ * Format a dirent64 structure and copy it out to the user's buffer.
+ */
+int
+xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa)
+{
+	int		retval, reclen, namelen;
+	xfs_dirent_t	*idbp;
+	uio_t		*uio;
+
+	namelen = pa->namelen;
+	reclen = DIRENTSIZE(namelen);
+	uio = pa->uio;
+	if (reclen > uio->uio_resid) {
+		pa->done = 0;
+		return 0;
+	}
+	idbp = pa->dbp;
+	idbp->d_reclen = reclen;
+	idbp->d_ino = pa->ino;
+	idbp->d_off = pa->cook.o;
+	idbp->d_name[namelen] = '\0';
+	bcopy(pa->name, idbp->d_name, namelen);
+	retval = uiomove((caddr_t)idbp, reclen, UIO_READ, uio);
+	pa->done = (retval == 0);
+	return retval;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir_leaf.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir_leaf.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir_leaf.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir_leaf.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR_LEAF_H__
+#define __XFS_DIR_LEAF_H__
+
+/*
+ * Directory layout, internal structure, access macros, etc.
+ *
+ * Large directories are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Filenames are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of a filename may not be unique, we may have duplicate keys.	 The
+ * internal links in the Btree are logical block offsets into the file.
+ */
+
+struct dirent;
+struct uio;
+struct xfs_bmap_free;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_dir_put_args;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*========================================================================
+ * Directory Structure when equal to XFS_LBSIZE(mp) bytes.
+ *========================================================================*/
+
+/*
+ * This is the structure of the leaf nodes in the Btree.
+ *
+ * Struct leaf_entry's are packed from the top.	 Names grow from the bottom
+ * but are not packed.	The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped.  When the freemap doesn't show enough space
+ * for an allocation, we compact the namelist area and try again.  If we
+ * still don't have enough space, then we have to split the block.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual string.  The root and intermediate node search always takes
+ * the first-in-the-block key match found, so we should only have to work
+ * "forw"ard.  If none matches, continue with the "forw"ard leaf nodes
+ * until the hash key changes or the filename is found.
+ *
+ * The parent directory and the self-pointer are explicitly represented
+ * (ie: there are entries for "." and "..").
+ *
+ * Note that the count being a __uint16_t limits us to something like a
+ * blocksize of 1.3MB in the face of worst case (short) filenames.
+ */
+#define XFS_DIR_LEAF_MAPSIZE	3	/* how many freespace slots */
+
+typedef struct xfs_dir_leafblock {
+	struct xfs_dir_leaf_hdr {	/* constant-structure header block */
+		xfs_da_blkinfo_t info;	/* block type, links, etc. */
+		__uint16_t count;	/* count of active leaf_entry's */
+		__uint16_t namebytes;	/* num bytes of name strings stored */
+		__uint16_t firstused;	/* first used byte in name area */
+		__uint8_t  holes;	/* != 0 if blk needs compaction */
+		__uint8_t  pad1;
+		struct xfs_dir_leaf_map {/* RLE map of free bytes */
+			__uint16_t base; /* base of free region */
+			__uint16_t size; /* run length of free region */
+		} freemap[XFS_DIR_LEAF_MAPSIZE]; /* N largest free regions */
+	} hdr;
+	struct xfs_dir_leaf_entry {	/* sorted on key, not name */
+		xfs_dahash_t hashval;	/* hash value of name */
+		__uint16_t nameidx;	/* index into buffer of name */
+		__uint8_t namelen;	/* length of name string */
+		__uint8_t pad2;
+	} entries[1];			/* var sized array */
+	struct xfs_dir_leaf_name {
+		xfs_dir_ino_t inumber;	/* inode number for this key */
+		__uint8_t name[1];	/* name string itself */
+	} namelist[1];			/* grows from bottom of buf */
+} xfs_dir_leafblock_t;
+typedef struct xfs_dir_leaf_hdr xfs_dir_leaf_hdr_t;
+typedef struct xfs_dir_leaf_map xfs_dir_leaf_map_t;
+typedef struct xfs_dir_leaf_entry xfs_dir_leaf_entry_t;
+typedef struct xfs_dir_leaf_name xfs_dir_leaf_name_t;
+
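The two-ended layout this structure describes (entry slots growing down from the header, name bytes growing up from the end of the block, with firstused marking the boundary) can be modelled in a few lines. Everything below is a hypothetical toy, not XFS code:

#include <stdio.h>
#include <string.h>

#define BLKSZ 64

/*
 * Toy model of the two-ended layout: fixed-size entry slots fill in
 * from the top while name bytes are carved off the bottom; firstused
 * tracks the lowest name byte in use.
 */
struct toy_leaf {
	int	count;			/* active entries */
	int	firstused;		/* first used byte in name area */
	int	nameidx[8];		/* per-entry offset of its name */
	char	blk[BLKSZ];		/* the name area */
};

static void
toy_add(struct toy_leaf *l, const char *name)
{
	int len = (int)strlen(name);

	l->firstused -= len;			/* grow names downward */
	memcpy(&l->blk[l->firstused], name, len);
	l->nameidx[l->count++] = l->firstused;
}

int
main(void)
{
	struct toy_leaf l = { 0, BLKSZ, { 0 }, { 0 } };

	toy_add(&l, "alpha");
	toy_add(&l, "beta");
	printf("count=%d firstused=%d\n", l.count, l.firstused);
	return 0;
}

Each insertion carves the name off the bottom of the block and records its offset in the fixed-size slot array; the freemap in the real structure simply remembers the largest gaps left between the two ends.
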
+/*
+ * Length of name for which a 512-byte block filesystem
+ * can get a double split.
+ */
+#define XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN	\
+	(512 - (uint)sizeof(xfs_dir_leaf_hdr_t) - \
+	 (uint)sizeof(xfs_dir_leaf_entry_t) * 2 - \
+	 (uint)sizeof(xfs_dir_leaf_name_t) * 2 - (MAXNAMELEN - 2) + 1 + 1)
+
+typedef int (*xfs_dir_put_t)(struct xfs_dir_put_args *pa);
+
+typedef union {
+	xfs_off_t		o;		/* offset (cookie) */
+	/*
+	 * Watch the order here (endian-ness dependent).
+	 */
+	struct {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+		xfs_dahash_t	h;	/* hash value */
+		__uint32_t	be;	/* block and entry */
+#else	/* __BYTE_ORDER == __BIG_ENDIAN */
+		__uint32_t	be;	/* block and entry */
+		xfs_dahash_t	h;	/* hash value */
+#endif	/* __BYTE_ORDER == __BIG_ENDIAN */
+	} s;
+} xfs_dircook_t;
+
+#define XFS_PUT_COOKIE(c,mp,bno,entry,hash)	\
+	((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash))
+
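The effect of this macro is just to fill in both halves of the union so the combined 64-bit value can be handed out as the directory offset. A self-contained analog (toy type; as the union definition notes, the real field order flips on big-endian builds):

#include <stdio.h>
#include <stdint.h>

/*
 * Self-contained analog of xfs_dircook_t (little-endian field order
 * shown; the real union swaps h and be on big-endian builds).
 */
typedef union {
	uint64_t o;			/* whole cookie, as seen by seekdir */
	struct {
		uint32_t h;		/* name hash */
		uint32_t be;		/* packed block + entry number */
	} s;
} toy_dircook_t;

int
main(void)
{
	toy_dircook_t c;

	c.s.h  = 0x12345678u;		/* hash half */
	c.s.be = 0x42u;			/* block/entry half */
	printf("cookie = 0x%016llx\n", (unsigned long long)c.o);
	return 0;
}
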
+#define XFS_GET_DIR_INO_ARCH(mp,di,arch) \
+    DIRINO_GET_ARCH(&(di),arch)
+#define XFS_GET_DIR_INO(mp,di) \
+    XFS_GET_DIR_INO_ARCH(mp,di,ARCH_NOCONVERT)
+
+typedef struct xfs_dir_put_args
+{
+	xfs_dircook_t	cook;		/* cookie of (next) entry */
+	xfs_intino_t	ino;		/* inode number */
+	struct xfs_dirent	*dbp;		/* buffer pointer */
+	char		*name;		/* directory entry name */
+	int		namelen;	/* length of name */
+	int		done;		/* output: set if value was stored */
+	xfs_dir_put_t	put;		/* put function ptr (i/o) */
+	struct uio	*uio;		/* uio control structure */
+} xfs_dir_put_args_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYNAME)
+int xfs_dir_leaf_entsize_byname(int len);
+#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len)	xfs_dir_leaf_entsize_byname(len)
+#else
+#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len)	/* space a name will use */ \
+	((uint)sizeof(xfs_dir_leaf_name_t)-1 + len)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYENTRY)
+int xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry);
+#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)	\
+	xfs_dir_leaf_entsize_byentry(entry)
+#else
+#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)	/* space an entry will use */ \
+	((uint)sizeof(xfs_dir_leaf_name_t)-1 + (entry)->namelen)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_NAMESTRUCT)
+xfs_dir_leaf_name_t *
+xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset);
+#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset)	\
+	xfs_dir_leaf_namestruct(leafp,offset)
+#else
+#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset)	/* point to name struct */ \
+	((xfs_dir_leaf_name_t *)&((char *)(leafp))[offset])
+#endif
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when dirsize < XFS_LITINO(mp).
+ */
+int xfs_dir_shortform_create(struct xfs_da_args *args, xfs_ino_t parent);
+int xfs_dir_shortform_addname(struct xfs_da_args *args);
+int xfs_dir_shortform_lookup(struct xfs_da_args *args);
+int xfs_dir_shortform_to_leaf(struct xfs_da_args *args);
+int xfs_dir_shortform_removename(struct xfs_da_args *args);
+int xfs_dir_shortform_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
+				      struct xfs_dirent *dbp, xfs_dir_put_t put);
+int xfs_dir_shortform_replace(struct xfs_da_args *args);
+
+/*
+ * Internal routines when dirsize == XFS_LBSIZE(mp).
+ */
+int xfs_dir_leaf_to_node(struct xfs_da_args *args);
+int xfs_dir_leaf_to_shortform(struct xfs_da_args *args);
+
+/*
+ * Routines used for growing the Btree.
+ */
+int	xfs_dir_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block,
+				   struct xfs_dabuf **bpp);
+int	xfs_dir_leaf_split(struct xfs_da_state *state,
+				  struct xfs_da_state_blk *oldblk,
+				  struct xfs_da_state_blk *newblk);
+int	xfs_dir_leaf_add(struct xfs_dabuf *leaf_buffer,
+				struct xfs_da_args *args, int insertion_index);
+int	xfs_dir_leaf_addname(struct xfs_da_args *args);
+int	xfs_dir_leaf_lookup_int(struct xfs_dabuf *leaf_buffer,
+				       struct xfs_da_args *args,
+				       int *index_found_at);
+int	xfs_dir_leaf_remove(struct xfs_trans *trans,
+				   struct xfs_dabuf *leaf_buffer,
+				   int index_to_remove);
+int	xfs_dir_leaf_getdents_int(struct xfs_dabuf *bp, struct xfs_inode *dp,
+					 xfs_dablk_t bno, struct uio *uio,
+					 int *eobp, struct xfs_dirent *dbp,
+					 xfs_dir_put_t put, xfs_daddr_t nextda);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int	xfs_dir_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void	xfs_dir_leaf_unbalance(struct xfs_da_state *state,
+					     struct xfs_da_state_blk *drop_blk,
+					     struct xfs_da_state_blk *save_blk);
+
+/*
+ * Utility routines.
+ */
+uint	xfs_dir_leaf_lasthash(struct xfs_dabuf *bp, int *count);
+int	xfs_dir_leaf_order(struct xfs_dabuf *leaf1_bp,
+				  struct xfs_dabuf *leaf2_bp);
+int	xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa);
+int	xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa);
+int	xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+
+
+/*
+ * Global data.
+ */
+extern xfs_dahash_t	xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+#endif /* __XFS_DIR_LEAF_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir_sf.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir_sf.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dir_sf.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dir_sf.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR_SF_H__
+#define __XFS_DIR_SF_H__
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to
+ * fit into the literal area of the inode.
+ */
+
+typedef struct { __uint8_t i[sizeof(xfs_ino_t)]; } xfs_dir_ino_t;
+
+/*
+ * The parent directory has a dedicated field, and the self-pointer must
+ * be calculated on the fly.
+ *
+ * Entries are packed toward the top as tight as possible.  The header
+ * and the elements must be bcopy()'d out into a work area to get correct
+ * alignment for the inode number fields.
+ */
+typedef struct xfs_dir_shortform {
+	struct xfs_dir_sf_hdr {		/* constant-structure header block */
+		xfs_dir_ino_t parent;	/* parent dir inode number */
+		__uint8_t count;	/* count of active entries */
+	} hdr;
+	struct xfs_dir_sf_entry {
+		xfs_dir_ino_t inumber;	/* referenced inode number */
+		__uint8_t namelen;	/* actual length of name (no NULL) */
+		__uint8_t name[1];	/* name */
+	} list[1];			/* variable sized array */
+} xfs_dir_shortform_t;
+typedef struct xfs_dir_sf_hdr xfs_dir_sf_hdr_t;
+typedef struct xfs_dir_sf_entry xfs_dir_sf_entry_t;
+
+/*
+ * We generate this then sort it, so that readdirs are returned in
+ * hash-order.	Else seekdir won't work.
+ */
+typedef struct xfs_dir_sf_sort {
+	__uint8_t	entno;		/* .=0, ..=1, else entry# + 2 */
+	__uint8_t	seqno;		/* sequence # with same hash value */
+	__uint8_t	namelen;	/* length of name value (no null) */
+	xfs_dahash_t	hash;		/* this entry's hash value */
+	xfs_intino_t	ino;		/* this entry's inode number */
+	char		*name;		/* name value, pointer into buffer */
+} xfs_dir_sf_sort_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_GET_DIRINO)
+void xfs_dir_sf_get_dirino_arch(xfs_dir_ino_t *from, xfs_ino_t *to, xfs_arch_t arch);
+void xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to);
+#define XFS_DIR_SF_GET_DIRINO_ARCH(from,to,arch)    xfs_dir_sf_get_dirino_arch(from, to, arch)
+#define XFS_DIR_SF_GET_DIRINO(from,to)		    xfs_dir_sf_get_dirino(from, to)
+#else
+#define XFS_DIR_SF_GET_DIRINO_ARCH(from,to,arch)    DIRINO_COPY_ARCH(from,to,arch)
+#define XFS_DIR_SF_GET_DIRINO(from,to)		    DIRINO_COPY_ARCH(from,to,ARCH_NOCONVERT)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_PUT_DIRINO)
+void xfs_dir_sf_put_dirino_arch(xfs_ino_t *from, xfs_dir_ino_t *to, xfs_arch_t arch);
+void xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to);
+#define XFS_DIR_SF_PUT_DIRINO_ARCH(from,to,arch)    xfs_dir_sf_put_dirino_arch(from, to, arch)
+#define XFS_DIR_SF_PUT_DIRINO(from,to)		    xfs_dir_sf_put_dirino(from, to)
+#else
+#define XFS_DIR_SF_PUT_DIRINO_ARCH(from,to,arch)    DIRINO_COPY_ARCH(from,to,arch)
+#define XFS_DIR_SF_PUT_DIRINO(from,to)		    DIRINO_COPY_ARCH(from,to,ARCH_NOCONVERT)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ENTSIZE_BYNAME)
+int xfs_dir_sf_entsize_byname(int len);
+#define XFS_DIR_SF_ENTSIZE_BYNAME(len)		xfs_dir_sf_entsize_byname(len)
+#else
+#define XFS_DIR_SF_ENTSIZE_BYNAME(len)		/* space a name uses */ \
+	((uint)sizeof(xfs_dir_sf_entry_t)-1 + (len))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ENTSIZE_BYENTRY)
+int xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep);
+#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep)	xfs_dir_sf_entsize_byentry(sfep)
+#else
+#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep)	/* space an entry uses */ \
+	((uint)sizeof(xfs_dir_sf_entry_t)-1 + (sfep)->namelen)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_NEXTENTRY)
+xfs_dir_sf_entry_t *xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep);
+#define XFS_DIR_SF_NEXTENTRY(sfep)		xfs_dir_sf_nextentry(sfep)
+#else
+#define XFS_DIR_SF_NEXTENTRY(sfep)		/* next entry in struct */ \
+	((xfs_dir_sf_entry_t *) \
+		((char *)(sfep) + XFS_DIR_SF_ENTSIZE_BYENTRY(sfep)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ALLFIT)
+int xfs_dir_sf_allfit(int count, int totallen);
+#define XFS_DIR_SF_ALLFIT(count,totallen)	\
+	xfs_dir_sf_allfit(count,totallen)
+#else
+#define XFS_DIR_SF_ALLFIT(count,totallen)	/* will all entries fit? */ \
+	((uint)sizeof(xfs_dir_sf_hdr_t) + \
+	       ((uint)sizeof(xfs_dir_sf_entry_t)-1)*(count) + (totallen))
+#endif
+
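Since shortform entries are variable-length and byte-packed, they can only be traversed by stepping over each name in turn, which is what XFS_DIR_SF_NEXTENTRY does. A standalone miniature of that walk (toy structs; the real entry also carries the packed inumber, omitted here):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/*
 * Miniature of the packed shortform walk: entries sit back to back,
 * so entry N+1 is only reachable by stepping over the variable-length
 * name of entry N.
 */
struct toy_sf_entry {
	uint8_t	namelen;		/* length of name, no NUL stored */
	char	name[1];		/* first byte of the name */
};

static struct toy_sf_entry *
toy_nextentry(struct toy_sf_entry *e)
{
	return (struct toy_sf_entry *)((char *)e + 1 + e->namelen);
}

int
main(void)
{
	char buf[32], *p = buf;
	struct toy_sf_entry *e;
	int i;

	*p++ = 2; memcpy(p, "ab", 2);  p += 2;	/* pack entry "ab" */
	*p++ = 3; memcpy(p, "xyz", 3); p += 3;	/* pack entry "xyz" */

	e = (struct toy_sf_entry *)buf;
	for (i = 0; i < 2; i++) {
		printf("%.*s\n", e->namelen, e->name);
		e = toy_nextentry(e);
	}
	return 0;
}
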
+#ifdef XFS_ALL_TRACE
+#define XFS_DIR_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_DIR_TRACE
+#endif
+
+/*
+ * Kernel tracing support for directories.
+ */
+struct uio;
+struct xfs_inode;
+struct xfs_da_intnode;
+struct xfs_dinode;
+struct xfs_dir_leafblock;
+struct xfs_dir_leaf_entry;
+
+#define XFS_DIR_TRACE_SIZE	4096	/* size of global trace buffer */
+
+/*
+ * Trace record types.
+ */
+#define XFS_DIR_KTRACE_G_DU	1	/* dp, uio */
+#define XFS_DIR_KTRACE_G_DUB	2	/* dp, uio, bno */
+#define XFS_DIR_KTRACE_G_DUN	3	/* dp, uio, node */
+#define XFS_DIR_KTRACE_G_DUL	4	/* dp, uio, leaf */
+#define XFS_DIR_KTRACE_G_DUE	5	/* dp, uio, leaf entry */
+#define XFS_DIR_KTRACE_G_DUC	6	/* dp, uio, cookie */
+
+#if defined(XFS_DIR_TRACE)
+
+void xfs_dir_trace_g_du(char *where, struct xfs_inode *dp, struct uio *uio);
+void xfs_dir_trace_g_dub(char *where, struct xfs_inode *dp, struct uio *uio,
+			      xfs_dablk_t bno);
+void xfs_dir_trace_g_dun(char *where, struct xfs_inode *dp, struct uio *uio,
+			      struct xfs_da_intnode *node);
+void xfs_dir_trace_g_dul(char *where, struct xfs_inode *dp, struct uio *uio,
+			      struct xfs_dir_leafblock *leaf);
+void xfs_dir_trace_g_due(char *where, struct xfs_inode *dp, struct uio *uio,
+			      struct xfs_dir_leaf_entry *entry);
+void xfs_dir_trace_g_duc(char *where, struct xfs_inode *dp, struct uio *uio,
+			      xfs_off_t cookie);
+void xfs_dir_trace_enter(int type, char *where,
+			     __psunsigned_t a0, __psunsigned_t a1,
+			     __psunsigned_t a2, __psunsigned_t a3,
+			     __psunsigned_t a4, __psunsigned_t a5,
+			     __psunsigned_t a6, __psunsigned_t a7,
+			     __psunsigned_t a8, __psunsigned_t a9,
+			     __psunsigned_t a10, __psunsigned_t a11);
+#else
+#define xfs_dir_trace_g_du(w,d,u)
+#define xfs_dir_trace_g_dub(w,d,u,b)
+#define xfs_dir_trace_g_dun(w,d,u,n)
+#define xfs_dir_trace_g_dul(w,d,u,l)
+#define xfs_dir_trace_g_due(w,d,u,e)
+#define xfs_dir_trace_g_duc(w,d,u,c)
+#endif /* XFS_DIR_TRACE */
+
+#endif	/* __XFS_DIR_SF_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dmapi.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dmapi.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dmapi.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dmapi.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,2836 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/spinlock.h>
+#include <linux/iobuf.h>
+
+#define MAXNAMLEN MAXNAMELEN
+
+#define XFS_BHV_LOOKUP(vp, xbdp)  \
+	do { \
+		(xbdp) = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops); \
+		ASSERT(xbdp); \
+	} while (0)
+
+STATIC int prohibited_mr_events(vnode_t	*vp);
+
+/* Structure used to hold the on-disk version of a dm_attrname_t.  All
+   on-disk attribute names start with the 8-byte string "SGI_DMI_".
+*/
+
+typedef struct	{
+	char	dan_chars[DMATTR_PREFIXLEN + DM_ATTR_NAME_SIZE + 1];
+} dm_dkattrname_t;
+
+/* In the on-disk inode, DMAPI attribute names consist of the user-provided
+   name with the DMATTR_PREFIXSTRING pre-pended.  This string must NEVER be
+   changed!
+*/
+
+STATIC	const	char	dmattr_prefix[DMATTR_PREFIXLEN + 1] = DMATTR_PREFIXSTRING;
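+/* Example: an attribute the user names "migrated" is stored in the
+   inode's root attribute fork under the key "SGI_DMI_migrated".
+*/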
+
+STATIC	dm_size_t  dm_min_dio_xfer = 0; /* direct I/O disabled for now */
+
+
+/* See xfs_dm_get_dmattr() for a description of why this is needed. */
+
+#define XFS_BUG_KLUDGE	256	/* max size of an in-inode attribute value */
+
+#define DM_MAX_ATTR_BYTES_ON_DESTROY	256
+
+#define DM_STAT_SIZE(namelen)	\
+	(sizeof(dm_stat_t) + sizeof(xfs_handle_t) + (namelen))
+#define MAX_DIRENT_SIZE		(sizeof(dirent_t) + MAXNAMELEN)
+
+#define DM_STAT_ALIGN		(sizeof(__uint64_t))
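+/* Variable-length records chained via their _link fields are padded to
+   DM_STAT_ALIGN; with this 8-byte alignment, e.g., a 61-byte record
+   rounds up via (61 + 7) & ~7 to 64 bytes.
+*/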
+
+/* DMAPI's E2BIG == EA's ERANGE */
+#define DM_EA_XLATE_ERR(err) do { if ((err) == ERANGE) (err) = E2BIG; } while (0)
+
+/*
+ *	xfs_dm_send_data_event()
+ *
+ *	Send data event to DMAPI.  Drop IO lock (if specified) before
+ *	the dm_send_data_event() call and reacquire it afterwards.
+ */
+int
+xfs_dm_send_data_event(
+	dm_eventtype_t	event,
+	bhv_desc_t	*bdp,
+	xfs_off_t	offset,
+	size_t		length,
+	int		flags,
+	vrwlock_t	*locktype)
+{
+	int		error;
+	xfs_inode_t	*ip = XFS_BHVTOI(bdp);
+	uint16_t	dmstate;
+
+	/*
+	 * Now we need to simulate VOP_RWUNLOCK/VOP_RWLOCK.  We can't
+	 * just call the VOP though, or we'll wind up going through
+	 * the dsvn layer for CXFS.  We have to avoid getting tokens,
+	 * so we go straight to XFS.
+	 */
+	ASSERT(BHV_IS_XFS(bdp));
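+	/* io_dmstate is bumped (e.g. by xfs_dm_punch_hole()) when the file
+	 * changes while our lock is dropped; if that happened, loop and
+	 * send the event again.
+	 */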
+	do {
+		dmstate = ip->i_iocore.io_dmstate;
+		if (locktype)
+			xfs_rwunlock(bdp, *locktype);
+		error = dm_send_data_event(event, bdp, DM_RIGHT_NULL,
+				offset, length, flags);
+		if (locktype)
+			xfs_rwlock(bdp, *locktype);
+	} while (!error && (ip->i_iocore.io_dmstate != dmstate));
+
+	return error;
+}
+
+/*	xfs_dm_create_event
+ *
+ *	Conditionally send a DM_EVENT_CREATE.  The event will not be sent if
+ *	we can check the directory and find the name (to be created) already
+ *	there.	Some sort of a "double check" is required since in the
+ *	xfs_create and	xfs_mkdir routines, we determine that there is not a
+ *	duplicate with the directory ilock held.  We cannot send an event with
+ *	the ilock held, since this can potentially lead to a deadlock.
+ *	Dropping the ilock while the event is being sent is unwise since the
+ *	directory may have changed by the time we reacquire the lock.  Hence
+ *	this workaround.
+ *
+ *	Note that after we have determined that the name does/does not exist,
+ *	the situation might have changed by the time we get back to
+ *	xfs_create/xfs_mkdir.  So the workaround does not really solve the
+ *	problem.  The results can be missing or redundant create events.
+ */
+
+int
+xfs_dm_send_create_event(
+	bhv_desc_t	*dir_bdp,
+	char		*name,
+	mode_t		new_mode,
+	int		*good_event_sent)
+{
+	xfs_inode_t	*dip;
+	xfs_ino_t	inum;
+	vnode_t		*dir_vp;
+	int		error;
+	int		name_len;
+
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+
+	if (*name == '\0')
+		return 0;
+
+	dip = XFS_BHVTOI (dir_bdp);
+	xfs_ilock (dip, XFS_ILOCK_EXCL);
+
+	vn_trace_entry(dir_vp, "xfs_dm_send_create_event",
+				(inst_t *)__return_address);
+
+	/*
+	 * Handle degenerate pathname component.
+	 */
+
+#ifdef __sgi
+	/*
+	 * Try the directory name lookup cache.  (IRIX-only: this block is
+	 * compiled out on Linux, and bdp and fd are IRIX locals that are
+	 * not declared in this file.)
+	 */
+	if ((bdp = dnlc_lookup_fast(dir_vp, name, NULL, &fd, NOCRED, VN_GET_NOWAIT))) {
+		xfs_iunlock (dip, XFS_ILOCK_EXCL);
+		VN_RELE (BHV_TO_VNODE(bdp));
+		return 0;
+	}
+#endif
+
+	/*
+	 * Else call the directory code.
+	 */
+
+	name_len = strlen(name);
+	error = XFS_DIR_LOOKUP(dip->i_mount, NULL, dip, name, name_len, &inum);
+	xfs_iunlock (dip, XFS_ILOCK_EXCL);
+	if (error != ENOENT)
+		return 0;
+	error = dm_send_namesp_event(DM_EVENT_CREATE, dir_bdp, DM_RIGHT_NULL,
+				NULL, DM_RIGHT_NULL,
+				name, NULL, new_mode, 0, 0);
+	if (!error)
+		*good_event_sent = 1;
+	return error;
+}
+
+/*	prohibited_mr_events
+ *
+ *	Return event bits representing any events which cannot have managed
+ *	region events set due to memory mapping of the file.  If the maximum
+ *	protection allowed in any pregion includes PROT_WRITE, and the region
+ *	is shared and not text, then neither READ nor WRITE events can be set.
+ *	Otherwise if the file is memory mapped, no READ event can be set.
+ *
+ */
+
+STATIC int
+prohibited_mr_events(vnode_t	*vp)
+{
+	int	prohibited;
+	struct address_space *mapping;
+	struct vm_area_struct *vma;
+
+	if(!VN_MAPPED(vp))
+		return 0;
+
+	prohibited = 1 << DM_EVENT_READ;
+	mapping = LINVFS_GET_IP(vp)->i_mapping;
+
+	spin_lock(&mapping->i_shared_lock);
+
+	for (vma = mapping->i_mmap_shared; vma; vma = vma->vm_next) {
+		if (!(vma->vm_flags & VM_DENYWRITE)) {
+			prohibited |= 1 << DM_EVENT_WRITE;
+			break;
+		}
+	}
+
+	spin_unlock(&mapping->i_shared_lock);
+	return prohibited;
+}
+
+
+#ifdef	DEBUG_RIGHTS
+STATIC int
+xfs_bdp_to_hexhandle(
+	bhv_desc_t	*bdp,
+	u_int		type,
+	char		*buffer)
+{
+	xfs_handle_t	handle;
+	vnode_t		*vp;
+	u_char		*ip;
+	int		length;
+	int		error;
+	int		i;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	if ((error = dm_vp_to_handle(vp, &handle)))
+		return(error);
+
+	if (type == DM_FSYS_OBJ) {	/* a filesystem handle */
+		length = FSHSIZE;
+	} else {
+		length = XFS_HSIZE(handle);
+	}
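+	/* Each handle byte becomes two lowercase hex digits (0x3f -> "3f"),
+	 * so the caller's buffer must hold 2 * length + 1 characters.
+	 */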
+	for (ip = (u_char *)&handle, i = 0; i < length; i++) {
+		*buffer++ = "0123456789abcdef"[ip[i] >> 4];
+		*buffer++ = "0123456789abcdef"[ip[i] & 0xf];
+	}
+	*buffer = '\0';
+	return(0);
+}
+#endif	/* DEBUG_RIGHTS */
+
+
+/* Copy in and validate an attribute name from user space.  It should be a
+   string of at least one and at most DM_ATTR_NAME_SIZE characters.  Because
+   the dm_attrname_t structure doesn't provide room for the trailing NULL
+   byte, we just copy in one extra character and then zero it if it
+   happens to be non-NULL.
+*/
+
+STATIC int
+xfs_copyin_attrname(
+	dm_attrname_t	*from,		/* dm_attrname_t in user space */
+	dm_dkattrname_t *to)		/* name buffer in kernel space */
+{
+	size_t len;
+
+	strcpy(to->dan_chars, dmattr_prefix);
+
+	/* copy_from_user() returns the number of bytes NOT copied, so any
+	   nonzero result is a fault rather than an errno value. */
+	len = strnlen_user((char*)from, DM_ATTR_NAME_SIZE);
+	if (copy_from_user(&to->dan_chars[DMATTR_PREFIXLEN], from, len))
+		return(EFAULT);
+
+	/* A maximum-length name fills the buffer without a NUL, so
+	   terminate it unconditionally. */
+	to->dan_chars[sizeof(to->dan_chars) - 1] = '\0';
+
+	if (to->dan_chars[DMATTR_PREFIXLEN] == '\0')
+		return(EINVAL);
+	return(0);
+}
+
+
+/* This copies selected fields in an inode into a dm_stat structure.  Because
+   these fields must return the same values as they would in stat(), the
+   majority of this code was copied directly from xfs_getattr().  Any future
+   changes to xfs_getattr() must also be reflected here.
+
+   The inode must be kept locked SHARED by the caller.
+*/
+
+STATIC void
+xfs_ip_to_stat(
+	xfs_mount_t	*mp,
+	dm_stat_t	*buf,
+	xfs_inode_t	*ip)
+{
+	vnode_t		*vp = XFS_ITOV(ip);
+
+	buf->dt_size = ip->i_d.di_size;
+	buf->dt_dev = ip->i_mount->m_dev;
+
+	buf->dt_ino = ip->i_ino;
+#if XFS_BIG_FILESYSTEMS
+	buf->dt_ino += mp->m_inoadd;
+#endif
+	/*
+	 * Copy from in-core inode.
+	 */
+	buf->dt_mode = VTTOIF(vp->v_type) | (ip->i_d.di_mode & MODEMASK);
+	buf->dt_uid = ip->i_d.di_uid;
+	buf->dt_gid = ip->i_d.di_gid;
+	buf->dt_nlink = ip->i_d.di_nlink;
+	/*
+	 * Minor optimization, check the common cases first.
+	 */
+	if ((vp->v_type == VREG) || (vp->v_type == VDIR)) {
+		buf->dt_rdev = 0;
+	} else if ((vp->v_type == VCHR) || (vp->v_type == VBLK) ) {
+		buf->dt_rdev = IRIX_DEV_TO_KDEVT(ip->i_df.if_u2.if_rdev);
+	} else {
+		buf->dt_rdev = 0;	/* not a block or char special file */
+	}
+
+	buf->dt_atime = ip->i_d.di_atime.t_sec;
+	buf->dt_mtime = ip->i_d.di_mtime.t_sec;
+	buf->dt_ctime = ip->i_d.di_ctime.t_sec;
+
+	switch (ip->i_d.di_mode & IFMT) {
+	  case IFBLK:
+	  case IFCHR:
+		buf->dt_blksize = BLKDEV_IOSIZE;
+		break;
+	  default:
+		/*
+		 * We use the read buffer size as a recommended I/O
+		 * size.  This should always be larger than the
+		 * write buffer size, so it should be OK.
+		 * The value returned is in bytes.
+		 */
+		buf->dt_blksize = 1 << mp->m_readio_log;
+		break;
+	}
+
+	/*
+	 * XXX : truncate to 32 bits for now.
+	 */
+	buf->dt_blocks =
+		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
+
+	/*
+	 * XFS-added attributes
+	 */
+
+	/*
+	 * convert di_flags to xflags
+	 */
+	buf->dt_xfs_xflags = 0;
+	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
+		buf->dt_xfs_xflags |= DM_XFLAG_REALTIME;
+	if (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC)
+		buf->dt_xfs_xflags |= DM_XFLAG_PREALLOC;
+	if (XFS_IFORK_Q(ip))
+		buf->dt_xfs_xflags |= DM_XFLAG_HASATTR;
+	buf->dt_xfs_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
+	buf->dt_xfs_extents = (ip->i_df.if_flags & XFS_IFEXTENTS) ?
+		ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
+		ip->i_d.di_nextents;
+	if (ip->i_afp != NULL) {
+		buf->dt_xfs_aextents =
+			(ip->i_afp->if_flags & XFS_IFEXTENTS) ?
+			 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
+			 ip->i_d.di_anextents;
+	} else {
+		buf->dt_xfs_aextents = 0;
+	}
+
+	/* Now fill in the fields that xfs_getattr() doesn't do. */
+
+	buf->dt_emask = ip->i_d.di_dmevmask;
+	buf->dt_nevents = DM_EVENT_MAX;
+	buf->dt_pers = 0;
+	buf->dt_change = 0;
+	buf->dt_dtime = ip->i_d.di_ctime.t_sec;
+	buf->dt_xfs_dmstate = ip->i_d.di_dmstate;
+	buf->dt_xfs_igen = ip->i_d.di_gen;
+
+	/* Set if one of READ, WRITE or TRUNCATE bits is set in emask */
+
+	buf->dt_pmanreg = ( DMEV_ISSET(DM_EVENT_READ, buf->dt_emask) ||
+			DMEV_ISSET(DM_EVENT_WRITE, buf->dt_emask) ||
+			DMEV_ISSET(DM_EVENT_TRUNCATE, buf->dt_emask) ) ? 1 : 0;
+}
+
+
+/*
+ * This is used by dm_get_bulkattr() as well as dm_get_dirattrs().
+ * Given a inumber, it igets the inode and fills the given buffer
+ * with the dm_stat structure for the file.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_dm_bulkstat_one(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	ino,		/* inode number to get data for */
+	void		*buffer,	/* buffer to place output in */
+	xfs_daddr_t	bno,		/* starting block of inode cluster */
+	void		*dip,		/* on-disk inode pointer */
+	int		*res)		/* bulkstat result code */
+{
+	xfs_inode_t	*ip;
+	dm_stat_t	*buf;
+	xfs_handle_t	handle;
+	u_int		statstruct_sz;
+	int		error;
+
+	buf = (dm_stat_t *)buffer;
+
+	if (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino) {
+		*res = BULKSTAT_RV_NOTHING;
+		return EINVAL;
+	}
+	error = xfs_iget(mp, tp, ino, XFS_ILOCK_SHARED, &ip, bno);
+	if (error) {
+		*res = BULKSTAT_RV_NOTHING;
+		return(error);
+	}
+	if (ip->i_d.di_mode == 0) {
+		xfs_iput_new(ip, XFS_ILOCK_SHARED);
+		*res = BULKSTAT_RV_NOTHING;
+		return(ENOENT);
+	}
+
+	/*
+	 * copy everything to the dm_stat buffer
+	 */
+	xfs_ip_to_stat(mp, buf, ip);
+
+	/*
+	 * Make the handle and the link to the next dm_stat buffer
+	 */
+	dm_vp_to_handle(XFS_ITOV(ip), &handle);
+	bcopy(&handle, buf+1, sizeof(handle));	/* handle follows stat struct */
+
+	buf->dt_handle.vd_offset = (ssize_t) sizeof(dm_stat_t);
+	buf->dt_handle.vd_length = (size_t) XFS_HSIZE(handle);
+
+	/*
+	 *  xfs_bulkstat advances its output buffer between calls to the
+	 *  formatter by the fixed size passed into it; this is that size.
+	 */
+	statstruct_sz = DM_STAT_SIZE(0);
+	statstruct_sz = (statstruct_sz+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+	buf->_link = statstruct_sz;
+
+	/*
+	 * This is unused in bulkstat - so we zero it out.
+	 */
+	bzero((void *) &buf->dt_compname, sizeof(dm_vardata_t));
+
+	xfs_iput(ip, XFS_ILOCK_SHARED);
+
+	*res = BULKSTAT_RV_DIDONE;
+	return(0);
+}
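+
+/* Each record emitted by xfs_dm_bulkstat_one() is laid out as
+
+	| dm_stat_t | xfs_handle_t | pad to DM_STAT_ALIGN |
+
+   where dt_handle.vd_offset/vd_length locate the handle within the
+   record and _link holds the byte offset of the next record.
+*/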
+
+
+STATIC int
+xfs_get_dirents(
+	xfs_inode_t	*dirp,
+	void		*bufp,
+	size_t		bufsz,
+	off_t		*locp,
+	size_t		*nreadp)
+{
+	int		sink;
+	struct uio	auio;
+	struct iovec	aiov;
+	int		rval;
+
+	*nreadp = 0;
+
+	aiov.iov_base = bufp;
+	aiov.iov_len = bufsz;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_offset = *locp;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_resid = bufsz;
+
+	rval = XFS_DIR_GETDENTS(dirp->i_mount, NULL, dirp, &auio, &sink);
+	if (! rval) {
+		*locp = (off_t) auio.uio_offset;
+
+		/*
+		 * number of bytes read into the dirent buffer
+		 */
+		*nreadp = bufsz - auio.uio_resid;
+	}
+	return(rval);
+}
+
+
+STATIC int
+xfs_dirents_to_stats(
+	xfs_mount_t	*mp,
+	xfs_dirent_t	*direntp,	/* array of dirent structs */
+	void		*bufp,		/* buffer to fill */
+	size_t		direntbufsz,	/* sz of filled part of dirent buf */
+	size_t		*spaceleftp,	/* IO - space left in user buffer */
+	size_t		*nwrittenp,	/* number of bytes written to 'bufp' */
+	off_t		*locp)
+{
+	xfs_dirent_t	*p;
+	dm_stat_t	*statp;
+	size_t		reclen;
+	size_t		namelen;
+	size_t		spaceleft;
+	off_t		prevoff;
+	int		res;
+
+	spaceleft = *spaceleftp;
+	*spaceleftp = 0;
+	*nwrittenp = 0;
+	prevoff = 0;	/* offset of last processed entry, for rollback */
+
+	/*
+	 * Go thru all the dirent records, making dm_stat structures from
+	 * them, one by one, until dirent buffer is empty or stat buffer
+	 * is full.
+	 */
+	p = direntp;
+	statp = (dm_stat_t *) bufp;
+	for (reclen = (size_t) p->d_reclen; direntbufsz > 0;
+					direntbufsz -= reclen,
+					p = (xfs_dirent_t *) ((char *) p + reclen),
+					reclen = (size_t) p->d_reclen) {
+
+		namelen = strlen(p->d_name) + 1;
+
+		/*
+		 * Make sure we have enough space.
+		 */
+		if (spaceleft <= DM_STAT_SIZE(namelen)) {
+			/*
+			 * d_off field in dirent_t points at the next entry.
+			 */
+			if (prevoff)	/* did at least one; update location */
+				*locp = prevoff;
+			*spaceleftp = 0;
+
+			/*
+			 * The last link is NULL.
+			 */
+			statp->_link = 0;
+			return(0);
+		}
+
+		statp = (dm_stat_t *) bufp;
+
+		(void)xfs_dm_bulkstat_one(mp, NULL, (xfs_ino_t)p->d_ino, statp, 0, 0, &res);
+		if (res != BULKSTAT_RV_DIDONE)
+			continue;
+
+		/*
+		 * On return from bulkstat_one(), statp->_link points
+		 * at the end of the handle in the stat structure.
+		 */
+		statp->dt_compname.vd_offset = statp->_link;
+		statp->dt_compname.vd_length = namelen;
+		/*
+		 * Directory entry name is guaranteed to be
+		 * null terminated; the copy gets the '\0' too.
+		 */
+		bcopy(p->d_name, (char *) statp + statp->_link, namelen);
+
+		/* Word-align the record */
+		statp->_link = (statp->_link + namelen + (DM_STAT_ALIGN - 1))
+			& ~(DM_STAT_ALIGN - 1);
+
+		spaceleft -= statp->_link;
+		*nwrittenp += statp->_link;
+		bufp = (char *)statp + statp->_link;
+
+		/*
+		 * We need to rollback to this position if something happens.
+		 * So we remember it.
+		 */
+		prevoff = p->d_off;
+	}
+	statp->_link = 0;
+
+	/*
+	 * If there's space left to put in more, the caller should know that.
+	 */
+	if (spaceleft > DM_STAT_SIZE(MAXNAMLEN)) {
+		*spaceleftp = spaceleft;
+	}
+	return(0);
+}
+
+
+/* xfs_dm_f_get_eventlist - return the dm_eventset_t mask for inode vp. */
+
+STATIC int
+xfs_dm_f_get_eventlist(
+	bhv_desc_t	*bdp,
+	dm_right_t	right,
+	u_int		nelem,
+	dm_eventset_t	*eventsetp,		/* in kernel space! */
+	u_int		*nelemp)		/* in kernel space! */
+{
+	dm_eventset_t	eventset;
+	xfs_inode_t	*ip;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	/* Note that we MUST return a regular file's managed region bits as
+	   part of the mask because dm_get_eventlist is supposed to return the
+	   union of all managed region flags in those bits.  Since we only
+	   support one region, we can just return the bits as they are.	 For
+	   all other object types, the bits will already be zero.  Handy, huh?
+	*/
+
+	ip = XFS_BHVTOI(bdp);
+	eventset = ip->i_d.di_dmevmask;
+
+	/* Now copy the event mask and event count back to the caller.	We
+	   return the lesser of nelem and DM_EVENT_MAX.
+	*/
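+	/* e.g. nelem == 3 keeps only events 0..2: (1 << 3) - 1 == 0x7 */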
+
+	if (nelem > DM_EVENT_MAX)
+		nelem = DM_EVENT_MAX;
+	eventset &= (1 << nelem) - 1;
+
+	*eventsetp = eventset;
+	*nelemp = nelem;
+	return(0);
+}
+
+
+/* xfs_dm_f_set_eventlist - update the dm_eventset_t mask in the inode vp.  Only the
+   bits from zero to maxevent-1 are being replaced; higher bits are preserved.
+*/
+
+STATIC int
+xfs_dm_f_set_eventlist(
+	bhv_desc_t	*bdp,
+	dm_right_t	right,
+	dm_eventset_t	*eventsetp,	/* in kernel space! */
+	u_int		maxevent)
+{
+	dm_eventset_t	eventset;
+	dm_eventset_t	max_mask;
+	dm_eventset_t	valid_events;
+	vnode_t		*vp;
+	xfs_inode_t	*ip;
+	xfs_trans_t	*tp;
+	xfs_mount_t	*mp;
+	int		error;
+
+	if (right < DM_RIGHT_EXCL)
+		return(EACCES);
+
+	eventset = *eventsetp;
+	if (maxevent >= sizeof(ip->i_d.di_dmevmask) * NBBY)
+		return(EINVAL);
+	max_mask = (1 << maxevent) - 1;
+
+	vp = BHV_TO_VNODE(bdp);
+	if (vp->v_type == VDIR) {
+		valid_events = DM_XFS_VALID_DIRECTORY_EVENTS;
+	} else {	/* file or symlink */
+		valid_events = DM_XFS_VALID_FILE_EVENTS;
+	}
+	if ((eventset & max_mask) & ~valid_events)
+		return(EINVAL);
+
+	/* Adjust the event mask so that the managed region bits will not
+	   be altered.
+	*/
+
+	max_mask &= ~(1 << DM_EVENT_READ);	/* preserve current MR bits */
+	max_mask &= ~(1 << DM_EVENT_WRITE);
+	max_mask &= ~(1 << DM_EVENT_TRUNCATE);
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return(error);
+	}
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
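+	/* Merge bit-wise: bits outside max_mask (including the MR bits
+	 * removed from it above) keep their current values.
+	 */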
+	ip->i_d.di_dmevmask = (eventset & max_mask) | (ip->i_d.di_dmevmask & ~max_mask);
+	ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	VN_HOLD(vp);
+	xfs_trans_commit(tp, 0, NULL);
+
+	return(0);
+}
+
+
+/* xfs_dm_fs_get_eventlist - return the dm_eventset_t mask for filesystem vfsp. */
+
+STATIC int
+xfs_dm_fs_get_eventlist(
+	bhv_desc_t	*bdp,
+	dm_right_t	right,
+	u_int		nelem,
+	dm_eventset_t	*eventsetp,		/* in kernel space! */
+	u_int		*nelemp)		/* in kernel space! */
+{
+	dm_eventset_t	eventset;
+	xfs_mount_t	*mp;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	mp = XFS_BHVTOI(bdp)->i_mount;
+	eventset = mp->m_dmevmask;
+
+	/* Now copy the event mask and event count back to the caller.	We
+	   return the lesser of nelem and DM_EVENT_MAX.
+	*/
+
+	if (nelem > DM_EVENT_MAX)
+		nelem = DM_EVENT_MAX;
+	eventset &= (1 << nelem) - 1;
+
+	*eventsetp = eventset;
+	*nelemp = nelem;
+	return(0);
+}
+
+
+/* xfs_dm_fs_set_eventlist - update the dm_eventset_t mask in the mount structure for
+   filesystem vfsp.  Only the bits from zero to maxevent-1 are being replaced;
+   higher bits are preserved.
+*/
+
+STATIC int
+xfs_dm_fs_set_eventlist(
+	bhv_desc_t	*bdp,
+	dm_right_t	right,
+	dm_eventset_t	*eventsetp,	/* in kernel space! */
+	u_int		maxevent)
+{
+	dm_eventset_t	eventset;
+	dm_eventset_t	max_mask;
+	xfs_mount_t	*mp;
+
+	if (right < DM_RIGHT_EXCL)
+		return(EACCES);
+
+	eventset = *eventsetp;
+
+	mp = XFS_BHVTOI(bdp)->i_mount;
+	if (maxevent >= sizeof(mp->m_dmevmask) * NBBY)
+		return(EINVAL);
+	max_mask = (1 << maxevent) - 1;
+
+	if ((eventset & max_mask) & ~DM_XFS_VALID_FS_EVENTS)
+		return(EINVAL);
+
+	mp->m_dmevmask = (eventset & max_mask) | (mp->m_dmevmask & ~max_mask);
+	return(0);
+}
+
+
+/* Code in this routine must exactly match the logic in xfs_diordwr() in
+   order for this to work!
+*/
+
+STATIC int
+xfs_dm_direct_ok(
+	bhv_desc_t	*bdp,
+	dm_off_t	off,
+	dm_size_t	len,
+	void		*bufp)
+{
+	xfs_mount_t	*mp;
+	xfs_inode_t	*ip;
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	/* Realtime files can ONLY do direct I/O. */
+
+	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
+		return(1);
+
+	/* If direct I/O is disabled, or if the request is too small, use
+	   buffered I/O.
+	*/
+
+	if (!dm_min_dio_xfer || len < dm_min_dio_xfer)
+		return(0);
+
+#if 0
+	/* If the request is not well-formed or is too large, use
+	   buffered I/O.
+	*/
+
+	if ((__psint_t)bufp & scache_linemask)	/* if buffer not aligned */
+		return(0);
+	if (off & mp->m_blockmask)		/* if file offset not aligned */
+		return(0);
+	if (len & mp->m_blockmask)		/* if xfer length not aligned */
+		return(0);
+	if (len > ctooff(v.v_maxdmasz - 1))	/* if transfer too large */
+		return(0);
+
+	/* A valid direct I/O candidate. */
+
+	return(1);
+#else
+	return(0);
+#endif
+}
+
+
+/* We need to be able to select various combinations of FINVIS, O_NONBLOCK,
+   O_DIRECT, and O_SYNC, yet we don't have a file descriptor and we don't have
+   the file's pathname.	 All we have is a handle.
+*/
+
+STATIC int
+xfs_dm_rdwr(
+	vnode_t		*vp,
+	uint		fflag,
+	mode_t		fmode,
+	dm_off_t	off,
+	dm_size_t	len,
+	void		*bufp,
+	int		*rvp)
+{
+	int		error;
+	int		oflags;
+	ssize_t		xfer;
+	struct file	file;
+	struct inode	*ip;
+	struct dentry	*dentry;
+	struct list_head *lp;
+	bhv_desc_t	*xbdp;
+
+	if (off < 0 || vp->v_type != VREG)
+		return(EINVAL);
+
+	if (fmode & FMODE_READ) {
+		XFS_STATS_INC(xfsstats.xs_read_calls);
+		oflags = O_RDONLY;
+	} else {
+		XFS_STATS_INC(xfsstats.xs_write_calls);
+		oflags = O_WRONLY;
+	}
+
+	/* Build file descriptor flags and I/O flags.  O_NONBLOCK is needed so
+	   that we don't block on mandatory file locks.	 FINVIS is needed so
+	   that we don't change any file timestamps.
+	*/
+
+	fmode |= FINVIS;
+	oflags |= O_NONBLOCK;
+	XFS_BHV_LOOKUP(vp, xbdp);
+	if (xfs_dm_direct_ok(xbdp, off, len, bufp))
+		oflags |= O_DIRECT;
+
+	if (fflag & O_SYNC)
+		oflags |= O_SYNC;
+
+	ip = LINVFS_GET_IP(vp);
+	if( ip->i_fop == NULL ){
+		/* no iput; caller did get, and will do put */
+		return(EINVAL);
+	}
+	igrab(ip);
+
+	/* Find a dentry.  Get a well-connected one, if possible. */
+	spin_lock(&dcache_lock);
+	for (lp = ip->i_dentry.next; lp != &ip->i_dentry ; lp=lp->next) {
+		dentry = list_entry(lp,struct dentry, d_alias);
+		if (! (dentry->d_flags & DCACHE_NFSD_DISCONNECTED)) {
+			dget_locked(dentry);
+			dentry->d_vfs_flags |= DCACHE_REFERENCED;
+			spin_unlock(&dcache_lock);
+			iput(ip);
+			goto found;
+		}
+	}
+	spin_unlock(&dcache_lock);
+	dentry = d_alloc_root(ip);
+	if (dentry == NULL) {
+		iput(ip);
+		return ENOMEM;
+	}
+	dentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
+
+found:
+	if( ip->i_ino != dentry->d_inode->i_ino ){
+		dput(dentry);
+		return EINVAL;
+	}
+
+	if (fmode & FMODE_WRITE) {
+		error = get_write_access(ip);
+		if (error) {
+			dput(dentry);
+			return(-error);
+		}
+	}
+
+	error = init_private_file( &file, dentry, fmode );
+	if(error){
+		if (error == -EFBIG) {
+			/* try again */
+			oflags |= O_LARGEFILE;
+			file.f_flags = oflags;
+			error = file.f_op->open( dentry->d_inode, &file );
+		}
+		if (error) {
+			if (fmode & FMODE_WRITE)
+				put_write_access(ip);
+			dput(dentry);
+			return(EINVAL);
+		}
+	}
+
+	file.f_flags = oflags;
+
+	if (fmode & FMODE_READ) {
+		VOP_READ(vp, &file, bufp, len, &off, NULL, xfer);
+	} else {
+		VOP_WRITE(vp, &file, bufp, len, &off, NULL, xfer);
+	}
+	if (xfer >= 0) {
+		*rvp = xfer;
+		error = 0;
+		linvfs_revalidate_core(ip, ATTR_COMM);
+		if (fmode & FMODE_READ) {
+			XFS_STATS_ADD(xfsstats.xs_read_bytes, xfer);
+		} else {
+			XFS_STATS_ADD(xfsstats.xs_write_bytes, xfer);
+		}
+	} else {
+		error = -(int)xfer;
+	}
+
+	if (file.f_mode & FMODE_WRITE)
+		put_write_access(ip);
+	if (file.f_op->release)
+		file.f_op->release(ip, &file);
+	dput(dentry);
+	return error;
+}
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_clear_inherit(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_attrname_t	*attrnamep)
+{
+	return(ENOSYS);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_create_by_handle(
+	vnode_t		*vp,
+	dm_right_t	right,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname)
+{
+	return(ENOSYS);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_downgrade_right(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		type)		/* DM_FSYS_OBJ or zero */
+{
+#ifdef	DEBUG_RIGHTS
+	char		buffer[sizeof(xfs_handle_t) * 2 + 1];
+	bhv_desc_t	*bdp;
+
+	XFS_BHV_LOOKUP(vp, bdp);
+
+	if (!xfs_bdp_to_hexhandle(bdp, type, buffer)) {
+		printf("dm_downgrade_right: old %d new %d type %d handle %s\n",
+			right, DM_RIGHT_SHARED, type, buffer);
+	} else {
+		printf("dm_downgrade_right: old %d new %d type %d handle "
+			"<INVALID>\n", right, DM_RIGHT_SHARED, type);
+	}
+#endif	/* DEBUG_RIGHTS */
+	return(0);
+}
+
+
+/* Note: xfs_dm_get_allocinfo() makes no attempt to coalesce two adjacent
+   extents when both are of type DM_EXTENT_RES; this is left to the caller.
+   XFS guarantees that there will never be two adjacent DM_EXTENT_HOLE extents.
+
+   In order to provide the caller with all extents in a file including
+   those beyond the file's last byte offset, we have to use the xfs_bmapi()
+   interface.  (VOP_BMAP won't let us see past EOF, and xfs_getbmap is too
+   buggy.)
+*/
+
+STATIC int
+xfs_dm_get_allocinfo_rvp(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_off_t	*offp,
+	u_int		nelem,
+	dm_extent_t	*extentp,
+	u_int		*nelemp,
+	int		*rvp)
+{
+	xfs_inode_t	*ip;		/* xfs incore inode pointer */
+	xfs_mount_t	*mp;		/* file system mount point */
+	xfs_fileoff_t	fsb_offset;
+	xfs_filblks_t	fsb_length;
+	dm_off_t	startoff;
+	int		elem;
+	bhv_desc_t	*xbdp;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	if (copy_from_user( &startoff, offp, sizeof(startoff)))
+		return(EFAULT);
+
+	if (startoff > XFS_MAX_FILE_OFFSET)
+		return(EINVAL);
+
+	if (nelem == 0) {
+		if (put_user(1, nelemp))
+			return(EFAULT);
+		return(E2BIG);
+	}
+
+	XFS_BHV_LOOKUP(vp, xbdp);
+
+	ip = XFS_BHVTOI(xbdp);
+	mp = ip->i_mount;
+
+	/* Convert the caller's starting offset into filesystem allocation
+	   units as required by xfs_bmapi().  Round the offset down so that
+	   it is sure to be included in the reply.
+	*/
+
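+	/* e.g. with 4K blocks a startoff of 10000 yields fsb_offset 2
+	   (byte 8192); the 1808-byte difference reappears below as the
+	   first extent's bias.
+	*/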
+	fsb_offset = XFS_B_TO_FSBT(mp, startoff);
+	fsb_length = XFS_B_TO_FSB(mp, XFS_MAX_FILE_OFFSET) - fsb_offset;
+	elem = 0;
+
+	while (fsb_length && elem < nelem) {
+		xfs_bmbt_irec_t bmp[50];
+		dm_extent_t	extent;
+		xfs_filblks_t	fsb_bias;
+		dm_size_t	bias;
+		int		error;
+		int		lock;
+		int		num;
+		int		i;
+
+		/* Compute how many bmap records (xfs_bmbt_irec_t) to ask
+		   xfs_bmapi() for on this call.
+		*/
+
+		num = MIN(nelem - elem, sizeof(bmp) / sizeof(bmp[0]));
+
+		xfs_ilock(ip, XFS_IOLOCK_SHARED);
+		lock = xfs_ilock_map_shared(ip);
+
+		error = xfs_bmapi(NULL, ip, fsb_offset, fsb_length,
+			XFS_BMAPI_ENTIRE, NULL, 0, bmp, &num, NULL);
+
+		xfs_iunlock_map_shared(ip, lock);
+		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+		if (error)
+			return(error);
+
+		/* Fill in the caller's extents, adjusting the bias in the
+		   first entry if necessary.
+		*/
+
+		for (i = 0; i < num; i++, extentp++) {
+			bias = startoff - XFS_FSB_TO_B(mp, bmp[i].br_startoff);
+			extent.ex_offset = startoff;
+			extent.ex_length =
+				XFS_FSB_TO_B(mp, bmp[i].br_blockcount) - bias;
+			if (bmp[i].br_startblock == HOLESTARTBLOCK) {
+				extent.ex_type = DM_EXTENT_HOLE;
+			} else {
+				extent.ex_type = DM_EXTENT_RES;
+			}
+			startoff = extent.ex_offset + extent.ex_length;
+
+			if (copy_to_user( extentp, &extent, sizeof(extent)))
+				return(EFAULT);
+
+			fsb_bias = fsb_offset - bmp[i].br_startoff;
+			fsb_offset += bmp[i].br_blockcount - fsb_bias;
+			fsb_length -= bmp[i].br_blockcount - fsb_bias;
+			elem++;
+		}
+	}
+
+	if (fsb_length == 0) {
+		startoff = 0;
+	}
+	if (copy_to_user( offp, &startoff, sizeof(startoff)))
+		return(EFAULT);
+
+	if (copy_to_user( nelemp, &elem, sizeof(elem)))
+		return(EFAULT);
+
+	*rvp = (fsb_length == 0 ? 0 : 1);
+
+	return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_bulkall_rvp(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		mask,
+	dm_attrname_t	*attrnamep,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,		/* address of buffer in user space */
+	size_t		*rlenp,		/* user space address */
+	int		*rvalp)
+{
+	return(ENOSYS);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_bulkattr_rvp(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		mask,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp,
+	int		*rvalp)
+{
+	int		error, done;
+	int		nelems;
+	u_int		statstruct_sz;
+	dm_attrloc_t	loc;
+	bhv_desc_t	*mp_bdp;
+	xfs_mount_t	*mp;
+	vfs_t		*vfsp = vp->v_vfsp;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	if (copy_from_user( &loc, locp, sizeof(loc)))
+		return(EFAULT);
+
+	/* Because we will write directly to the user's buffer, make sure that
+	   the buffer is properly aligned.
+	*/
+
+	if (((__psint_t)bufp & (DM_STAT_ALIGN - 1)) != 0)
+		return(EFAULT);
+
+	/* size of the handle is constant for this function */
+
+	statstruct_sz = DM_STAT_SIZE(0);
+	statstruct_sz = (statstruct_sz+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+
+	nelems = buflen / statstruct_sz;
+	if (nelems < 1) {
+		if (put_user( statstruct_sz, rlenp ))
+			return(EFAULT);
+		return(E2BIG);
+	}
+
+	mp_bdp = bhv_lookup(VFS_BHVHEAD(vfsp), &xfs_vfsops);
+	ASSERT(mp_bdp);
+	mp = XFS_BHVTOM(mp_bdp);
+
+	/*
+	 * fill the buffer with dm_stat_t's
+	 */
+
+	error = xfs_bulkstat(mp, NULL,
+			     (xfs_ino_t *)&loc,
+			     &nelems,
+			     xfs_dm_bulkstat_one,
+			     statstruct_sz,
+			     bufp,
+			     BULKSTAT_FG_IGET,
+			     &done);
+	if (error)
+		return(error);
+	if (!done) {
+		*rvalp = 1;
+	} else {
+		*rvalp = 0;
+	}
+
+	if (put_user( statstruct_sz * nelems, rlenp ))
+		return(EFAULT);
+
+	if (copy_to_user( locp, &loc, sizeof(loc)))
+		return(EFAULT);
+
+	/*
+	 *  If we didn't do any, we must not have any more to do.
+	 */
+	if (nelems < 1)
+		return(0);
+	/* set _link in the last struct to zero */
+	if (put_user( 0,
+	    &((dm_stat_t *)((char *)bufp + statstruct_sz*(nelems-1)))->_link)
+	   )
+		return(EFAULT);
+	return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_config(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_config_t	flagname,
+	dm_size_t	*retvalp)
+{
+	dm_size_t	retval;
+
+	switch (flagname) {
+	case DM_CONFIG_DTIME_OVERLOAD:
+	case DM_CONFIG_PERS_ATTRIBUTES:
+	case DM_CONFIG_PERS_EVENTS:
+	case DM_CONFIG_PERS_MANAGED_REGIONS:
+	case DM_CONFIG_PUNCH_HOLE:
+	case DM_CONFIG_WILL_RETRY:
+		retval = DM_TRUE;
+		break;
+
+	case DM_CONFIG_CREATE_BY_HANDLE:	/* these will never be done */
+	case DM_CONFIG_LOCK_UPGRADE:
+	case DM_CONFIG_PERS_INHERIT_ATTRIBS:
+		retval = DM_FALSE;
+		break;
+
+	case DM_CONFIG_BULKALL:			/* these will be done someday */
+		retval = DM_FALSE;
+		break;
+	case DM_CONFIG_MAX_ATTR_ON_DESTROY:
+		retval = DM_MAX_ATTR_BYTES_ON_DESTROY;
+		break;
+
+	case DM_CONFIG_MAX_ATTRIBUTE_SIZE:
+		retval = ATTR_MAX_VALUELEN;
+		break;
+
+	case DM_CONFIG_MAX_HANDLE_SIZE:
+		retval = DM_MAX_HANDLE_SIZE;
+		break;
+
+	case DM_CONFIG_MAX_MANAGED_REGIONS:
+		retval = 1;
+		break;
+
+	case DM_CONFIG_TOTAL_ATTRIBUTE_SPACE:
+		retval = 0x7fffffff;	/* actually it's unlimited */
+		break;
+
+	default:
+		return(EINVAL);
+	}
+
+	/* Copy the results back to the user. */
+
+	if (copy_to_user( retvalp, &retval, sizeof(retval)))
+		return(EFAULT);
+	return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_config_events(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		nelem,
+	dm_eventset_t	*eventsetp,
+	u_int		*nelemp)
+{
+	dm_eventset_t	eventset;
+
+	if (nelem == 0)
+		return(EINVAL);
+
+	eventset = DM_XFS_SUPPORTED_EVENTS;
+
+	/* Now copy the event mask and event count back to the caller.	We
+	   return the lesser of nelem and DM_EVENT_MAX.
+	*/
+
+	if (nelem > DM_EVENT_MAX)
+		nelem = DM_EVENT_MAX;
+	eventset &= (1 << nelem) - 1;
+
+	if (copy_to_user( eventsetp, &eventset, sizeof(eventset)))
+		return(EFAULT);
+
+	if (put_user(nelem, nelemp))
+		return(EFAULT);
+	return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_destroy_dmattr(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_attrname_t	*attrnamep,
+	char		**valuepp,
+	int		*vlenp)
+{
+	char		buffer[XFS_BUG_KLUDGE];
+	dm_dkattrname_t dkattrname;
+	int		alloc_size;
+	int		value_len;
+	char		*value;
+	int		error;
+
+	*vlenp = -1;		/* assume failure by default */
+
+	if (attrnamep->an_chars[0] == '\0')
+		return(EINVAL);
+
+	/* Build the on-disk version of the attribute name. */
+
+	strcpy(dkattrname.dan_chars, dmattr_prefix);
+	strncpy(&dkattrname.dan_chars[DMATTR_PREFIXLEN],
+		(char *)attrnamep->an_chars, DM_ATTR_NAME_SIZE + 1);
+	dkattrname.dan_chars[sizeof(dkattrname.dan_chars) - 1] = '\0';
+
+	/* VOP_ATTR_GET will not return anything if the buffer is too small,
+	   and we don't know how big to make the buffer, so this may take
+	   two tries to get it right.  The initial try must use a buffer of
+	   at least XFS_BUG_KLUDGE bytes to prevent buffer overflow because
+	   of a bug in XFS.
+	*/
+
+	alloc_size = 0;
+	value_len = sizeof(buffer);	/* in/out parameter */
+	value = buffer;
+
+	VOP_ATTR_GET(vp, dkattrname.dan_chars, value, &value_len,
+			ATTR_ROOT, sys_cred, error);
+
+	if (error == ERANGE) {
+		alloc_size = value_len;
+		value = kmalloc(alloc_size, SLAB_KERNEL);
+		if (value == NULL) {
+			printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+			return(ENOMEM);
+		}
+
+		VOP_ATTR_GET(vp, dkattrname.dan_chars, value,
+			&value_len, ATTR_ROOT, sys_cred, error);
+	}
+	if (error) {
+		if (alloc_size)
+			kfree(value);
+		DM_EA_XLATE_ERR(error);
+		return(error);
+	}
+
+	/* The attribute exists and has a value.  Note that a value_len of
+	   zero is valid!
+	*/
+
+	if (value_len == 0) {
+		*vlenp = 0;
+		return(0);
+	}
+
+	if (!alloc_size) {
+		value = kmalloc(value_len, SLAB_KERNEL);
+		if (value == NULL) {
+			printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+			return(ENOMEM);
+		}
+		bcopy(buffer, value, value_len);
+	} else if (value_len > DM_MAX_ATTR_BYTES_ON_DESTROY) {
+		int	value_len2 = DM_MAX_ATTR_BYTES_ON_DESTROY;
+		char	*value2;
+
+		value2 = kmalloc(value_len2, SLAB_KERNEL);
+		if (value2 == NULL) {
+			printk("%s/%d: kmalloc returned NULL\n", __FUNCTION__, __LINE__);
+			kfree(value);
+			return(ENOMEM);
+		}
+		bcopy(value, value2, value_len2);
+		kfree(value);
+		value = value2;
+		value_len = value_len2;
+	}
+	*vlenp = value_len;
+	*valuepp = value;
+	return(0);
+}
+
+/* This code was taken from xfs_fcntl(F_DIOINFO) and modified slightly because
+   we don't have a flags parameter (no open file).
+   Taken from xfs_ioctl(XFS_IOC_DIOINFO) on Linux.
+*/
+
+STATIC int
+xfs_dm_get_dioinfo(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_dioinfo_t	*diop)
+{
+	dm_dioinfo_t	dio;
+	xfs_mount_t	*mp;
+	xfs_inode_t	*ip;
+	bhv_desc_t	*xbdp;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	XFS_BHV_LOOKUP(vp, xbdp);
+
+	ip = XFS_BHVTOI(xbdp);
+	mp = ip->i_mount;
+
+	/*
+	 * this only really needs to be BBSIZE.
+	 * it is set to the file system block size to
+	 * avoid having to do block zeroing on short writes.
+	 */
+	dio.d_miniosz = mp->m_sb.sb_blocksize;
+	dio.d_maxiosz = XFS_FSB_TO_B(mp,
+			    XFS_B_TO_FSBT(mp, KIO_MAX_ATOMIC_IO << 10));
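+	/* d_mem: required memory alignment of user buffers for direct I/O;
+	 * 512 here matches the basic sector size.  (Field meaning taken
+	 * from the analogous XFS_IOC_DIOINFO struct dioattr.)
+	 */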
+	dio.d_mem = 512;
+
+	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+		dio.d_dio_only = DM_TRUE;
+	} else {
+		dio.d_dio_only = DM_FALSE;
+	}
+
+	if (copy_to_user(diop, &dio, sizeof(dio)))
+		return(EFAULT);
+	return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_dirattrs_rvp(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		mask,
+	dm_attrloc_t	*locp,
+	size_t		buflen,
+	void		*bufp,		/* address of buffer in user space */
+	size_t		*rlenp,		/* user space address */
+	int		*rvp)
+{
+	xfs_inode_t	*dp;
+	xfs_mount_t	*mp;
+	size_t		direntbufsz, statbufsz;
+	size_t		nread = 0, spaceleft, nwritten = 0;
+	void		*direntp, *statbufp;
+	uint		lock_mode;
+	ulong		dir_gen = 0;	/* directory generation; survives loop iterations */
+	int		error;
+	dm_attrloc_t	loc;
+	bhv_desc_t	*xbdp;
+	bhv_desc_t	*mp_bdp;
+	vfs_t		*vfsp = vp->v_vfsp;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	if (copy_from_user( &loc, locp, sizeof(loc)))
+		return(EFAULT);
+
+	if ((buflen / DM_STAT_SIZE(MAXNAMLEN)) == 0) {
+		if (put_user( DM_STAT_SIZE(MAXNAMLEN), rlenp ))
+			return(EFAULT);
+		return(E2BIG);
+	}
+
+	mp_bdp = bhv_lookup(VFS_BHVHEAD(vfsp), &xfs_vfsops);
+	ASSERT(mp_bdp);
+	xbdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
+	ASSERT(xbdp);
+
+	mp = XFS_BHVTOM(mp_bdp);
+	dp = XFS_BHVTOI(xbdp);
+	if ((dp->i_d.di_mode & IFMT) != IFDIR)
+		return(ENOTDIR);
+
+	/*
+	 * Don't get more dirents than are guaranteed to fit.
+	 * The minimum that the stat buf holds is the buf size over
+	 * maximum entry size.	That times the minimum dirent size
+	 * is an overly conservative size for the dirent buf.
+	 */
+	statbufsz = NBPP;
+	direntbufsz = (NBPP / DM_STAT_SIZE(MAXNAMLEN)) * sizeof(xfs_dirent_t);
+
+	direntp = kmem_alloc(direntbufsz, KM_SLEEP);
+	statbufp = kmem_alloc(statbufsz, KM_SLEEP);
+	error = 0;
+	spaceleft = buflen;
+	/*
+	 * Keep getting dirents until the user buffer is packed with
+	 * dm_stat structures.
+	 */
+	do {
+
+		lock_mode = xfs_ilock_map_shared(dp);
+		/* See if the directory was removed after it was opened. */
+		if (dp->i_d.di_nlink <= 0) {
+			xfs_iunlock_map_shared(dp, lock_mode);
+			error = ENOENT;
+			break;
+		}
+		if (dir_gen == 0)
+			dir_gen = dp->i_gen;
+		else if (dir_gen != dp->i_gen) {
+			/* if dir changed, quit.  May be overzealous... */
+			xfs_iunlock_map_shared(dp, lock_mode);
+			break;
+		}
+		error = xfs_get_dirents(dp, direntp, direntbufsz, (off_t *)&loc,
+					&nread);
+		xfs_iunlock_map_shared(dp, lock_mode);
+
+		if (error) {
+			break;
+		}
+		if (nread == 0)
+			break;
+		/*
+		 * Now iterate thru them and call bulkstat_one() on all
+		 * of them
+		 */
+		error = xfs_dirents_to_stats(mp,
+					  (xfs_dirent_t *) direntp,
+					  statbufp,
+					  nread,
+					  &spaceleft,
+					  &nwritten,
+					  (off_t *)&loc);
+		if (error) {
+			break;
+		}
+
+		if (nwritten) {
+			if (copy_to_user( bufp, statbufp, nwritten)) {
+				error = EFAULT;
+				break;
+			}
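+			/* Only one statbuf batch is returned per call --
+			 * bufp is never advanced -- so stop here and let
+			 * the caller come back with the updated location.
+			 */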
+			break;
+		}
+	} while (spaceleft);
+	/*
+	 *  If xfs_get_dirents found anything, there might be more to do.
+	 *  If it didn't read anything, signal all done (rval == 0).
+	 *  (Doesn't matter either way if there was an error.)
+	 */
+	if (nread) {
+		*rvp = 1;
+	} else {
+		*rvp = 0;
+	}
+
+	kmem_free(statbufp, statbufsz);
+	kmem_free(direntp, direntbufsz);
+	if (!error){
+		if (put_user( buflen - spaceleft, rlenp))
+			return(EFAULT);
+	}
+
+	if (!error && copy_to_user(locp, &loc, sizeof(loc)))
+		error = EFAULT;
+	return(error);
+}
+
+
+STATIC int
+xfs_dm_get_dmattr(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_attrname_t	*attrnamep,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp)
+{
+	dm_dkattrname_t name;
+	char		*value;
+	int		value_len;
+	int		alloc_size;
+	int		error;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
+		return(error);
+
+	/* Allocate a buffer to receive the attribute's value.	We allocate
+	   at least one byte even if the caller specified a buflen of zero.
+	   (A buflen of zero is considered valid.)
+
+	   Allocating a minimum of XFS_BUG_KLUDGE bytes temporarily works
+	   around a bug within XFS in which in-inode attribute values are not
+	   checked to see if they will fit in the buffer before they are
+	   copied.  Since no in-core attribute value can be larger than 256
+	   bytes (an 8-bit size field), we allocate that minimum size here to
+	   prevent buffer overrun in both the kernel's and user's buffers.
+	*/
+
+	alloc_size = buflen;
+	if (alloc_size < XFS_BUG_KLUDGE)
+		alloc_size = XFS_BUG_KLUDGE;
+	if (alloc_size > ATTR_MAX_VALUELEN)
+		alloc_size = ATTR_MAX_VALUELEN;
+	value = kmem_alloc(alloc_size, KM_SLEEP);
+
+	/* Get the attribute's value. */
+
+	value_len = alloc_size;		/* in/out parameter */
+
+	VOP_ATTR_GET(vp, name.dan_chars, value, &value_len,
+			ATTR_ROOT, NULL, error);
+	DM_EA_XLATE_ERR(error);
+
+	/* DMAPI requires an errno of ENOENT if an attribute does not exist,
+	   so remap ENOATTR here.
+	*/
+
+	if (error == ENOATTR)
+		error = ENOENT;
+	if (!error && value_len > buflen)
+		error = E2BIG;
+	if (!error && copy_to_user(bufp, value, value_len))
+		error = EFAULT;
+	if (!error || error == E2BIG) {
+		if (put_user(value_len, rlenp))
+			error = EFAULT;
+	}
+
+	kmem_free(value, alloc_size);
+	return(error);
+}
+
+STATIC int
+xfs_dm_get_eventlist(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		type,
+	u_int		nelem,
+	dm_eventset_t	*eventsetp,
+	u_int		*nelemp)
+{
+	int		error;
+	bhv_desc_t	*xbdp;
+
+	XFS_BHV_LOOKUP(vp, xbdp);
+
+	if (type == DM_FSYS_OBJ) {
+		error = xfs_dm_fs_get_eventlist(xbdp, right, nelem,
+			eventsetp, nelemp);
+	} else {
+		error = xfs_dm_f_get_eventlist(xbdp, right, nelem,
+			eventsetp, nelemp);
+	}
+	return(error);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_fileattr(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		mask,		/* not used; always return everything */
+	dm_stat_t	*statp)
+{
+	dm_stat_t	stat;
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp;
+	bhv_desc_t	*xbdp;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	XFS_BHV_LOOKUP(vp, xbdp);
+
+	/* Find the mount point. */
+
+	ip = XFS_BHVTOI(xbdp);
+	mp = ip->i_mount;
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	xfs_ip_to_stat(mp, &stat, ip);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (copy_to_user( statp, &stat, sizeof(stat)))
+		return(EFAULT);
+	return(0);
+}
+
+
+/* We currently only support a maximum of one managed region per file, and
+   use the DM_EVENT_READ, DM_EVENT_WRITE, and DM_EVENT_TRUNCATE events in
+   the file's dm_eventset_t event mask to implement the DM_REGION_READ,
+   DM_REGION_WRITE, and DM_REGION_TRUNCATE flags for that single region.
+*/
+
+STATIC int
+xfs_dm_get_region(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		nelem,
+	dm_region_t	*regbufp,
+	u_int		*nelemp)
+{
+	dm_eventset_t	evmask;
+	dm_region_t	region;
+	xfs_inode_t	*ip;
+	u_int		elem;
+	bhv_desc_t	*xbdp;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	XFS_BHV_LOOKUP(vp, xbdp);
+
+	ip = XFS_BHVTOI(xbdp);
+	evmask = ip->i_d.di_dmevmask;	/* read the mask "atomically" */
+
+	/* Get the file's current managed region flags out of the
+	   dm_eventset_t mask and use them to build a managed region that
+	   covers the entire file, i.e. set rg_offset and rg_size to zero.
+	*/
+
+	bzero((char *)&region, sizeof(region));
+
+	if (evmask & (1 << DM_EVENT_READ))
+		region.rg_flags |= DM_REGION_READ;
+	if (evmask & (1 << DM_EVENT_WRITE))
+		region.rg_flags |= DM_REGION_WRITE;
+	if (evmask & (1 << DM_EVENT_TRUNCATE))
+		region.rg_flags |= DM_REGION_TRUNCATE;
+
+	elem = (region.rg_flags ? 1 : 0);
+
+	if (copy_to_user( nelemp, &elem, sizeof(elem)))
+		return(EFAULT);
+	if (elem > nelem)
+		return(E2BIG);
+	if (elem && copy_to_user(regbufp, &region, sizeof(region)))
+		return(EFAULT);
+	return(0);
+}
+
+
+STATIC int
+xfs_dm_getall_dmattr(
+	vnode_t		*vp,
+	dm_right_t	right,
+	size_t		buflen,
+	void		*bufp,
+	size_t		*rlenp)
+{
+	attrlist_cursor_kern_t cursor;
+	attrlist_t	*attrlist;
+	dm_attrlist_t	*ulist;
+	int		*last_link;
+	int		alignment;
+	int		total_size;
+	int		list_size = 8192;	/* should be big enough */
+	int		error;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	/* Verify that the user gave us a buffer that is 4-byte aligned, lock
+	   it down, and work directly within that buffer.  As a side-effect,
+	   values of buflen < sizeof(int) return EINVAL.
+	*/
+
+	alignment = sizeof(int) - 1;
+	if (((__psint_t)bufp & alignment) != 0) {
+		return(EFAULT);
+	}
+	buflen &= ~alignment;		/* round buflen down to the alignment */
+
+#if defined(HAVE_USERACC)
+	if ((error = useracc(bufp, buflen, B_READ, NULL)) != 0)
+		return error;
+#endif
+
+	/* Initialize all the structures and variables for the main loop. */
+
+	bzero(&cursor, sizeof(cursor));
+	attrlist = (attrlist_t *)kmem_alloc(list_size, KM_SLEEP);
+	total_size = 0;
+	ulist = (dm_attrlist_t *)bufp;
+	last_link = NULL;
+
+	/* Use VOP_ATTR_LIST to get the names of DMAPI attributes, and use
+	   VOP_ATTR_GET to get their values.  There is a risk here that the
+	   DMAPI attributes could change between the VOP_ATTR_LIST and
+	   VOP_ATTR_GET calls.	If we can detect it, we return EIO to notify
+	   the user.
+	*/
+
+	do {
+		int	i;
+
+		/* Get a buffer full of attribute names.  If there aren't any
+		   more or if we encounter an error, then finish up.
+		*/
+
+		VOP_ATTR_LIST(vp, (char *)attrlist, list_size,
+			ATTR_ROOT, &cursor, NULL, error);
+		DM_EA_XLATE_ERR(error);
+
+		if (error || attrlist->al_count == 0)
+			break;
+
+		for (i = 0; i < attrlist->al_count; i++) {
+			attrlist_ent_t	*entry;
+			char		*user_name;
+			int		size_needed;
+			int		value_len;
+
+			/* Skip over all non-DMAPI attributes.	If the
+			   attribute name is too long, we assume it is
+			   non-DMAPI even if it starts with the correct
+			   prefix.
+			*/
+
+			entry = ATTR_ENTRY(attrlist, i);
+			if (strncmp(entry->a_name, dmattr_prefix, DMATTR_PREFIXLEN))
+				continue;
+			user_name = &entry->a_name[DMATTR_PREFIXLEN];
+			if (strlen(user_name) > DM_ATTR_NAME_SIZE)
+				continue;
+
+			/* We have a valid DMAPI attribute to return.  If it
+			   won't fit in the user's buffer, we still need to
+			   keep track of the number of bytes for the user's
+			   next call.
+			*/
+
+
+			size_needed = sizeof(*ulist) + entry->a_valuelen;
+			size_needed = (size_needed + alignment) & ~alignment;
+
+			total_size += size_needed;
+			if (total_size > buflen)
+				continue;
+
+			/* Start by filling in all the fields in the
+			   dm_attrlist_t structure.
+			*/
+
+			strncpy((char *)ulist->al_name.an_chars, user_name,
+				DM_ATTR_NAME_SIZE);
+			ulist->al_data.vd_offset = sizeof(*ulist);
+			ulist->al_data.vd_length = entry->a_valuelen;
+			ulist->_link = size_needed;
+			last_link = &ulist->_link;
+
+			/* Next read the attribute's value into its correct
+			   location after the dm_attrlist structure.  Any sort
+			   of error indicates that the data is moving under us,
+			   so we return EIO to let the user know.
+			*/
+
+			value_len = entry->a_valuelen;
+
+			VOP_ATTR_GET(vp, entry->a_name,
+				(void *)(ulist + 1), &value_len,
+				ATTR_ROOT, NULL, error);
+			DM_EA_XLATE_ERR(error);
+
+			if (error || value_len != entry->a_valuelen) {
+				error = EIO;
+				break;
+			}
+
+			ulist = (dm_attrlist_t *)((char *)ulist + ulist->_link);
+		}
+	} while (!error && attrlist->al_more);
+	if (last_link)
+		*last_link = 0;
+
+	if (!error && total_size > buflen)
+		error = E2BIG;
+	if (!error || error == E2BIG) {
+		if (put_user(total_size, rlenp))
+			error = EFAULT;
+	}
+
+	kmem_free(attrlist, list_size);
+	return(error);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_getall_inherit(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		nelem,
+	dm_inherit_t	*inheritbufp,
+	u_int		*nelemp)
+{
+	return(ENOSYS);
+}
+
+
+/* Initialize location pointer for subsequent dm_get_dirattrs,
+   dm_get_bulkattr, and dm_get_bulkall calls.  The same initialization must
+   work for vnode-based routines (dm_get_dirattrs) and filesystem-based
+   routines (dm_get_bulkattr and dm_get_bulkall).  Filesystem-based functions
+   call this routine using the filesystem's root vnode.
+*/
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_init_attrloc(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_attrloc_t	*locp)
+{
+	dm_attrloc_t	loc = 0;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	if (copy_to_user( locp, &loc, sizeof(loc)))
+		return(EFAULT);
+	return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_mkdir_by_handle(
+	vnode_t		*vp,
+	dm_right_t	right,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname)
+{
+	return(ENOSYS);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_probe_hole(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_off_t	off,
+	dm_size_t	len,		/* we ignore this for now */
+	dm_off_t	*roffp,
+	dm_size_t	*rlenp)
+{
+	dm_off_t	roff;
+	dm_size_t	rlen;
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp;
+	uint		lock_flags;
+	xfs_fsize_t	realsize;
+	u_int		bsize;
+	bhv_desc_t	*xbdp;
+
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	XFS_BHV_LOOKUP(vp, xbdp);
+
+	ip = XFS_BHVTOI(xbdp);
+	if ((ip->i_d.di_mode & IFMT) != IFREG)
+		return(EINVAL);
+
+	mp = ip->i_mount;
+	bsize = mp->m_sb.sb_blocksize;
+
+	lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
+	xfs_ilock(ip, lock_flags);
+	realsize = ip->i_d.di_size;
+	xfs_iunlock(ip, lock_flags);
+	if (off >= realsize)
+		return(E2BIG);
+
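+	/* Round the punch start up to a block boundary: with 4K blocks an
+	 * offset of 10000 becomes 12288.
+	 */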
+	roff = (off + bsize - 1) & ~((dm_off_t)bsize - 1);
+	rlen = 0;		/* Only support punches to EOF for now */
+	if (copy_to_user( roffp, &roff, sizeof(roff)))
+		return(EFAULT);
+	if (copy_to_user( rlenp, &rlen, sizeof(rlen)))
+		return(EFAULT);
+	return(0);
+}
+
+
+STATIC int
+xfs_dm_punch_hole(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_off_t	off,
+	dm_size_t	len)
+{
+	xfs_inode_t	*ip;
+	xfs_trans_t	*tp;
+	xfs_trans_t	*tp2;
+	xfs_mount_t	*mp;
+	int		error;
+	uint		lock_flags;
+	uint		commit_flags;
+	xfs_fsize_t	realsize;
+	u_int		bsize;
+	bhv_desc_t	*xbdp;
+
+	if (right < DM_RIGHT_EXCL)
+		return(EACCES);
+
+	if (vp->v_type != VREG)
+		return(EINVAL);
+	if (len != 0)	/* Only support punches to EOF for now */
+		return(EAGAIN);
+	if (VN_MAPPED(vp))
+		return(EBUSY);
+
+	XFS_BHV_LOOKUP(vp, xbdp);
+
+	ip = XFS_BHVTOI(xbdp);
+	mp = ip->i_mount;
+	bsize = mp->m_sb.sb_blocksize;
+
+	if (off & (bsize-1))
+		return(EAGAIN);
+
+	lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
+	xfs_ilock(ip, lock_flags);
+
+	realsize = ip->i_d.di_size;	/* saved size to restore to */
+	if (off >= realsize) {	/* also check block boundary */
+		xfs_iunlock(ip, lock_flags);
+		return(EINVAL);
+	}
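+
+	/* The punch is implemented as a truncate down to 'off' followed by
+	 * growing the size back to 'realsize': the blocks between 'off'
+	 * and the old EOF are freed while the file size is preserved.
+	 * Both log reservations are taken before either phase starts, so
+	 * the grow phase cannot fail on reservation after the truncate
+	 * has committed.
+	 */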
+
+	/*
+	 * Before we join the inode to the transaction, take care of
+	 * the part of the truncation that must be done without the
+	 * inode lock.	This needs to be done before joining the inode
+	 * to the transaction, because the inode cannot be unlocked
+	 * once it is a part of the transaction.
+	 */
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
+	if ((error = xfs_trans_reserve(tp, 0,
+				     XFS_ITRUNCATE_LOG_RES(mp), 0,
+				     XFS_TRANS_PERM_LOG_RES,
+				     XFS_ITRUNCATE_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		return(error);
+	}
+	tp2 = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
+	if ((error = xfs_trans_reserve(tp2, 0,
+				     XFS_ITRUNCATE_LOG_RES(mp), 0,
+				     XFS_TRANS_PERM_LOG_RES,
+				     XFS_ITRUNCATE_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp2, 0);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		return(error);
+	}
+	commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+	/* --- start of truncate --- */
+	xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t) off);
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ihold(tp, ip);
+	xfs_itruncate_finish(&tp, ip, (xfs_fsize_t) off, XFS_DATA_FORK, 0);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+	xfs_trans_commit(tp, commit_flags, NULL);
+	/* --- end of truncate --- */
+
+	/* --- start of grow --- */
+	/* ip left locked after previous commit */
+	xfs_igrow_start(ip, realsize, NULL);
+	xfs_trans_ijoin(tp2, ip, lock_flags);
+	xfs_igrow_finish(tp2, ip, realsize, 0);
+
+	/* Let threads in send_data_event know we punched the file. */
+	ip->i_iocore.io_dmstate++;
+
+	VN_HOLD(vp);
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp2);
+	}
+	xfs_trans_commit(tp2, commit_flags, NULL);
+	/* --- end of grow --- */
+
+	/* ip unlocked during the commit */
+	return(0);
+}
+
+
+STATIC int
+xfs_dm_read_invis_rvp(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_off_t	off,
+	dm_size_t	len,
+	void		*bufp,
+	int		*rvp)
+{
+	if (right < DM_RIGHT_SHARED)
+		return(EACCES);
+
+	return(xfs_dm_rdwr(vp, 0, FMODE_READ, off, len, bufp, rvp));
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_release_right(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		type)		/* DM_FSYS_OBJ or zero */
+{
+#ifdef	DEBUG_RIGHTS
+	char		buffer[sizeof(xfs_handle_t) * 2 + 1];
+	bhv_desc_t	*bdp;
+
+	XFS_BHV_LOOKUP(vp, bdp);
+
+	if (!xfs_bdp_to_hexhandle(bdp, type, buffer)) {
+		printf("dm_release_right: old %d type %d handle %s\n",
+			right, type, buffer);
+	} else {
+		printf("dm_release_right: old %d type %d handle "
+			" <INVALID>\n", right, type);
+	}
+#endif	/* DEBUG_RIGHTS */
+	return(0);
+}
+
+
+STATIC int
+xfs_dm_remove_dmattr(
+	vnode_t		*vp,
+	dm_right_t	right,
+	int		setdtime,
+	dm_attrname_t	*attrnamep)
+{
+	dm_dkattrname_t name;
+	int		error;
+
+	if (right < DM_RIGHT_EXCL)
+		return(EACCES);
+
+	if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
+		return(error);
+
+	/* Remove the attribute from the object. */
+
+	VOP_ATTR_REMOVE(vp, name.dan_chars,
+			(setdtime ? ATTR_ROOT : ATTR_ROOT|ATTR_KERNOTIME),
+			NULL, error);
+	DM_EA_XLATE_ERR(error);
+
+	if (error == ENOATTR)
+		error = ENOENT;
+	return(error);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_request_right(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		type,		/* DM_FSYS_OBJ or zero */
+	u_int		flags,
+	dm_right_t	newright)
+{
+#ifdef	DEBUG_RIGHTS
+	char		buffer[sizeof(xfs_handle_t) * 2 + 1];
+	bhv_desc_t	*bdp;
+
+	XFS_BHV_LOOKUP(vp, bdp);
+
+	if (!xfs_bdp_to_hexhandle(bdp, type, buffer)) {
+		printf("dm_request_right: old %d new %d type %d flags 0x%x "
+			"handle %s\n", right, newright, type, flags, buffer);
+	} else {
+		printf("dm_request_right: old %d new %d type %d flags 0x%x "
+			"handle <INVALID>\n", right, newright, type, flags);
+	}
+#endif	/* DEBUG_RIGHTS */
+	return(0);
+}
+
+
+STATIC int
+xfs_dm_set_dmattr(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_attrname_t	*attrnamep,
+	int		setdtime,
+	size_t		buflen,
+	void		*bufp)
+{
+	dm_dkattrname_t name;
+	char		*value;
+	int		alloc_size;
+	int		error;
+
+	if (right < DM_RIGHT_EXCL)
+		return(EACCES);
+
+	if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
+		return(error);
+	if (buflen > ATTR_MAX_VALUELEN)
+		return(E2BIG);
+
+	/* Copy in the attribute's value and store the <name,value> pair in
+	   the object.	We allocate a buffer of at least one byte even if the
+	   caller specified a buflen of zero.  (A buflen of zero is considered
+	   valid.)
+	*/
+
+	alloc_size = (buflen == 0) ? 1 : buflen;
+	value = kmem_alloc(alloc_size, KM_SLEEP);
+	if (copy_from_user( value, bufp, buflen)) {
+		error = EFAULT;
+	} else {
+		VOP_ATTR_SET(vp, name.dan_chars, value, buflen,
+			(setdtime ? ATTR_ROOT : ATTR_ROOT|ATTR_KERNOTIME),
+			NULL, error);
+		DM_EA_XLATE_ERR(error);
+	}
+	kmem_free(value, alloc_size);
+	return(error);
+}
+
+STATIC int
+xfs_dm_set_eventlist(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		type,
+	dm_eventset_t	*eventsetp,	/* in kernel space! */
+	u_int		maxevent)
+{
+	int		error;
+	bhv_desc_t	*xbdp;
+
+	XFS_BHV_LOOKUP(vp, xbdp);
+
+	if (type == DM_FSYS_OBJ) {
+		error = xfs_dm_fs_set_eventlist(xbdp, right, eventsetp, maxevent);
+	} else {
+		error = xfs_dm_f_set_eventlist(xbdp, right, eventsetp, maxevent);
+	}
+	return(error);
+}
+
+
+/*
+ *  This turned out not to be XFS-specific, but leave it here with get_fileattr.
+ */
+
+STATIC int
+xfs_dm_set_fileattr(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		mask,
+	dm_fileattr_t	*statp)
+{
+	dm_fileattr_t	stat;
+	vattr_t		vat;
+	int		error;
+
+	if (right < DM_RIGHT_EXCL)
+		return(EACCES);
+
+	if (copy_from_user( &stat, statp, sizeof(stat)))
+		return(EFAULT);
+
+	vat.va_mask = 0;
+
+	if (mask & DM_AT_MODE) {
+		vat.va_mask |= AT_MODE;
+		vat.va_mode = stat.fa_mode;
+	}
+	if (mask & DM_AT_UID) {
+		vat.va_mask |= AT_UID;
+		vat.va_uid = stat.fa_uid;
+	}
+	if (mask & DM_AT_GID) {
+		vat.va_mask |= AT_GID;
+		vat.va_gid = stat.fa_gid;
+	}
+	if (mask & DM_AT_ATIME) {
+		vat.va_mask |= AT_ATIME;
+		vat.va_atime.tv_sec = stat.fa_atime;
+		vat.va_atime.tv_nsec = 0;
+	}
+	if (mask & DM_AT_MTIME) {
+		vat.va_mask |= AT_MTIME;
+		vat.va_mtime.tv_sec = stat.fa_mtime;
+		vat.va_mtime.tv_nsec = 0;
+	}
+	if (mask & DM_AT_CTIME) {
+		vat.va_mask |= AT_CTIME;
+		vat.va_ctime.tv_sec = stat.fa_ctime;
+		vat.va_ctime.tv_nsec = 0;
+	}
+
+	/* DM_AT_DTIME only takes effect if DM_AT_CTIME is not specified.  We
+	   overload ctime to also act as dtime, i.e. DM_CONFIG_DTIME_OVERLOAD.
+	*/
+
+	if ((mask & DM_AT_DTIME) && !(mask & DM_AT_CTIME)) {
+		vat.va_mask |= AT_CTIME;
+		vat.va_ctime.tv_sec = stat.fa_dtime;
+		vat.va_ctime.tv_nsec = 0;
+	}
+	if (mask & DM_AT_SIZE) {
+		vat.va_mask |= AT_SIZE;
+		vat.va_size = stat.fa_size;
+	}
+
+	VOP_SETATTR(vp, &vat, ATTR_DMI, NULL, error);
+	return(error);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_set_inherit(
+	vnode_t		*vp,
+	dm_right_t	right,
+	dm_attrname_t	*attrnamep,
+	mode_t		mode)
+{
+	return(ENOSYS);
+}
+
+
+STATIC int
+xfs_dm_set_region(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		nelem,
+	dm_region_t	*regbufp,
+	dm_boolean_t	*exactflagp)
+{
+	xfs_inode_t	*ip;
+	xfs_trans_t	*tp;
+	xfs_mount_t	*mp;
+	dm_region_t	region;
+	dm_eventset_t	new_mask;
+	dm_eventset_t	mr_mask;
+	int		error;
+	u_int		exactflag;
+	bhv_desc_t	*xbdp;
+
+	if (right < DM_RIGHT_EXCL)
+		return(EACCES);
+
+	/* If the caller gave us more than one dm_region_t structure, complain.
+	   (The caller has to call dm_get_config() to find out what our limit
+	   is.)
+	*/
+
+	if (nelem > 1)
+		return(E2BIG);
+
+	/* If the user provided a dm_region_t structure, then copy it in,
+	   validate it, and convert its flags to the corresponding bits in a
+	   dm_set_eventlist() event mask.  A call with zero regions is
+	   equivalent to clearing all region flags.
+	*/
+
+	new_mask = 0;
+	if (nelem == 1) {
+		if (copy_from_user( &region, regbufp, sizeof(region)))
+			return(EFAULT);
+
+		if (region.rg_flags & ~(DM_REGION_READ|DM_REGION_WRITE|DM_REGION_TRUNCATE))
+			return(EINVAL);
+		if (region.rg_flags & DM_REGION_READ)
+			new_mask |= 1 << DM_EVENT_READ;
+		if (region.rg_flags & DM_REGION_WRITE)
+			new_mask |= 1 << DM_EVENT_WRITE;
+		if (region.rg_flags & DM_REGION_TRUNCATE)
+			new_mask |= 1 << DM_EVENT_TRUNCATE;
+	}
+	if ((new_mask & prohibited_mr_events(vp)) != 0)
+		return(EBUSY);
+	mr_mask = (1 << DM_EVENT_READ) | (1 << DM_EVENT_WRITE) | (1 << DM_EVENT_TRUNCATE);
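+	/* For example (illustrative): a region flagged with
+	 * DM_REGION_READ|DM_REGION_WRITE yields new_mask ==
+	 * (1 << DM_EVENT_READ) | (1 << DM_EVENT_WRITE), while mr_mask
+	 * always covers all three managed-region event bits so that the
+	 * old bits can be cleared before the new ones are merged in below.
+	 */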
+
+	/* Get the file's existing event mask, clear the old managed region
+	   bits, add in the new ones, and update the file's mask.
+	*/
+
+	XFS_BHV_LOOKUP(vp, xbdp);
+
+	ip = XFS_BHVTOI(xbdp);
+	mp = ip->i_mount;
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return(error);
+	}
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	ip->i_d.di_dmevmask = (ip->i_d.di_dmevmask & ~mr_mask) | new_mask;
+	ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	VN_HOLD(vp);
+	xfs_trans_commit(tp, 0, NULL);
+
+	/* Return the proper value for *exactflagp depending upon whether or
+	   not we "changed" the user's managed region.  Since rg_offset and
+	   rg_size are always rounded back to zero, any non-zero value the
+	   user specified counts as a change.
+	*/
+
+	if (nelem && (region.rg_offset || region.rg_size)) {
+		exactflag = DM_FALSE;	/* user region was changed */
+	} else {
+		exactflag = DM_TRUE;	/* user region was unchanged */
+	}
+	if (copy_to_user( exactflagp, &exactflag, sizeof(exactflag)))
+		return(EFAULT);
+	return(0);
+}
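+
+/* A corresponding userspace call might look like the following sketch
+ * (illustrative only, using the XDSM dm_set_region() entry point; not
+ * part of this patch):
+ *
+ *	dm_region_t	rg;
+ *	dm_boolean_t	exact;
+ *
+ *	rg.rg_offset = 0;
+ *	rg.rg_size = 0;			(0,0) == the whole file
+ *	rg.rg_flags = DM_REGION_READ;
+ *	dm_set_region(sid, hanp, hlen, DM_NO_TOKEN, 1, &rg, &exact);
+ *
+ * Since this implementation manages whole files only, non-zero
+ * rg_offset/rg_size values are rounded to zero and exact comes back
+ * DM_FALSE.
+ */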
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_symlink_by_handle(
+	vnode_t		*vp,
+	dm_right_t	right,
+	void		*hanp,
+	size_t		hlen,
+	char		*cname,
+	char		*path)
+{
+	return(ENOSYS);
+}
+
+
+STATIC int
+xfs_dm_sync_by_handle (
+	vnode_t		*vp,
+	dm_right_t	right)
+{
+	int		error;
+
+	if (right < DM_RIGHT_EXCL)
+		return(EACCES);
+
+	VOP_FSYNC(vp, FSYNC_WAIT, NULL, (off_t)0, (off_t)-1, error);
+	return(error);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_upgrade_right(
+	vnode_t		*vp,
+	dm_right_t	right,
+	u_int		type)		/* DM_FSYS_OBJ or zero */
+{
+#ifdef	DEBUG_RIGHTS
+	char		buffer[sizeof(xfs_handle_t) * 2 + 1];
+	bhv_desc_t	*bdp;
+
+	XFS_BHV_LOOKUP(vp, bdp);
+
+	if (!xfs_bdp_to_hexhandle(bdp, type, buffer)) {
+		printf("dm_upgrade_right: old %d new %d type %d handle %s\n",
+			right, DM_RIGHT_EXCL, type, buffer);
+	} else {
+		printf("dm_upgrade_right: old %d new %d type %d handle "
+			"<INVALID>\n", right, DM_RIGHT_EXCL, type);
+	}
+#endif	/* DEBUG_RIGHTS */
+	return(0);
+}
+
+
+STATIC int
+xfs_dm_write_invis_rvp(
+	vnode_t		*vp,
+	dm_right_t	right,
+	int		flags,
+	dm_off_t	off,
+	dm_size_t	len,
+	void		*bufp,
+	int		*rvp)
+{
+	int		fflag = 0;
+
+	if (right < DM_RIGHT_EXCL)
+		return(EACCES);
+
+	if (flags & DM_WRITE_SYNC)
+		fflag |= O_SYNC;
+	return(xfs_dm_rdwr(vp, fflag, FMODE_WRITE, off, len, bufp, rvp));
+}
+
+
+STATIC void
+xfs_dm_obj_ref_hold(
+	vnode_t		*vp)
+{
+	VN_HOLD(vp);
+}
+
+
+STATIC fsys_function_vector_t	xfs_fsys_vector[DM_FSYS_MAX];
+
+
+int
+xfs_dm_get_fsys_vector(
+	bhv_desc_t	*bdp,
+	dm_fcntl_vector_t	*vecrq)
+{
+	static	int		initialized = 0;
+	fsys_function_vector_t *vecp;
+	int		i = 0;
+
+	vecrq->count =
+		sizeof(xfs_fsys_vector) / sizeof(xfs_fsys_vector[0]);
+	vecrq->vecp = xfs_fsys_vector;
+	if (initialized)
+		return(0);
+	vecrq->code_level = DM_CLVL_XOPEN;
+	vecp = xfs_fsys_vector;
+
+	vecp[i].func_no = DM_FSYS_CLEAR_INHERIT;
+	vecp[i++].u_fc.clear_inherit = xfs_dm_clear_inherit;
+	vecp[i].func_no = DM_FSYS_CREATE_BY_HANDLE;
+	vecp[i++].u_fc.create_by_handle = xfs_dm_create_by_handle;
+	vecp[i].func_no = DM_FSYS_DOWNGRADE_RIGHT;
+	vecp[i++].u_fc.downgrade_right = xfs_dm_downgrade_right;
+	vecp[i].func_no = DM_FSYS_GET_ALLOCINFO_RVP;
+	vecp[i++].u_fc.get_allocinfo_rvp = xfs_dm_get_allocinfo_rvp;
+	vecp[i].func_no = DM_FSYS_GET_BULKALL_RVP;
+	vecp[i++].u_fc.get_bulkall_rvp = xfs_dm_get_bulkall_rvp;
+	vecp[i].func_no = DM_FSYS_GET_BULKATTR_RVP;
+	vecp[i++].u_fc.get_bulkattr_rvp = xfs_dm_get_bulkattr_rvp;
+	vecp[i].func_no = DM_FSYS_GET_CONFIG;
+	vecp[i++].u_fc.get_config = xfs_dm_get_config;
+	vecp[i].func_no = DM_FSYS_GET_CONFIG_EVENTS;
+	vecp[i++].u_fc.get_config_events = xfs_dm_get_config_events;
+	vecp[i].func_no = DM_FSYS_GET_DESTROY_DMATTR;
+	vecp[i++].u_fc.get_destroy_dmattr = xfs_dm_get_destroy_dmattr;
+	vecp[i].func_no = DM_FSYS_GET_DIOINFO;
+	vecp[i++].u_fc.get_dioinfo = xfs_dm_get_dioinfo;
+	vecp[i].func_no = DM_FSYS_GET_DIRATTRS_RVP;
+	vecp[i++].u_fc.get_dirattrs_rvp = xfs_dm_get_dirattrs_rvp;
+	vecp[i].func_no = DM_FSYS_GET_DMATTR;
+	vecp[i++].u_fc.get_dmattr = xfs_dm_get_dmattr;
+	vecp[i].func_no = DM_FSYS_GET_EVENTLIST;
+	vecp[i++].u_fc.get_eventlist = xfs_dm_get_eventlist;
+	vecp[i].func_no = DM_FSYS_GET_FILEATTR;
+	vecp[i++].u_fc.get_fileattr = xfs_dm_get_fileattr;
+	vecp[i].func_no = DM_FSYS_GET_REGION;
+	vecp[i++].u_fc.get_region = xfs_dm_get_region;
+	vecp[i].func_no = DM_FSYS_GETALL_DMATTR;
+	vecp[i++].u_fc.getall_dmattr = xfs_dm_getall_dmattr;
+	vecp[i].func_no = DM_FSYS_GETALL_INHERIT;
+	vecp[i++].u_fc.getall_inherit = xfs_dm_getall_inherit;
+	vecp[i].func_no = DM_FSYS_INIT_ATTRLOC;
+	vecp[i++].u_fc.init_attrloc = xfs_dm_init_attrloc;
+	vecp[i].func_no = DM_FSYS_MKDIR_BY_HANDLE;
+	vecp[i++].u_fc.mkdir_by_handle = xfs_dm_mkdir_by_handle;
+	vecp[i].func_no = DM_FSYS_PROBE_HOLE;
+	vecp[i++].u_fc.probe_hole = xfs_dm_probe_hole;
+	vecp[i].func_no = DM_FSYS_PUNCH_HOLE;
+	vecp[i++].u_fc.punch_hole = xfs_dm_punch_hole;
+	vecp[i].func_no = DM_FSYS_READ_INVIS_RVP;
+	vecp[i++].u_fc.read_invis_rvp = xfs_dm_read_invis_rvp;
+	vecp[i].func_no = DM_FSYS_RELEASE_RIGHT;
+	vecp[i++].u_fc.release_right = xfs_dm_release_right;
+	vecp[i].func_no = DM_FSYS_REMOVE_DMATTR;
+	vecp[i++].u_fc.remove_dmattr = xfs_dm_remove_dmattr;
+	vecp[i].func_no = DM_FSYS_REQUEST_RIGHT;
+	vecp[i++].u_fc.request_right = xfs_dm_request_right;
+	vecp[i].func_no = DM_FSYS_SET_DMATTR;
+	vecp[i++].u_fc.set_dmattr = xfs_dm_set_dmattr;
+	vecp[i].func_no = DM_FSYS_SET_EVENTLIST;
+	vecp[i++].u_fc.set_eventlist = xfs_dm_set_eventlist;
+	vecp[i].func_no = DM_FSYS_SET_FILEATTR;
+	vecp[i++].u_fc.set_fileattr = xfs_dm_set_fileattr;
+	vecp[i].func_no = DM_FSYS_SET_INHERIT;
+	vecp[i++].u_fc.set_inherit = xfs_dm_set_inherit;
+	vecp[i].func_no = DM_FSYS_SET_REGION;
+	vecp[i++].u_fc.set_region = xfs_dm_set_region;
+	vecp[i].func_no = DM_FSYS_SYMLINK_BY_HANDLE;
+	vecp[i++].u_fc.symlink_by_handle = xfs_dm_symlink_by_handle;
+	vecp[i].func_no = DM_FSYS_SYNC_BY_HANDLE;
+	vecp[i++].u_fc.sync_by_handle = xfs_dm_sync_by_handle;
+	vecp[i].func_no = DM_FSYS_UPGRADE_RIGHT;
+	vecp[i++].u_fc.upgrade_right = xfs_dm_upgrade_right;
+	vecp[i].func_no = DM_FSYS_WRITE_INVIS_RVP;
+	vecp[i++].u_fc.write_invis_rvp = xfs_dm_write_invis_rvp;
+	vecp[i].func_no = DM_FSYS_OBJ_REF_HOLD;
+	vecp[i++].u_fc.obj_ref_hold = xfs_dm_obj_ref_hold;
+
+	return(0);
+}
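+
+/* Note: each slot in the table above pairs a DM_FSYS_* function number
+ * with the matching union member, presumably so the dmapi layer can
+ * dispatch on func_no rather than on a fixed slot position.  Also note
+ * that `initialized' is tested above but never set, so the table is
+ * (harmlessly) rebuilt with identical contents on every call.
+ */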
+
+
+/*	xfs_dm_mapevent - send events needed for memory mapping a file.
+ *
+ *	xfs_dm_mapevent is a workaround called for files that are about to
+ *	be mapped.  DMAPI events are not generated at a low enough level
+ *	in the kernel for page reads/writes to generate the correct events.
+ *	So for memory-mapped files we generate read or write events for the
+ *	whole byte range being mapped.	If the mmap call can never cause a
+ *	write to the file, then only a read event is sent.
+ *
+ *	Code elsewhere prevents adding managed regions to a file while it
+ *	is still mapped.
+ */
+
+/* ARGSUSED */
+static int
+xfs_dm_mapevent(
+	bhv_desc_t	*bdp,
+	int		flags,
+	xfs_off_t	offset,
+	dm_fcntl_mapevent_t *mapevp)
+{
+	xfs_fsize_t	filesize;		/* event read/write "size" */
+	xfs_inode_t	*ip;
+	off_t		end_of_area, evsize;
+	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	struct vfs	*vfsp = vp->v_vfsp;
+
+	/* exit immediately if not a regular file in a DMAPI file system */
+
+	mapevp->error = 0;			/* assume success */
+
+	if ((vp->v_type != VREG) || !(vfsp->vfs_flag & VFS_DMI))
+		return 0;
+
+	if (mapevp->max_event != DM_EVENT_WRITE &&
+		mapevp->max_event != DM_EVENT_READ)
+			return 0;
+
+	/* Set file size to work with. */
+
+	ip = XFS_BHVTOI(bdp);
+	filesize = ip->i_iocore.io_new_size;
+	if (filesize < ip->i_d.di_size) {
+		filesize = ip->i_d.di_size;
+	}
+
+	/* Set first byte number beyond the map area. */
+
+	if (mapevp->length) {
+		end_of_area = offset + mapevp->length;
+		if (end_of_area > filesize)
+			end_of_area = filesize;
+	} else {
+		end_of_area = filesize;
+	}
+
+	/* Set the real amount being mapped. */
+	evsize = end_of_area - offset;
+	if (evsize < 0)
+		evsize = 0;
+
+	/* If write possible, try a DMAPI write event */
+	if (mapevp->max_event == DM_EVENT_WRITE &&
+		DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_WRITE)) {
+		mapevp->error = xfs_dm_send_data_event(DM_EVENT_WRITE, bdp,
+				offset, evsize, 0, NULL);
+		return(0);
+	}
+
+	/* Try a read event if max_event was != DM_EVENT_WRITE or if it
+	 * was DM_EVENT_WRITE but the WRITE event was not enabled.
+	 */
+	if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_READ)) {
+		mapevp->error = xfs_dm_send_data_event(DM_EVENT_READ, bdp,
+				offset, evsize, 0, NULL);
+	}
+
+	return 0;
+}
+
+
+int
+xfs_dm_send_mmap_event(
+	struct vm_area_struct *vma,
+	unsigned int	wantflag)
+{
+	vnode_t		*vp;
+	xfs_inode_t	*ip;
+	bhv_desc_t	*bdp;
+	int		ret = 0;
+	dm_fcntl_mapevent_t maprq;
+	dm_eventtype_t	max_event = DM_EVENT_READ;
+
+	if (!vma->vm_file)
+		return 0;
+
+	vp = LINVFS_GET_VP(vma->vm_file->f_dentry->d_inode);
+	ASSERT(vp);
+
+	if ((vp->v_type != VREG) || !(vp->v_vfsp->vfs_flag & VFS_DMI))
+		return 0;
+
+	/* If they specifically asked for 'read', then give it to them.
+	 * Otherwise, see if it's possible to give them 'write'.
+	 */
+	if( wantflag & VM_READ ){
+		max_event = DM_EVENT_READ;
+	}
+	else if( ! (vma->vm_flags & VM_DENYWRITE) ) {
+		if((wantflag & VM_WRITE) || (vma->vm_flags & VM_WRITE))
+			max_event = DM_EVENT_WRITE;
+	}
+
+	if( (wantflag & VM_WRITE) && (max_event != DM_EVENT_WRITE) ){
+		return -EACCES;
+	}
+
+	maprq.max_event = max_event;
+
+	/* Figure out how much of the file is being requested by the user. */
+	maprq.length = 0; /* whole file, for now */
+
+	VN_BHV_READ_LOCK(VN_BHV_HEAD(vp));
+	XFS_BHV_LOOKUP(vp, bdp);
+	ip = XFS_BHVTOI(bdp);
+
+	if(DM_EVENT_ENABLED(vp->v_vfsp, ip, max_event)){
+		xfs_dm_mapevent(bdp, 0, 0, &maprq);
+		ret = maprq.error;
+	}
+	VN_BHV_READ_UNLOCK(VN_BHV_HEAD(vp));
+
+	return -ret;
+}
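+
+/* Decision summary for the logic above: a VM_READ request always maps
+ * to a candidate DM_EVENT_READ; otherwise, if the mapping does not
+ * deny writes and either the request or the vma allows VM_WRITE, the
+ * candidate becomes DM_EVENT_WRITE.  A VM_WRITE request that cannot be
+ * granted a write event fails with -EACCES before any event is sent.
+ */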
+
+
+int
+xfs_dm_mount(
+	vfs_t		*vfsp,
+	char		*dir_name,
+	char		*fsname)
+{
+	vnode_t		*rootvp;
+	bhv_desc_t	*rootbdp;
+	int		error;
+
+	if (*dir_name == '\0')
+		return EINVAL;
+	VFS_ROOT(vfsp, &rootvp, error);
+	if (error)
+		return error;
+
+	rootbdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(rootvp), &xfs_vnodeops);
+	VN_RELE(rootvp);
+	error = dm_send_mount_event(vfsp, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
+				    rootbdp, DM_RIGHT_NULL, dir_name,
+				    fsname);
+
+	return error;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dmapi.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dmapi.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dmapi.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dmapi.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DMAPI_H__
+#define __XFS_DMAPI_H__
+
+#ifdef CONFIG_XFS_DMAPI
+
+#include <dmapi/dmapi.h>
+#include <dmapi/dmapi_kern.h>
+
+/*	Values used to define the on-disk version of dm_attrname_t. All
+ *	on-disk attribute names start with the 8-byte string "SGI_DMI_".
+ *
+ *	In the on-disk inode, DMAPI attribute names consist of the user-provided
+ *	name with the DMATTR_PREFIXSTRING prepended.  This string must NEVER be
+ *	changed.
+ */
+
+#define DMATTR_PREFIXLEN	8
+#define DMATTR_PREFIXSTRING	"SGI_DMI_"
+
+/* Defines for determining if an event message should be sent. */
+#define DM_EVENT_ENABLED(vfsp, ip, event) ( \
+	unlikely ((vfsp)->vfs_flag & VFS_DMI) && \
+		( ((ip)->i_d.di_dmevmask & (1 << event)) || \
+		  ((ip)->i_mount->m_dmevmask & (1 << event)) ) \
+	)
+
+#define DM_EVENT_ENABLED_IO(vfsp, io, event) ( \
+	unlikely ((vfsp)->vfs_flag & VFS_DMI) && \
+		( ((io)->io_dmevmask & (1 << event)) || \
+		  ((io)->io_mount->m_dmevmask & (1 << event)) ) \
+	)
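+
+/* Typical call-site use (illustrative, mirroring the use in
+ * xfs_dm_mapevent()):
+ *
+ *	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ))
+ *		error = xfs_dm_send_data_event(DM_EVENT_READ, bdp,
+ *					       offset, size, 0, NULL);
+ *
+ * The VFS_DMI flag test short-circuits first, so non-DMAPI mounts pay
+ * only a single test per call site.
+ */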
+
+/*
+ *	Macros to turn caller specified delay/block flags into
+ *	dm_send_xxxx_event flag DM_FLAGS_NDELAY.
+ */
+
+#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
+			DM_FLAGS_NDELAY : 0)
+#define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
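+
+/* For example, a file opened with O_NONBLOCK maps to DM_FLAGS_NDELAY,
+ * telling the dm_send_*_event() routines to return EAGAIN rather than
+ * block while an event is outstanding (illustrative note).
+ */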
+
+
+
+/* events valid in dm_set_eventlist() when called with a filesystem handle.
+   These events are not persistent.
+*/
+
+#define DM_XFS_VALID_FS_EVENTS		( \
+	(1 << DM_EVENT_PREUNMOUNT)	| \
+	(1 << DM_EVENT_UNMOUNT)		| \
+	(1 << DM_EVENT_NOSPACE)		| \
+	(1 << DM_EVENT_DEBUT)		| \
+	(1 << DM_EVENT_CREATE)		| \
+	(1 << DM_EVENT_POSTCREATE)	| \
+	(1 << DM_EVENT_REMOVE)		| \
+	(1 << DM_EVENT_POSTREMOVE)	| \
+	(1 << DM_EVENT_RENAME)		| \
+	(1 << DM_EVENT_POSTRENAME)	| \
+	(1 << DM_EVENT_LINK)		| \
+	(1 << DM_EVENT_POSTLINK)	| \
+	(1 << DM_EVENT_SYMLINK)		| \
+	(1 << DM_EVENT_POSTSYMLINK)	| \
+	(1 << DM_EVENT_ATTRIBUTE)	| \
+	(1 << DM_EVENT_DESTROY)		)
+
+/* Events valid in dm_set_eventlist() when called with a file handle for
+   a regular file or a symlink.	 These events are persistent.
+*/
+
+#define DM_XFS_VALID_FILE_EVENTS	( \
+	(1 << DM_EVENT_ATTRIBUTE)	| \
+	(1 << DM_EVENT_DESTROY)		)
+
+/* Events valid in dm_set_eventlist() when called with a file handle for
+   a directory.	 These events are persistent.
+*/
+
+#define DM_XFS_VALID_DIRECTORY_EVENTS	( \
+	(1 << DM_EVENT_CREATE)		| \
+	(1 << DM_EVENT_POSTCREATE)	| \
+	(1 << DM_EVENT_REMOVE)		| \
+	(1 << DM_EVENT_POSTREMOVE)	| \
+	(1 << DM_EVENT_RENAME)		| \
+	(1 << DM_EVENT_POSTRENAME)	| \
+	(1 << DM_EVENT_LINK)		| \
+	(1 << DM_EVENT_POSTLINK)	| \
+	(1 << DM_EVENT_SYMLINK)		| \
+	(1 << DM_EVENT_POSTSYMLINK)	| \
+	(1 << DM_EVENT_ATTRIBUTE)	| \
+	(1 << DM_EVENT_DESTROY)		)
+
+
+/* Events supported by the XFS filesystem. */
+#define DM_XFS_SUPPORTED_EVENTS		( \
+	(1 << DM_EVENT_MOUNT)		| \
+	(1 << DM_EVENT_PREUNMOUNT)	| \
+	(1 << DM_EVENT_UNMOUNT)		| \
+	(1 << DM_EVENT_NOSPACE)		| \
+	(1 << DM_EVENT_CREATE)		| \
+	(1 << DM_EVENT_POSTCREATE)	| \
+	(1 << DM_EVENT_REMOVE)		| \
+	(1 << DM_EVENT_POSTREMOVE)	| \
+	(1 << DM_EVENT_RENAME)		| \
+	(1 << DM_EVENT_POSTRENAME)	| \
+	(1 << DM_EVENT_LINK)		| \
+	(1 << DM_EVENT_POSTLINK)	| \
+	(1 << DM_EVENT_SYMLINK)		| \
+	(1 << DM_EVENT_POSTSYMLINK)	| \
+	(1 << DM_EVENT_READ)		| \
+	(1 << DM_EVENT_WRITE)		| \
+	(1 << DM_EVENT_TRUNCATE)	| \
+	(1 << DM_EVENT_ATTRIBUTE)	| \
+	(1 << DM_EVENT_DESTROY)		)
+
+
+extern int
+xfs_dm_mount(
+	vfs_t		*vfsp,
+	char		*dir_name,
+	char		*fsname);
+
+extern int
+xfs_dm_get_fsys_vector(
+	bhv_desc_t	*bdp,
+	dm_fcntl_vector_t *vecrq);
+
+extern int
+xfs_dm_send_data_event(
+	dm_eventtype_t	event,
+	bhv_desc_t	*bdp,
+	xfs_off_t	offset,
+	size_t		length,
+	int		flags,
+	vrwlock_t	*locktype);
+
+extern int
+xfs_dm_send_create_event(
+	bhv_desc_t	*dir_bdp,
+	char		*name,
+	mode_t		new_mode,
+	int		*good_event_sent);
+
+extern int
+xfs_dm_send_mmap_event(
+	struct vm_area_struct *vma,
+	unsigned int	wantflag);
+
+#else /* CONFIG_XFS_DMAPI */
+
+/*
+ *	Flags needed to build with dmapi disabled.
+ */
+
+typedef enum {
+	DM_EVENT_INVALID	= -1,
+	DM_EVENT_CANCEL		= 0,		/* not supported */
+	DM_EVENT_MOUNT		= 1,
+	DM_EVENT_PREUNMOUNT	= 2,
+	DM_EVENT_UNMOUNT	= 3,
+	DM_EVENT_DEBUT		= 4,		/* not supported */
+	DM_EVENT_CREATE		= 5,
+	DM_EVENT_CLOSE		= 6,		/* not supported */
+	DM_EVENT_POSTCREATE	= 7,
+	DM_EVENT_REMOVE		= 8,
+	DM_EVENT_POSTREMOVE	= 9,
+	DM_EVENT_RENAME		= 10,
+	DM_EVENT_POSTRENAME	= 11,
+	DM_EVENT_LINK		= 12,
+	DM_EVENT_POSTLINK	= 13,
+	DM_EVENT_SYMLINK	= 14,
+	DM_EVENT_POSTSYMLINK	= 15,
+	DM_EVENT_READ		= 16,
+	DM_EVENT_WRITE		= 17,
+	DM_EVENT_TRUNCATE	= 18,
+	DM_EVENT_ATTRIBUTE	= 19,
+	DM_EVENT_DESTROY	= 20,
+	DM_EVENT_NOSPACE	= 21,
+	DM_EVENT_USER		= 22,
+	DM_EVENT_MAX		= 23
+} dm_eventtype_t;
+
+typedef enum {
+	DM_RIGHT_NULL,
+	DM_RIGHT_SHARED,
+	DM_RIGHT_EXCL
+} dm_right_t;
+
+/*
+ *	Defines for determining if an event message should be sent.
+ */
+#define DM_EVENT_ENABLED(vfsp, ip, event)	0
+#define DM_EVENT_ENABLED_IO(vfsp, io, event)	0
+
+/*
+ *	Stubbed out DMAPI delay macros.
+ */
+
+#define FILP_DELAY_FLAG(filp)			0
+#define AT_DELAY_FLAG(f)			0
+
+/*
+ *	Events supported by the XFS filesystem.
+ */
+
+#define DM_XFS_VALID_FS_EVENTS			0
+#define DM_XFS_VALID_FILE_EVENTS		0
+#define DM_XFS_VALID_DIRECTORY_EVENTS		0
+#define DM_XFS_SUPPORTED_EVENTS			0
+
+/*
+ *	Dummy definitions used for the flags field on dm_send_*_event().
+ */
+
+#define DM_FLAGS_NDELAY		0x001	/* return EAGAIN after dm_pending() */
+#define DM_FLAGS_UNWANTED	0x002	/* event not in fsys dm_eventset_t */
+
+/*
+ *	Stubs for XFS DMAPI utility routines.
+ */
+
+static __inline int
+xfs_dm_send_create_event(
+	bhv_desc_t	*dir_bdp,
+	char		*name,
+	mode_t		new_mode,
+	int		*good_event_sent)
+{
+	return 0;
+}
+
+static __inline int
+xfs_dm_send_data_event(
+	dm_eventtype_t	event,
+	bhv_desc_t	*bdp,
+	xfs_off_t	offset,
+	size_t		length,
+	int		flags,
+	vrwlock_t	*locktype)
+{
+	return nopkg();
+}
+
+static __inline int
+xfs_dm_send_mmap_event(
+	struct vm_area_struct *vma,
+	unsigned int	wantflag)
+{
+	return 0;
+}
+
+/*
+ *	Stubs for routines needed for the X/Open version of DMAPI.
+ */
+
+static __inline int
+dm_send_destroy_event(
+	bhv_desc_t	*bdp,
+	dm_right_t	vp_right)
+{
+	return nopkg();
+}
+
+static __inline int
+dm_send_namesp_event(
+	dm_eventtype_t	event,
+	bhv_desc_t	*bdp1,
+	dm_right_t	vp1_right,
+	bhv_desc_t	*bdp2,
+	dm_right_t	vp2_right,
+	char		*name1,
+	char		*name2,
+	mode_t		mode,
+	int		retcode,
+	int		flags)
+{
+	return nopkg();
+}
+
+static __inline void
+dm_send_unmount_event(
+	vfs_t		*vfsp,
+	vnode_t		*vp,
+	dm_right_t	vfsp_right,
+	mode_t		mode,
+	int		retcode,
+	int		flags)
+{
+}
+
+#endif	/* CONFIG_XFS_DMAPI */
+#endif	/* __XFS_DMAPI_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dqblk.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dqblk.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dqblk.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dqblk.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DQBLK_H__
+#define __XFS_DQBLK_H__
+
+/*
+ * The ondisk form of a dquot structure.
+ */
+#define XFS_DQUOT_MAGIC		0x4451		/* 'DQ' */
+#define XFS_DQUOT_VERSION	(u_int8_t)0x01	/* latest version number */
+
+/*
+ * This is the main portion of the on-disk representation of quota
+ * information for a user. This is the q_core of the xfs_dquot_t that
+ * is kept in kernel memory. We pad this with some more expansion room
+ * to construct the on-disk structure.
+ */
+typedef struct	xfs_disk_dquot {
+/*16*/	u_int16_t	d_magic;	/* dquot magic = XFS_DQUOT_MAGIC */
+/*8 */	u_int8_t	d_version;	/* dquot version */
+/*8 */	u_int8_t	d_flags;	/* XFS_DQ_USER/PROJ/GROUP */
+/*32*/	xfs_dqid_t	d_id;		/* user,project,group id */
+/*64*/	xfs_qcnt_t	d_blk_hardlimit;/* absolute limit on disk blks */
+/*64*/	xfs_qcnt_t	d_blk_softlimit;/* preferred limit on disk blks */
+/*64*/	xfs_qcnt_t	d_ino_hardlimit;/* maximum # allocated inodes */
+/*64*/	xfs_qcnt_t	d_ino_softlimit;/* preferred inode limit */
+/*64*/	xfs_qcnt_t	d_bcount;	/* disk blocks owned by the user */
+/*64*/	xfs_qcnt_t	d_icount;	/* inodes owned by the user */
+/*32*/	__int32_t	d_itimer;	/* zero if within inode limits; if
+					   not, this is when we refuse service */
+/*32*/	__int32_t	d_btimer;	/* similar to above; for disk blocks */
+/*16*/	xfs_qwarncnt_t	d_iwarns;	/* warnings issued wrt num inodes */
+/*16*/	xfs_qwarncnt_t	d_bwarns;	/* warnings issued wrt disk blocks */
+/*32*/	__int32_t	d_pad0;		/* 64 bit align */
+/*64*/	xfs_qcnt_t	d_rtb_hardlimit;/* absolute limit on realtime blks */
+/*64*/	xfs_qcnt_t	d_rtb_softlimit;/* preferred limit on RT disk blks */
+/*64*/	xfs_qcnt_t	d_rtbcount;	/* realtime blocks owned */
+/*32*/	__int32_t	d_rtbtimer;	/* similar to above; for RT disk blocks */
+/*16*/	xfs_qwarncnt_t	d_rtbwarns;	/* warnings issued wrt RT disk blocks */
+/*16*/	__uint16_t	d_pad;
+} xfs_disk_dquot_t;
+
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+typedef struct xfs_dqblk {
+	xfs_disk_dquot_t  dd_diskdq;	/* portion that lives incore as well */
+	char		  dd_fill[32];	/* filling for posterity */
+} xfs_dqblk_t;
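+
+/* Note: by the field widths annotated above, xfs_disk_dquot_t is 104
+ * bytes, so with dd_fill the on-disk xfs_dqblk_t comes to 136 bytes;
+ * multiple dquots are packed per filesystem block (see the
+ * XFS_QM_DQPERBLK() uses in the quota code).
+ */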
+
+/*
+ * flags for q_flags field in the dquot.
+ */
+#define XFS_DQ_USER		0x0001		/* a user quota */
+/* #define XFS_DQ_PROJ		0x0002		-- project quota (IRIX) */
+#define XFS_DQ_GROUP		0x0004		/* a group quota */
+#define XFS_DQ_FLOCKED		0x0008		/* flush lock taken */
+#define XFS_DQ_DIRTY		0x0010		/* dquot is dirty */
+#define XFS_DQ_WANT		0x0020		/* for lookup/reclaim race */
+#define XFS_DQ_INACTIVE		0x0040		/* dq off mplist & hashlist */
+#define XFS_DQ_MARKER		0x0080		/* sentinel */
+
+/*
+ * In the worst case, when both user and group quotas are on,
+ * we can have a max of three dquots changing in a single transaction.
+ */
+#define XFS_DQUOT_LOGRES(mp)	(sizeof(xfs_disk_dquot_t) * 3)
+
+#endif	/* __XFS_DQBLK_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dquot.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dquot.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dquot.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dquot.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1660 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_quota_priv.h>
+
+
+/*
+   LOCK ORDER
+
+   inode lock		    (ilock)
+   dquot hash-chain lock    (hashlock)
+   xqm dquot freelist lock  (freelistlock)
+   mount's dquot list lock  (mplistlock)
+   user dquot lock - lock ordering among dquots is based on the uid or gid
+   group dquot lock - similar to udquots. Between the two dquots, the udquot
+		      has to be locked first.
+   pin lock - the dquot lock must be held to take this lock.
+   flush lock - ditto.
+*/
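+
+/* An acquisition sequence that respects this order might look like the
+ * following sketch (illustrative only):
+ *
+ *	xfs_ilock(ip, XFS_ILOCK_EXCL);
+ *	XFS_DQ_HASH_LOCK(h);
+ *	xfs_qm_freelist_lock(xfs_Gqm);
+ *	xfs_qm_mplist_lock(mp);
+ *	xfs_dqlock(udqp);
+ *	xfs_dqlock(gdqp);		udquot before gdquot
+ *
+ * Out-of-order attempts must use trylocks and back off, as
+ * xfs_qm_dqput() does with xfs_qm_freelist_lock_nowait().
+ */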
+
+STATIC void		xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
+
+#ifdef DEBUG
+dev_t xfs_dqerror_dev = 0;
+int xfs_do_dqerror = 0;
+int xfs_dqreq_num = 0;
+int xfs_dqerror_mod = 33;
+#endif
+
+/*
+ * Allocate and initialize a dquot. We don't always allocate fresh memory;
+ * we try to reclaim a free dquot if the number of incore dquots is above
+ * a threshold.
+ * The only field inside the core that gets initialized at this point
+ * is the d_id field. The idea is to fill in the entire q_core
+ * when we read in the on disk dquot.
+ */
+xfs_dquot_t *
+xfs_qm_dqinit(
+	xfs_mount_t  *mp,
+	xfs_dqid_t   id,
+	uint	     type)
+{
+	xfs_dquot_t	*dqp;
+	boolean_t	brandnewdquot;
+
+	brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
+	dqp->dq_flags = type;
+	INT_SET(dqp->q_core.d_id, ARCH_CONVERT, id);
+	dqp->q_mount = mp;
+
+	/*
+	 * No need to re-initialize these if this is a reclaimed dquot.
+	 */
+	if (brandnewdquot) {
+		dqp->dq_flnext = dqp->dq_flprev = dqp;
+		mutex_init(&dqp->q_qlock,  MUTEX_DEFAULT, "xdq");
+		initnsema(&dqp->q_flock, 1, "fdq");
+		sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
+
+#ifdef DQUOT_TRACING
+		dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP);
+		xfs_dqtrace_entry(dqp, "DQINIT");
+#endif
+	} else {
+		/*
+		 * Only the q_core portion was bzeroed in dqreclaim_one().
+		 * So, we need to reset the others.
+		 */
+		 dqp->q_nrefs = 0;
+		 dqp->q_blkno = 0;
+		 dqp->MPL_NEXT = dqp->HL_NEXT = NULL;
+		 dqp->HL_PREVP = dqp->MPL_PREVP = NULL;
+		 dqp->q_bufoffset = 0;
+		 dqp->q_fileoffset = 0;
+		 dqp->q_transp = NULL;
+		 dqp->q_gdquot = NULL;
+		 dqp->q_res_bcount = 0;
+		 dqp->q_res_icount = 0;
+		 dqp->q_res_rtbcount = 0;
+		 dqp->q_pincount = 0;
+		 dqp->q_hash = 0;
+		 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
+
+#ifdef DQUOT_TRACING
+		 ASSERT(dqp->q_trace);
+		 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
+#endif
+	 }
+
+	/*
+	 * log item gets initialized later
+	 */
+	return (dqp);
+}
+
+/*
+ * This is called to free all the memory associated with a dquot
+ */
+void
+xfs_qm_dqdestroy(
+	xfs_dquot_t	*dqp)
+{
+	ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
+
+	mutex_destroy(&dqp->q_qlock);
+	freesema(&dqp->q_flock);
+	sv_destroy(&dqp->q_pinwait);
+
+#ifdef DQUOT_TRACING
+	if (dqp->q_trace)
+	     ktrace_free(dqp->q_trace);
+	dqp->q_trace = NULL;
+#endif
+	kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+	atomic_dec(&xfs_Gqm->qm_totaldquots);
+}
+
+/*
+ * This is what a 'fresh' dquot inside a dquot chunk looks like on disk.
+ */
+STATIC void
+xfs_qm_dqinit_core(
+	xfs_dqid_t	 id,
+	uint		 type,
+	xfs_dqblk_t	 *d)
+{
+	/*
+	 * Caller has bzero'd the entire dquot 'chunk' already.
+	 */
+	INT_SET(d->dd_diskdq.d_magic, ARCH_CONVERT, XFS_DQUOT_MAGIC);
+	INT_SET(d->dd_diskdq.d_version, ARCH_CONVERT, XFS_DQUOT_VERSION);
+	INT_SET(d->dd_diskdq.d_id, ARCH_CONVERT, id);
+	INT_SET(d->dd_diskdq.d_flags, ARCH_CONVERT, type);
+}
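+
+/* Note: the INT_SET()/ARCH_CONVERT pairs store each field in the
+ * on-disk byte order (big-endian on XFS), so a chunk initialized here
+ * is readable regardless of host endianness; INT_GET() performs the
+ * reverse conversion on read.
+ */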
+
+
+#ifdef DQUOT_TRACING
+/*
+ * Dquot tracing for debugging.
+ */
+/* ARGSUSED */
+void
+xfs_dqtrace_entry__(
+	xfs_dquot_t *dqp,
+	char *func,
+	void *retaddr,
+	xfs_inode_t *ip)
+{
+	xfs_dquot_t *udqp = NULL;
+	int ino;
+
+	ASSERT(dqp->q_trace);
+	if (ip) {
+		ino = ip->i_ino;
+		udqp = ip->i_udquot;
+	}
+	ktrace_enter(dqp->q_trace,
+		     (void *)(__psint_t)DQUOT_KTRACE_ENTRY,
+		     (void *)func,
+		     (void *)(__psint_t)dqp->q_nrefs,
+		     (void *)(__psint_t)dqp->dq_flags,
+		     (void *)(__psint_t)dqp->q_res_bcount,
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_icount, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_id, ARCH_CONVERT), /* 11 */
+		     (void *)(__psint_t)current_pid(),
+		     (void *)(__psint_t)ino,
+		     (void *)(__psint_t)retaddr,
+		     (void *)(__psint_t)udqp);
+	return;
+}
+#endif
+
+
+/*
+ * Check the limits and timers of a dquot and start or reset timers
+ * if necessary.
+ * This gets called even when quota enforcement is OFF, which makes our
+ * life a little less complicated. (We just don't reject any quota
+ * reservations in that case.)
+ * We also return 0 as the values of the timers in Q_GETQUOTA calls when
+ * enforcement is off.
+ * In contrast, warnings are a little different in that they don't
+ * 'automatically' get started when limits get exceeded.
+ */
+void
+xfs_qm_adjust_dqtimers(
+	xfs_mount_t		*mp,
+	xfs_disk_dquot_t	*d)
+{
+	/*
+	 * The dquot had better be locked. We are modifying it here.
+	 */
+
+	/*
+	 * root's limits are not real limits.
+	 */
+	if (INT_ISZERO(d->d_id, ARCH_CONVERT))
+		return;
+
+#ifdef QUOTADEBUG
+	if (INT_GET(d->d_blk_hardlimit, ARCH_CONVERT))
+		ASSERT(INT_GET(d->d_blk_softlimit, ARCH_CONVERT) <= INT_GET(d->d_blk_hardlimit, ARCH_CONVERT));
+	if (INT_GET(d->d_ino_hardlimit, ARCH_CONVERT))
+		ASSERT(INT_GET(d->d_ino_softlimit, ARCH_CONVERT) <= INT_GET(d->d_ino_hardlimit, ARCH_CONVERT));
+#endif
+	if (INT_ISZERO(d->d_btimer, ARCH_CONVERT)) {
+		if ((INT_GET(d->d_blk_softlimit, ARCH_CONVERT) &&
+		    (INT_GET(d->d_bcount, ARCH_CONVERT) >= INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) ||
+		    (INT_GET(d->d_blk_hardlimit, ARCH_CONVERT) &&
+		    (INT_GET(d->d_bcount, ARCH_CONVERT) >= INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)))) {
+			INT_SET(d->d_btimer, ARCH_CONVERT, CURRENT_TIME + XFS_QI_BTIMELIMIT(mp));
+		}
+	} else {
+		if ((INT_ISZERO(d->d_blk_softlimit, ARCH_CONVERT) ||
+		    (INT_GET(d->d_bcount, ARCH_CONVERT) < INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) &&
+		    (INT_ISZERO(d->d_blk_hardlimit, ARCH_CONVERT) ||
+		    (INT_GET(d->d_bcount, ARCH_CONVERT) < INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)))) {
+			INT_ZERO(d->d_btimer, ARCH_CONVERT);
+		}
+	}
+
+	if (INT_ISZERO(d->d_itimer, ARCH_CONVERT)) {
+		if ((INT_GET(d->d_ino_softlimit, ARCH_CONVERT) &&
+		    (INT_GET(d->d_icount, ARCH_CONVERT) >= INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) ||
+		    (INT_GET(d->d_ino_hardlimit, ARCH_CONVERT) &&
+		    (INT_GET(d->d_icount, ARCH_CONVERT) >= INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)))) {
+			INT_SET(d->d_itimer, ARCH_CONVERT, CURRENT_TIME + XFS_QI_ITIMELIMIT(mp));
+		}
+	} else {
+		if ((INT_ISZERO(d->d_ino_softlimit, ARCH_CONVERT) ||
+		    (INT_GET(d->d_icount, ARCH_CONVERT) < INT_GET(d->d_ino_softlimit, ARCH_CONVERT)))  &&
+		    (INT_ISZERO(d->d_ino_hardlimit, ARCH_CONVERT) ||
+		    (INT_GET(d->d_icount, ARCH_CONVERT) < INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)))) {
+			INT_ZERO(d->d_itimer, ARCH_CONVERT);
+		}
+	}
+}
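+
+/* Worked example for the timer logic above (illustrative): with
+ * d_blk_softlimit == 100, when d_bcount reaches 100 while d_btimer is
+ * zero, the timer is armed at CURRENT_TIME + XFS_QI_BTIMELIMIT(mp);
+ * if d_bcount later drops back below both limits, the timer is
+ * cleared again.
+ */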
+
+/*
+ * Increment or reset warnings of a given dquot.
+ */
+int
+xfs_qm_dqwarn(
+	xfs_disk_dquot_t	*d,
+	uint			flags)
+{
+	int	warned;
+
+	/*
+	 * root's limits are not real limits.
+	 */
+	if (INT_ISZERO(d->d_id, ARCH_CONVERT))
+		return (0);
+
+	warned = 0;
+	if (INT_GET(d->d_blk_softlimit, ARCH_CONVERT) &&
+	    (INT_GET(d->d_bcount, ARCH_CONVERT) >= INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) {
+		if (flags & XFS_QMOPT_DOWARN) {
+			INT_MOD(d->d_bwarns, ARCH_CONVERT, +1);
+			warned++;
+		}
+	} else {
+		if (INT_ISZERO(d->d_blk_softlimit, ARCH_CONVERT) ||
+		    (INT_GET(d->d_bcount, ARCH_CONVERT) < INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) {
+			INT_ZERO(d->d_bwarns, ARCH_CONVERT);
+		}
+	}
+
+	if (INT_GET(d->d_ino_softlimit, ARCH_CONVERT) > 0 &&
+	    (INT_GET(d->d_icount, ARCH_CONVERT) >= INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) {
+		if (flags & XFS_QMOPT_DOWARN) {
+			INT_MOD(d->d_iwarns, ARCH_CONVERT, +1);
+			warned++;
+		}
+	} else {
+		if ((INT_ISZERO(d->d_ino_softlimit, ARCH_CONVERT)) ||
+		    (INT_GET(d->d_icount, ARCH_CONVERT) < INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) {
+			INT_ZERO(d->d_iwarns, ARCH_CONVERT);
+		}
+	}
+#ifdef QUOTADEBUG
+	if (INT_GET(d->d_iwarns, ARCH_CONVERT))
+		printk("--------@@Inode warnings running : %Lu >= %Lu\n",
+		       INT_GET(d->d_icount, ARCH_CONVERT), INT_GET(d->d_ino_softlimit, ARCH_CONVERT));
+	if (INT_GET(d->d_bwarns, ARCH_CONVERT))
+		printk("--------@@Blks warnings running : %Lu >= %Lu\n",
+		       INT_GET(d->d_bcount, ARCH_CONVERT), INT_GET(d->d_blk_softlimit, ARCH_CONVERT));
+#endif
+	return (warned);
+}
+
+
+/*
+ * initialize a buffer full of dquots and log the whole thing
+ */
+STATIC void
+xfs_qm_init_dquot_blk(
+	xfs_trans_t	*tp,
+	xfs_mount_t	*mp,
+	xfs_dqid_t	id,
+	uint		type,
+	xfs_buf_t	*bp)
+{
+	xfs_dqblk_t	*d;
+	int		curid, i;
+
+	ASSERT(tp);
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+
+	d = (xfs_dqblk_t *)XFS_BUF_PTR(bp);
+
+	/*
+	 * ID of the first dquot in the block - id's are zero based.
+	 */
+	curid = id - (id % XFS_QM_DQPERBLK(mp));
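+	/* e.g. with XFS_QM_DQPERBLK(mp) == 30, id 77 maps to the chunk
+	 * whose first id is 60 (illustrative example). */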
+	ASSERT(curid >= 0);
+	bzero(d, BBTOB(XFS_QI_DQCHUNKLEN(mp)));
+	for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++)
+		xfs_qm_dqinit_core(curid, type, d);
+	xfs_trans_dquot_buf(tp, bp,
+			    type & XFS_DQ_USER ?
+			    XFS_BLI_UDQUOT_BUF :
+			    XFS_BLI_GDQUOT_BUF);
+	xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1);
+}
+
+
+
+/*
+ * Allocate a block and fill it with dquots.
+ * This is called when the bmapi finds a hole.
+ */
+STATIC int
+xfs_qm_dqalloc(
+	xfs_trans_t	*tp,
+	xfs_mount_t	*mp,
+	xfs_dquot_t	*dqp,
+	xfs_inode_t	*quotip,
+	xfs_fileoff_t	offset_fsb,
+	xfs_buf_t	**O_bpp)
+{
+	xfs_fsblock_t	firstblock;
+	xfs_bmap_free_t flist;
+	xfs_bmbt_irec_t map;
+	int		nmaps, error, committed;
+	xfs_buf_t	*bp;
+
+	ASSERT(tp != NULL);
+	xfs_dqtrace_entry(dqp, "DQALLOC");
+
+	/*
+	 * Initialize the bmap freelist prior to calling bmapi code.
+	 */
+	XFS_BMAP_INIT(&flist, &firstblock);
+	xfs_ilock(quotip, XFS_ILOCK_EXCL);
+	/*
+	 * Return if this type of quota was turned off while we didn't
+	 * hold the inode lock
+	 */
+	if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+		xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+		return (ESRCH);
+	}
+
+	/*
+	 * xfs_trans_commit normally decrements the vnode ref count
+	 * when it unlocks the inode. Since we want to keep the quota
+	 * inode around, we bump the vnode ref count now.
+	 */
+	VN_HOLD(XFS_ITOV(quotip));
+
+	xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
+	nmaps = 1;
+	if ((error = xfs_bmapi(tp, quotip,
+			      offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
+			      XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
+			      &firstblock,
+			      XFS_QM_DQALLOC_SPACE_RES(mp),
+			      &map, &nmaps, &flist))) {
+		goto error0;
+	}
+	ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
+	ASSERT(nmaps == 1);
+	ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+	       (map.br_startblock != HOLESTARTBLOCK));
+
+	/*
+	 * Keep track of the blkno to save a lookup later
+	 */
+	dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+	/* now we can just get the buffer (there's nothing to read yet) */
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+			       dqp->q_blkno,
+			       XFS_QI_DQCHUNKLEN(mp),
+			       0);
+	if (!bp || (error = XFS_BUF_GETERROR(bp)))
+		goto error1;
+	/*
+	 * Make a chunk of dquots out of this buffer and log
+	 * the entire thing.
+	 */
+	xfs_qm_init_dquot_blk(tp, mp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT),
+			      dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP),
+			      bp);
+
+	if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) {
+		goto error1;
+	}
+
+	*O_bpp = bp;
+	return 0;
+
+      error1:
+	xfs_bmap_cancel(&flist);
+      error0:
+	xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+
+	return (error);
+}
+
+/*
+ * Maps a dquot to the buffer containing its on-disk version.
+ * This returns a ptr to the buffer containing the on-disk dquot
+ * in the bpp param, and a ptr to the on-disk dquot within that buffer
+ */
+STATIC int
+xfs_qm_dqtobp(
+	xfs_trans_t		*tp,
+	xfs_dquot_t		*dqp,
+	xfs_disk_dquot_t	**O_ddpp,
+	xfs_buf_t		**O_bpp,
+	uint			flags)
+{
+	xfs_bmbt_irec_t map;
+	int		nmaps, error;
+	xfs_buf_t	*bp;
+	xfs_inode_t	*quotip;
+	xfs_mount_t	*mp;
+	xfs_disk_dquot_t *ddq;
+	xfs_dqid_t	id;
+	boolean_t	newdquot;
+
+	mp = dqp->q_mount;
+	id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT);
+	nmaps = 1;
+	newdquot = B_FALSE;
+
+	/*
+	 * If we don't know where the dquot lives, find out.
+	 */
+	if (dqp->q_blkno == (xfs_daddr_t) 0) {
+		/* We use the id as an index */
+		dqp->q_fileoffset = (xfs_fileoff_t) ((uint)id /
+						     XFS_QM_DQPERBLK(mp));
+		nmaps = 1;
+		quotip = XFS_DQ_TO_QIP(dqp);
+		xfs_ilock(quotip, XFS_ILOCK_SHARED);
+		/*
+		 * Return if this type of quota was turned off while we
+		 * didn't hold the inode lock
+		 */
+		if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+			xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+			return (ESRCH);
+		}
+		/*
+		 * Find the block map; no allocations yet
+		 */
+		error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
+				  XFS_DQUOT_CLUSTER_SIZE_FSB,
+				  XFS_BMAPI_METADATA,
+				  NULL, 0, &map, &nmaps, NULL);
+
+		xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+		if (error)
+			return (error);
+		ASSERT(nmaps == 1);
+		ASSERT(map.br_blockcount == 1);
+
+		/*
+		 * Offset of the dquot within the (fixed-size) dquot chunk.
+		 */
+		dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) *
+			sizeof(xfs_dqblk_t);
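+		/* e.g. id 77 with 30 dquots per block sits at offset
+		 * 17 * sizeof(xfs_dqblk_t) within its chunk
+		 * (illustrative example). */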
+		if (map.br_startblock == HOLESTARTBLOCK) {
+			/*
+			 * We don't allocate unless we're asked to
+			 */
+			if (!(flags & XFS_QMOPT_DQALLOC))
+				return (ENOENT);
+
+			ASSERT(tp);
+			if ((error = xfs_qm_dqalloc(tp, mp, dqp, quotip,
+						dqp->q_fileoffset, &bp)))
+				return (error);
+			newdquot = B_TRUE;
+		} else {
+			/*
+			 * store the blkno etc so that we don't have to do the
+			 * mapping all the time
+			 */
+			dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+		}
+	}
+	ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
+	ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
+
+	/*
+	 * Read in the buffer, unless we've just done the allocation
+	 * (in which case we already have the buf).
+	 */
+	if (! newdquot) {
+		xfs_dqtrace_entry(dqp, "DQTOBP READBUF");
+		if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+					       dqp->q_blkno,
+					       XFS_QI_DQCHUNKLEN(mp),
+					       0, &bp))) {
+			return (error);
+		}
+		if (error || !bp)
+			return XFS_ERROR(error);
+	}
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+
+	/*
+	 * calculate the location of the dquot inside the buffer.
+	 */
+	ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset);
+
+	/*
+	 * A simple sanity check in case we got a corrupted dquot...
+	 */
+	if (xfs_qm_dqcheck(ddq, id,
+			   dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP),
+			   flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
+			   "dqtobp")) {
+		if (!(flags & XFS_QMOPT_DQREPAIR)) {
+			xfs_trans_brelse(tp, bp);
+			return XFS_ERROR(EIO);
+		}
+		XFS_BUF_BUSY(bp); /* We dirtied this */
+	}
+
+	*O_bpp = bp;
+	*O_ddpp = ddq;
+
+	return (0);
+}
+
+
+/*
+ * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
+ * and release the buffer immediately.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_dqread(
+	xfs_trans_t	*tp,
+	xfs_dqid_t	id,
+	xfs_dquot_t	*dqp,	/* dquot to get filled in */
+	uint		flags)
+{
+	xfs_disk_dquot_t *ddqp;
+	xfs_buf_t	 *bp;
+	int		 error;
+
+	/*
+	 * Get a pointer to the on-disk dquot and the buffer containing it.
+	 * dqp already knows its own type (GROUP/USER).
+	 */
+	xfs_dqtrace_entry(dqp, "DQREAD");
+	if ((error = xfs_qm_dqtobp(tp, dqp, &ddqp, &bp, flags))) {
+		return (error);
+	}
+
+	/* copy everything from disk dquot to the incore dquot */
+	bcopy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
+	ASSERT(INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id);
+	xfs_qm_dquot_logitem_init(dqp);
+
+	/*
+	 * Reservation counters are defined as reservation plus current usage
+	 * to avoid having to add every time.
+	 */
+	dqp->q_res_bcount = INT_GET(ddqp->d_bcount, ARCH_CONVERT);
+	dqp->q_res_icount = INT_GET(ddqp->d_icount, ARCH_CONVERT);
+	dqp->q_res_rtbcount = INT_GET(ddqp->d_rtbcount, ARCH_CONVERT);
+
+	/* Mark the buf so that this will stay incore a little longer */
+	XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF);
+
+	/*
+	 * We got the buffer with an xfs_trans_read_buf() (in dqtobp())
+	 * So we need to release with xfs_trans_brelse().
+	 * The strategy here is identical to that of inodes; we lock
+	 * the dquot in xfs_qm_dqget() before making it accessible to
+	 * others. This is because dquots, like inodes, need a good level of
+	 * concurrency, and we don't want to take locks on the entire buffers
+	 * for dquot accesses.
+	 * Note also that the dquot buffer may even be dirty at this point, if
+	 * this particular dquot was repaired. We still aren't afraid to
+	 * brelse it because we have the changes incore.
+	 */
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+	xfs_trans_brelse(tp, bp);
+
+	return (error);
+}
+
+
+/*
+ * allocate an incore dquot from the kernel heap,
+ * and fill its core with quota information kept on disk.
+ * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk
+ * if it wasn't already allocated.
+ */
+STATIC int
+xfs_qm_idtodq(
+	xfs_mount_t	*mp,
+	xfs_dqid_t	id,	 /* gid or uid, depending on type */
+	uint		type,	 /* UDQUOT or GDQUOT */
+	uint		flags,	 /* DQALLOC, DQREPAIR */
+	xfs_dquot_t	**O_dqpp)/* OUT : incore dquot, not locked */
+{
+	xfs_dquot_t	*dqp;
+	int		error;
+	xfs_trans_t	*tp;
+	int		cancelflags=0;
+
+	dqp = xfs_qm_dqinit(mp, id, type);
+	tp = NULL;
+	if (flags & XFS_QMOPT_DQALLOC) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
+		if ((error = xfs_trans_reserve(tp,
+				       XFS_QM_DQALLOC_SPACE_RES(mp),
+				       XFS_WRITE_LOG_RES(mp) +
+					      BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 +
+					      128,
+				       0,
+				       XFS_TRANS_PERM_LOG_RES,
+				       XFS_WRITE_LOG_COUNT))) {
+			cancelflags = 0;
+			goto error0;
+		}
+		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+	}
+
+	/*
+	 * Read it from disk; xfs_dqread() takes care of
+	 * all the necessary initialization of dquot's fields (locks, etc)
+	 */
+	if ((error = xfs_qm_dqread(tp, id, dqp, flags))) {
+		/*
+		 * This can happen if quotas got turned off (ESRCH),
+		 * or if the dquot didn't exist on disk and we ask to
+		 * allocate (ENOENT).
+		 */
+		xfs_dqtrace_entry(dqp, "DQREAD FAIL");
+		cancelflags |= XFS_TRANS_ABORT;
+		goto error0;
+	}
+	if (tp) {
+		if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
+					     NULL)))
+			goto error1;
+	}
+
+	*O_dqpp = dqp;
+	ASSERT(! XFS_DQ_IS_LOCKED(dqp));
+	return (0);
+
+ error0:
+	ASSERT(error);
+	if (tp)
+		xfs_trans_cancel(tp, cancelflags);
+ error1:
+	xfs_qm_dqdestroy(dqp);
+	*O_dqpp = NULL;
+	return (error);
+}
+
+/*
+ * Lookup a dquot in the incore dquot hashtable. We keep two separate
+ * hashtables for user and group dquots; these are global tables
+ * inside the XQM, not per-filesystem tables.
+ * The hash chain must be locked by the caller, and it is left locked
+ * on return. The returned dquot is locked.
+ */
+STATIC int
+xfs_qm_dqlookup(
+	xfs_mount_t		*mp,
+	xfs_dqid_t		id,
+	xfs_dqhash_t		*qh,
+	xfs_dquot_t		**O_dqpp)
+{
+	xfs_dquot_t		*dqp;
+	uint			flist_locked;
+	xfs_dquot_t		*d;
+
+	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+
+	flist_locked = B_FALSE;
+
+	/*
+	 * Traverse the hashchain looking for a match
+	 */
+	for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) {
+		/*
+		 * We already have the hashlock. We don't need the
+		 * dqlock to look at the id field of the dquot, since the
+		 * id can't be modified without the hashlock anyway.
+		 */
+		if (INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id && dqp->q_mount == mp) {
+			xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP");
+			/*
+			 * All in core dquots must be on the dqlist of mp
+			 */
+			ASSERT(dqp->MPL_PREVP != NULL);
+
+			xfs_dqlock(dqp);
+			if (dqp->q_nrefs == 0) {
+				ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
+				if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+					xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT");
+
+					/*
+					 * We may have raced with dqreclaim_one()
+					 * (and lost). So, flag that we don't
+					 * want the dquot to be reclaimed.
+					 */
+					dqp->dq_flags |= XFS_DQ_WANT;
+					xfs_dqunlock(dqp);
+					xfs_qm_freelist_lock(xfs_Gqm);
+					xfs_dqlock(dqp);
+					dqp->dq_flags &= ~(XFS_DQ_WANT);
+				}
+				flist_locked = B_TRUE;
+			}
+
+			/*
+			 * id couldn't have changed; we had the hashlock all
+			 * along
+			 */
+			ASSERT(INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id);
+
+			if (flist_locked) {
+				if (dqp->q_nrefs != 0) {
+					xfs_qm_freelist_unlock(xfs_Gqm);
+					flist_locked = B_FALSE;
+				} else {
+					/*
+					 * take it off the freelist
+					 */
+					xfs_dqtrace_entry(dqp,
+							"DQLOOKUP: TAKEOFF FL");
+					XQM_FREELIST_REMOVE(dqp);
+					/* xfs_qm_freelist_print(&(xfs_Gqm->
+							qm_dqfreelist),
+							"after removal"); */
+				}
+			}
+
+			/*
+			 * grab a reference
+			 */
+			XFS_DQHOLD(dqp);
+
+			if (flist_locked)
+				xfs_qm_freelist_unlock(xfs_Gqm);
+			/*
+			 * move the dquot to the front of the hashchain
+			 */
+			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+			if (dqp->HL_PREVP != &qh->qh_next) {
+				xfs_dqtrace_entry(dqp,
+						  "DQLOOKUP: HASH MOVETOFRONT");
+				if ((d = dqp->HL_NEXT))
+					d->HL_PREVP = dqp->HL_PREVP;
+				*(dqp->HL_PREVP) = d;
+				d = qh->qh_next;
+				d->HL_PREVP = &dqp->HL_NEXT;
+				dqp->HL_NEXT = d;
+				dqp->HL_PREVP = &qh->qh_next;
+				qh->qh_next = dqp;
+			}
+			xfs_dqtrace_entry(dqp, "LOOKUP END");
+			*O_dqpp = dqp;
+			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+			return (0);
+		}
+	}
+
+	*O_dqpp = NULL;
+	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+	return (1);
+}
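+
+/* Note: unlike most routines here, xfs_qm_dqlookup() returns 0 on a
+ * cache hit and 1 on a miss rather than an errno; both call sites in
+ * xfs_qm_dqget() test for == 0 accordingly.
+ */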
+
+/*
+ * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return
+ * a locked dquot, doing an allocation (if requested) as needed.
+ * When both an inode and an id are given, the inode's id takes precedence.
+ * That is, if the id changes while we don't hold the ilock inside this
+ * function, the new dquot is returned, not necessarily the one requested
+ * in the id argument.
+ */
+int
+xfs_qm_dqget(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,	  /* locked inode (optional) */
+	xfs_dqid_t	id,	  /* gid or uid, depending on type */
+	uint		type,	  /* UDQUOT or GDQUOT */
+	uint		flags,	  /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
+	xfs_dquot_t	**O_dqpp) /* OUT : locked incore dquot */
+{
+	xfs_dquot_t	*dqp;
+	xfs_dqhash_t	*h;
+	uint		version;
+	int		error;
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+	if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
+	    (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
+		return (ESRCH);
+	}
+	h = XFS_DQ_HASH(mp, id, type);
+
+#ifdef DEBUG
+	if (xfs_do_dqerror) {
+		if ((xfs_dqerror_dev == mp->m_dev) &&
+		    (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
+			printk("Returning error in dqget\n");
+			return (EIO);
+		}
+	}
+#endif
+
+ again:
+
+#ifdef DEBUG
+	ASSERT(type == XFS_DQ_USER || type == XFS_DQ_GROUP);
+	if (ip) {
+		ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+		if (type == XFS_DQ_USER)
+			ASSERT(ip->i_udquot == NULL);
+		else
+			ASSERT(ip->i_gdquot == NULL);
+	}
+#endif
+	XFS_DQ_HASH_LOCK(h);
+
+	/*
+	 * Look in the cache (hashtable).
+	 * The chain is kept locked during lookup.
+	 */
+	if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) {
+		XFS_STATS_INC(xfsstats.xs_qm_dqcachehits);
+		/*
+		 * The dquot was found, moved to the front of the chain,
+		 * taken off the freelist if it was on it, and locked
+		 * at this point. Just unlock the hashchain and return.
+		 */
+		ASSERT(*O_dqpp);
+		ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
+		XFS_DQ_HASH_UNLOCK(h);
+		xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
+		return (0);	/* success */
+	}
+	XFS_STATS_INC(xfsstats.xs_qm_dqcachemisses);
+
+	/*
+	 * Dquot cache miss. We don't want to keep the inode lock across
+	 * a (potential) disk read. Also we don't want to deal with the lock
+	 * ordering between quotainode and this inode. OTOH, dropping the inode
+	 * lock here means dealing with a chown that can happen before
+	 * we re-acquire the lock.
+	 */
+	if (ip)
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * Save the hashchain version stamp, and unlock the chain, so that
+	 * we don't keep the lock across a disk read
+	 */
+	version = h->qh_version;
+	XFS_DQ_HASH_UNLOCK(h);
+
+	/*
+	 * Allocate the dquot on the kernel heap, and read the ondisk
+	 * portion off the disk. Also, do all the necessary initialization.
+	 * This can return ENOENT if dquot didn't exist on disk and we didn't
+	 * ask it to allocate; ESRCH if quotas got turned off suddenly.
+	 */
+	if ((error = xfs_qm_idtodq(mp, id, type,
+				  flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR|
+					   XFS_QMOPT_DOWARN),
+				  &dqp))) {
+		if (ip)
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+		return (error);
+	}
+
+	/*
+	 * See if this is mount code calling to look at the overall quota limits
+	 * which are stored in the id == 0 user or group's dquot.
+	 * Since we may not have done a quotacheck by this point, just return
+	 * the dquot without attaching it to any hashtables, lists, etc, or even
+	 * taking a reference.
+	 * The caller must dqdestroy this once done.
+	 */
+	if (flags & XFS_QMOPT_DQSUSER) {
+		ASSERT(id == 0);
+		ASSERT(! ip);
+		goto dqret;
+	}
+
+	/*
+	 * Dquot lock comes after hashlock in the lock ordering
+	 */
+	ASSERT(! XFS_DQ_IS_LOCKED(dqp));
+	if (ip) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (! XFS_IS_DQTYPE_ON(mp, type)) {
+			/* inode stays locked on return */
+			xfs_qm_dqdestroy(dqp);
+			return XFS_ERROR(ESRCH);
+		}
+		/*
+		 * A dquot could be attached to this inode by now, since
+		 * we had dropped the ilock.
+		 */
+		if (type == XFS_DQ_USER) {
+			if (ip->i_udquot) {
+				xfs_qm_dqdestroy(dqp);
+				dqp = ip->i_udquot;
+				xfs_dqlock(dqp);
+				goto dqret;
+			}
+		} else {
+			if (ip->i_gdquot) {
+				xfs_qm_dqdestroy(dqp);
+				dqp = ip->i_gdquot;
+				xfs_dqlock(dqp);
+				goto dqret;
+			}
+		}
+	}
+
+	/*
+	 * Hashlock comes after ilock in lock order
+	 */
+	XFS_DQ_HASH_LOCK(h);
+	if (version != h->qh_version) {
+		xfs_dquot_t *tmpdqp;
+		/*
+		 * Now, see if somebody else put the dquot in the
+		 * hashtable before us. This can happen because we didn't
+		 * keep the hashchain lock. We don't have to worry about
+		 * lock order between the two dquots here since dqp isn't
+		 * on any findable lists yet.
+		 */
+		if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) {
+			/*
+			 * Duplicate found. Just throw away the new dquot
+			 * and start over.
+			 */
+			xfs_qm_dqput(tmpdqp);
+			XFS_DQ_HASH_UNLOCK(h);
+			xfs_qm_dqdestroy(dqp);
+			XFS_STATS_INC(xfsstats.xs_qm_dquot_dups);
+			goto again;
+		}
+	}
+
+	/*
+	 * Put the dquot at the beginning of the hash-chain and mp's list
+	 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
+	 */
+	ASSERT(XFS_DQ_IS_HASH_LOCKED(h));
+	dqp->q_hash = h;
+	XQM_HASHLIST_INSERT(h, dqp);
+
+	/*
+	 * Attach this dquot to this filesystem's list of all dquots,
+	 * kept inside the mount structure in m_quotainfo field
+	 */
+	xfs_qm_mplist_lock(mp);
+
+	/*
+	 * We return a locked dquot to the caller, with a reference taken
+	 */
+	xfs_dqlock(dqp);
+	dqp->q_nrefs = 1;
+
+	XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
+
+	xfs_qm_mplist_unlock(mp);
+	XFS_DQ_HASH_UNLOCK(h);
+ dqret:
+	ASSERT((ip == NULL) || XFS_ISLOCKED_INODE_EXCL(ip));
+	xfs_dqtrace_entry(dqp, "DQGET DONE");
+	*O_dqpp = dqp;
+	return (0);
+}
+
+
+/*
+ * Release a reference to the dquot (decrement ref-count)
+ * and unlock it. If there is a group quota attached to this
+ * dquot, carefully release that too without tripping over
+ * deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+	xfs_dquot_t	*dqp)
+{
+	xfs_dquot_t	*gdqp;
+
+	ASSERT(dqp->q_nrefs > 0);
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	xfs_dqtrace_entry(dqp, "DQPUT");
+
+	if (dqp->q_nrefs != 1) {
+		dqp->q_nrefs--;
+		xfs_dqunlock(dqp);
+		return;
+	}
+
+	/*
+	 * drop the dqlock and acquire the freelist and dqlock
+	 * in the right order; but try to get it out-of-order first
+	 */
+	if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+		xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT");
+		xfs_dqunlock(dqp);
+		xfs_qm_freelist_lock(xfs_Gqm);
+		xfs_dqlock(dqp);
+	}
+
+	while (1) {
+		gdqp = NULL;
+
+		/* We can't depend on nrefs being == 1 here */
+		if (--dqp->q_nrefs == 0) {
+			xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST");
+			/*
+			 * insert at end of the freelist.
+			 */
+			XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
+
+			/*
+			 * If we just added a udquot to the freelist, then
+			 * we want to release the gdquot reference that
+			 * it (probably) has. Otherwise it'll keep the
+			 * gdquot from getting reclaimed.
+			 */
+			if ((gdqp = dqp->q_gdquot)) {
+				/*
+				 * Avoid a recursive dqput call
+				 */
+				xfs_dqlock(gdqp);
+				dqp->q_gdquot = NULL;
+			}
+
+			/* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
+			   "@@@@@++ Free list (after append) @@@@@+");
+			   */
+		}
+		xfs_dqunlock(dqp);
+
+		/*
+		 * If we had a group quota inside the user quota as a hint,
+		 * release it now.
+		 */
+		if (! gdqp)
+			break;
+		dqp = gdqp;
+	}
+	xfs_qm_freelist_unlock(xfs_Gqm);
+}
+
+/*
+ * Release a dquot. Flush it if dirty, then dqput() it.
+ * dquot must not be locked.
+ */
+void
+xfs_qm_dqrele(
+	xfs_dquot_t	*dqp)
+{
+	ASSERT(dqp);
+	xfs_dqtrace_entry(dqp, "DQRELE");
+
+	xfs_dqlock(dqp);
+	/*
+	 * Even if the dquot is dirty here, we don't bother flushing it;
+	 * doing so on every release would create stutters that we want
+	 * to avoid. Instead we do a delayed write when we try to reclaim
+	 * a dirty dquot. Also xfs_sync will take part of the burden...
+	 */
+	xfs_qm_dqput(dqp);
+}
+
+
+/*
+ * Write a modified dquot to disk.
+ * The caller must hold both the dquot lock and the flush lock.
+ * The flush lock will not be unlocked until the dquot reaches the disk,
+ * but the dquot is free to be unlocked and modified by the caller
+ * in the interim. Dquot is still locked on return. This behavior is
+ * identical to that of inodes.
+ */
+int
+xfs_qm_dqflush(
+	xfs_dquot_t		*dqp,
+	uint			flags)
+{
+	xfs_mount_t		*mp;
+	xfs_buf_t		*bp;
+	xfs_disk_dquot_t	*ddqp;
+	int			error;
+	SPLDECL(s);
+
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
+	xfs_dqtrace_entry(dqp, "DQFLUSH");
+
+	/*
+	 * If not dirty, nada.
+	 */
+	if (!XFS_DQ_IS_DIRTY(dqp)) {
+		xfs_dqfunlock(dqp);
+		return (0);
+	}
+
+	/*
+	 * Can't flush a pinned dquot. Wait for it.
+	 */
+	xfs_qm_dqunpin_wait(dqp);
+
+	/*
+	 * This may have been unpinned because the filesystem is shutting
+	 * down forcibly. If that's the case we must not write this dquot
+	 * to disk, because the log record didn't make it to disk!
+	 */
+	if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) {
+		dqp->dq_flags &= ~(XFS_DQ_DIRTY);
+		xfs_dqfunlock(dqp);
+		return XFS_ERROR(EIO);
+	}
+
+	/*
+	 * Get the buffer containing the on-disk dquot.
+	 * We don't need a transaction envelope because we know that
+	 * the on-disk dquot has already been allocated.
+	 */
+	if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
+		xfs_dqtrace_entry(dqp, "DQTOBP FAIL");
+		ASSERT(error != ENOENT);
+		/*
+		 * Quotas could have gotten turned off (ESRCH)
+		 */
+		xfs_dqfunlock(dqp);
+		return (error);
+	}
+
+	if (xfs_qm_dqcheck(&dqp->q_core, INT_GET(ddqp->d_id, ARCH_CONVERT),
+			   0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
+		xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE);
+		return XFS_ERROR(EIO);
+	}
+
+	/* This is the only portion of data that needs to persist */
+	bcopy(&(dqp->q_core), ddqp, sizeof(xfs_disk_dquot_t));
+
+	/*
+	 * Clear the dirty field and remember the flush lsn for later use.
+	 */
+	dqp->dq_flags &= ~(XFS_DQ_DIRTY);
+	mp = dqp->q_mount;
+
+	/* lsn is 64 bits */
+	AIL_LOCK(mp, s);
+	dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
+	AIL_UNLOCK(mp, s);
+
+	/*
+	 * Attach an iodone routine so that we can remove this dquot from the
+	 * AIL and release the flush lock once the dquot is synced to disk.
+	 */
+	xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *))
+			      xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item));
+	/*
+	 * If the buffer is pinned then push on the log so we won't
+	 * get stuck waiting in the write for too long.
+	 */
+	if (XFS_BUF_ISPINNED(bp)) {
+		xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE");
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+	}
+
+	if (flags & XFS_QMOPT_DELWRI) {
+		xfs_bdwrite(mp, bp);
+	} else if (flags & XFS_QMOPT_ASYNC) {
+		xfs_bawrite(mp, bp);
+	} else {
+		error = xfs_bwrite(mp, bp);
+	}
+	xfs_dqtrace_entry(dqp, "DQFLUSH END");
+	/*
+	 * dqp is still locked, but caller is free to unlock it now.
+	 */
+	return (error);
+}
+
+/*
+ * This is the dquot flushing I/O completion routine.  It is called
+ * from interrupt level when the buffer containing the dquot is
+ * flushed to disk.  It is responsible for removing the dquot logitem
+ * from the AIL if it has not been re-logged, and unlocking the dquot's
+ * flush lock. This behavior is very similar to that of inodes.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_qm_dqflush_done(
+	xfs_buf_t		*bp,
+	xfs_dq_logitem_t	*qip)
+{
+	xfs_dquot_t		*dqp;
+	SPLDECL(s);
+
+	dqp = qip->qli_dquot;
+
+	/*
+	 * We only want to pull the item from the AIL if its location
+	 * in the log has not changed since we started the flush, i.e.
+	 * if the dquot's lsn is still the flush lsn we recorded.
+	 * First we check the lsn outside the lock since it's cheaper,
+	 * and then we recheck while holding the lock before removing
+	 * the dquot from the AIL.
+	 */
+	if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
+	    qip->qli_item.li_lsn == qip->qli_flush_lsn) {
+
+		AIL_LOCK(dqp->q_mount, s);
+		/*
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
+			xfs_trans_delete_ail(dqp->q_mount,
+					     (xfs_log_item_t*)qip, s);
+		else
+			AIL_UNLOCK(dqp->q_mount, s);
+	}
+
+	/*
+	 * Release the dq's flush lock since we're done with it.
+	 */
+	xfs_dqfunlock(dqp);
+}
+
+
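+/*
+ * Try to take the dquot's flush lock without sleeping. Returns
+ * nonzero if the lock was obtained; the XFS_DQ_FLOCKED flag just
+ * shadows the semaphore state (see the XXX below about compiling
+ * that out).
+ */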
+int
+xfs_qm_dqflock_nowait(
+	xfs_dquot_t *dqp)
+{
+	int locked;
+
+	locked = cpsema(&((dqp)->q_flock));
+
+	/* XXX ifdef these out */
+	if (locked)
+		(dqp)->dq_flags |= XFS_DQ_FLOCKED;
+	return (locked);
+}
+
+
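+/*
+ * Try to take the dquot lock without sleeping. Returns nonzero
+ * if the lock was obtained.
+ */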
+int
+xfs_qm_dqlock_nowait(
+	xfs_dquot_t *dqp)
+{
+	return (mutex_trylock(&((dqp)->q_qlock)));
+}
+
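+/*
+ * Take the dquot lock, sleeping until it is acquired (PINOD is the
+ * sleep priority used throughout this IRIX-derived code).
+ */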
+void
+xfs_dqlock(
+	xfs_dquot_t *dqp)
+{
+	mutex_lock(&(dqp->q_qlock), PINOD);
+}
+
+void
+xfs_dqunlock(
+	xfs_dquot_t *dqp)
+{
+	mutex_unlock(&(dqp->q_qlock));
+	if (dqp->q_logitem.qli_dquot == dqp) {
+		/* Once was dqp->q_mount, but might just have been cleared */
+		xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp,
+					(xfs_log_item_t*)&(dqp->q_logitem));
+	}
+}
+
+
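+/*
+ * As xfs_dqunlock(), but without notifying the AIL that the
+ * dquot's log item may now be pushable.
+ */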
+void
+xfs_dqunlock_nonotify(
+	xfs_dquot_t *dqp)
+{
+	mutex_unlock(&(dqp->q_qlock));
+}
+
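+/*
+ * Lock up to two dquots without deadlocking: when both are given,
+ * always take the one with the smaller id first, so that any two
+ * threads locking the same pair acquire the locks in the same order.
+ */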
+void
+xfs_dqlock2(
+	xfs_dquot_t	*d1,
+	xfs_dquot_t	*d2)
+{
+	if (d1 && d2) {
+		ASSERT(d1 != d2);
+		if (INT_GET(d1->q_core.d_id, ARCH_CONVERT) > INT_GET(d2->q_core.d_id, ARCH_CONVERT)) {
+			xfs_dqlock(d2);
+			xfs_dqlock(d1);
+		} else {
+			xfs_dqlock(d1);
+			xfs_dqlock(d2);
+		}
+	} else {
+		if (d1) {
+			xfs_dqlock(d1);
+		} else if (d2) {
+			xfs_dqlock(d2);
+		}
+	}
+}
+
+
+/*
+ * A rarely used accessor. This exists because we don't really want
+ * to expose the internals of a dquot to the outside world.
+ */
+xfs_dqid_t
+xfs_qm_dqid(
+	xfs_dquot_t	*dqp)
+{
+	return (INT_GET(dqp->q_core.d_id, ARCH_CONVERT));
+}
+
+
+/*
+ * Take a dquot out of the mount's dqlist as well as the hashlist.
+ * This is called from unmount as well as quotaoff, and the purge
+ * will always succeed unless there are soft (temp) references
+ * outstanding.
+ *
+ * This returns 0 if it was purged, 1 if it wasn't. It's not an error code
+ * that we're returning! XXXsup - not cool.
+ */
+/* ARGSUSED */
+int
+xfs_qm_dqpurge(
+	xfs_dquot_t	*dqp,
+	uint		flags)
+{
+	xfs_dqhash_t	*thishash;
+	xfs_mount_t	*mp;
+
+	mp = dqp->q_mount;
+
+	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
+
+	xfs_dqlock(dqp);
+	/*
+	 * We really can't afford to purge a dquot that is
+	 * referenced, because these are hard refs.
+	 * It shouldn't happen in general because we went through _all_ inodes in
+	 * dqrele_all_inodes before calling this and didn't let the mountlock go.
+	 * However it is possible that we have dquots with temporary
+	 * references that are not attached to an inode. e.g. see xfs_setattr().
+	 */
+	if (dqp->q_nrefs != 0) {
+		xfs_dqunlock(dqp);
+		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+		return (1);
+	}
+
+	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+
+	/*
+	 * If we're turning off quotas, we have to make sure that, for
+	 * example, we don't delete quota disk blocks while dquots are
+	 * in the process of getting written to those disk blocks.
+	 * This dquot might well be on AIL, and we can't leave it there
+	 * if we're turning off quotas. Basically, we need this flush
+	 * lock, and are willing to block on it.
+	 */
+	if (! xfs_qm_dqflock_nowait(dqp)) {
+		/*
+		 * Block on the flush lock after nudging dquot buffer,
+		 * if it is incore.
+		 */
+		xfs_qm_dqflock_pushbuf_wait(dqp);
+	}
+
+	/*
+	 * XXX If we're turning this type of quotas off, we don't care
+	 * about the dirty metadata sitting in this dquot. OTOH, if
+	 * we're unmounting, we do care, so we flush it and wait.
+	 */
+	if (XFS_DQ_IS_DIRTY(dqp)) {
+		xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
+		/* dqflush unlocks dqflock */
+		/*
+		 * Given that dqpurge is a very rare occurrence, it is OK
+		 * that we're holding the hashlist and mplist locks
+		 * across the disk write. But, ... XXXsup
+		 *
+		 * We don't care about getting disk errors here. We need
+		 * to purge this dquot anyway, so we go ahead regardless.
+		 */
+		(void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+		xfs_dqflock(dqp);
+	}
+	ASSERT(dqp->q_pincount == 0);
+	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+	       !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+
+	thishash = dqp->q_hash;
+	XQM_HASHLIST_REMOVE(thishash, dqp);
+	XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp);
+	/*
+	 * XXX Move this to the front of the freelist, if we can get the
+	 * freelist lock.
+	 */
+	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+
+	dqp->q_mount = NULL;
+	dqp->q_hash = NULL;
+	dqp->dq_flags = XFS_DQ_INACTIVE;
+	bzero(&dqp->q_core, sizeof(dqp->q_core));
+	xfs_dqfunlock(dqp);
+	xfs_dqunlock(dqp);
+	XFS_DQ_HASH_UNLOCK(thishash);
+	return (0);
+}
+
+
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
+ * This is not just for debugging; it is also useful for detecting
+ * data corruption, mainly due to disk failures.
+ */
+/* ARGSUSED */
+int
+xfs_qm_dqcheck(
+	xfs_disk_dquot_t *ddq,
+	xfs_dqid_t	 id,
+	uint		 type,	  /* used only when IO_dorepair is true */
+	uint		 flags,
+	char		 *str)
+{
+	int errs;
+
+	errs = 0;
+	/* ASSERT(flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN)); */
+	/*
+	 * We can encounter an uninitialized dquot buffer for 2 reasons:
+	 * 1. If we crash while deleting the quotainode(s), and those blks get used
+	 *    for some user data. This is because we take the path of regular
+	 *    file deletion; however, the size field of quotainodes is never
+	 *    updated, so all the tricks that we play in itruncate_finish
+	 *    don't quite matter.
+	 *
+	 * 2. We don't replay the quota buffers when there's a quotaoff logitem.
+	 *    But the allocation will be replayed so we'll end up with an
+	 *    uninitialized quota block.
+	 *
+	 * This is all fine; things are still consistent, and we haven't lost
+	 * any quota information. Just don't complain about bad dquot blks.
+	 */
+	if (INT_GET(ddq->d_magic, ARCH_CONVERT) != XFS_DQUOT_MAGIC) {
+		if (flags & XFS_QMOPT_DOWARN)
+			cmn_err(CE_ALERT,
+			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
+			str, id, INT_GET(ddq->d_magic, ARCH_CONVERT), XFS_DQUOT_MAGIC);
+		errs++;
+	}
+	if (INT_GET(ddq->d_version, ARCH_CONVERT) != XFS_DQUOT_VERSION) {
+		if (flags & XFS_QMOPT_DOWARN)
+			cmn_err(CE_ALERT,
+			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
+			str, id, INT_GET(ddq->d_version, ARCH_CONVERT), XFS_DQUOT_VERSION);
+		errs++;
+	}
+
+	if (INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_USER &&
+	    INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_GROUP) {
+		if (flags & XFS_QMOPT_DOWARN)
+			cmn_err(CE_ALERT,
+			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
+			str, id, INT_GET(ddq->d_flags, ARCH_CONVERT));
+		errs++;
+	}
+
+	if (id != -1 && id != INT_GET(ddq->d_id, ARCH_CONVERT)) {
+		if (flags & XFS_QMOPT_DOWARN)
+			cmn_err(CE_ALERT,
+			"%s : ondisk-dquot 0x%x, ID mismatch: "
+			"0x%x expected, found id 0x%x",
+			str, ddq, id, INT_GET(ddq->d_id, ARCH_CONVERT));
+		errs++;
+	}
+
+	if (! errs) {
+		if (INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT) &&
+		    INT_GET(ddq->d_bcount, ARCH_CONVERT) >= INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT)) {
+			if (INT_ISZERO(ddq->d_btimer, ARCH_CONVERT) && !INT_ISZERO(ddq->d_id, ARCH_CONVERT)) {
+				if (flags & XFS_QMOPT_DOWARN)
+					cmn_err(CE_ALERT,
+					"%s : Dquot ID 0x%x (0x%x) "
+					"BLK TIMER NOT STARTED",
+					str, (int) INT_GET(ddq->d_id, ARCH_CONVERT), ddq);
+				errs++;
+			}
+		}
+		if (INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT) &&
+		    INT_GET(ddq->d_icount, ARCH_CONVERT) >= INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT)) {
+			if (INT_ISZERO(ddq->d_itimer, ARCH_CONVERT) && !INT_ISZERO(ddq->d_id, ARCH_CONVERT)) {
+				if (flags & XFS_QMOPT_DOWARN)
+					cmn_err(CE_ALERT,
+					"%s : Dquot ID 0x%x (0x%x) "
+					"INODE TIMER NOT STARTED",
+					str, (int) INT_GET(ddq->d_id, ARCH_CONVERT), ddq);
+				errs++;
+			}
+		}
+	}
+
+	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
+		return (errs);
+
+	if (flags & XFS_QMOPT_DOWARN)
+		cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
+
+	/*
+	 * Typically, a repair is only requested by quotacheck.
+	 */
+	ASSERT(id != -1);
+	ASSERT(flags & XFS_QMOPT_DQREPAIR);
+	bzero(ddq, sizeof(xfs_dqblk_t));
+	xfs_qm_dqinit_core(id, type, (xfs_dqblk_t *)ddq);
+	return (errs);
+}
+
+#ifdef QUOTADEBUG
+void
+xfs_qm_dqprint(xfs_dquot_t *dqp)
+{
+	printk( "-----------KERNEL DQUOT----------------\n");
+	printk( "---- dquot ID	=  %d\n", (int) INT_GET(dqp->q_core.d_id, ARCH_CONVERT));
+	printk( "---- type	=  %s\n", XFS_QM_ISUDQ(dqp) ? "USR" : "GRP");
+	printk( "---- fs	=  0x%p\n", dqp->q_mount);
+	printk( "---- blkno	=  0x%x\n", (int) dqp->q_blkno);
+	printk( "---- boffset	=  0x%x\n", (int) dqp->q_bufoffset);
+	printk( "---- blkhlimit =  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT),
+	       (int) INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT));
+	printk( "---- blkslimit =  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT),
+	       (int)INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT));
+	printk( "---- inohlimit =  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT),
+	       (int)INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT));
+	printk( "---- inoslimit =  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT),
+	       (int)INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT));
+	printk( "---- bcount	=  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT),
+	       (int)INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
+	printk( "---- icount	=  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_icount, ARCH_CONVERT),
+	       (int)INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
+	printk( "---- btimer	=  %d\n", (int)INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT));
+	printk( "---- itimer	=  %d\n", (int)INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT));
+
+	printk( "---------------------------\n");
+}
+#endif
+
+/*
+ * Give the buffer a little push if it is incore and
+ * wait on the flush lock.
+ */
+void
+xfs_qm_dqflock_pushbuf_wait(
+	xfs_dquot_t	*dqp)
+{
+	xfs_buf_t	*bp;
+
+	/*
+	 * Check to see if the dquot has been flushed as a delayed
+	 * write.  If so, grab its buffer and send it
+	 * out immediately.  We'll be able to acquire
+	 * the flush lock when the I/O completes.
+	 */
+	bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
+		    XFS_QI_DQCHUNKLEN(dqp->q_mount),
+		    XFS_INCORE_TRYLOCK);
+	if (bp != NULL) {
+		if (XFS_BUF_ISDELAYWRITE(bp)) {
+			if (XFS_BUF_ISPINNED(bp)) {
+				xfs_log_force(dqp->q_mount,
+					      (xfs_lsn_t)0,
+					      XFS_LOG_FORCE);
+			}
+			xfs_bawrite(dqp->q_mount, bp);
+		} else {
+			xfs_buf_relse(bp);
+		}
+	}
+	xfs_dqflock(dqp);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dquot.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dquot.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dquot.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dquot.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DQUOT_H__
+#define __XFS_DQUOT_H__
+
+/*
+ * Dquots are structures that hold quota information about a user or a group,
+ * much like inodes are for files. In fact, dquots share many characteristics
+ * with inodes. However, dquots can also be a centralized resource, relative
+ * to a collection of inodes. In this respect, dquots share some characteristics
+ * of the superblock.
+ * XFS dquots exploit both of these in their algorithms. They make every attempt
+ * to not be a bottleneck when quotas are on and have minimal impact, if any,
+ * when quotas are off.
+ */
+
+/*
+ * The hash chain headers (hash buckets)
+ */
+typedef struct xfs_dqhash {
+	struct xfs_dquot *qh_next;
+	mutex_t		  qh_lock;
+	uint		  qh_version;	/* ever increasing version */
+	uint		  qh_nelems;	/* number of dquots on the list */
+} xfs_dqhash_t;
+
+typedef struct xfs_dqlink {
+	struct xfs_dquot  *ql_next;	/* forward link */
+	struct xfs_dquot **ql_prevp;	/* pointer to prev ql_next */
+} xfs_dqlink_t;
+
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * This is the marker which is designed to occupy the first few
+ * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
+ * must come first.
+ * This serves as the marker ("sentinel") when we have to restart list
+ * iterations because of locking considerations.
+ */
+typedef struct xfs_dqmarker {
+	struct xfs_dquot*dqm_flnext;	/* link to freelist: must be first */
+	struct xfs_dquot*dqm_flprev;
+	xfs_dqlink_t	 dqm_mplist;	/* link to mount's list of dquots */
+	xfs_dqlink_t	 dqm_hashlist;	/* link to the hash chain */
+	uint		 dqm_flags;	/* various flags (XFS_DQ_*) */
+} xfs_dqmarker_t;
+
+/*
+ * The incore dquot structure
+ */
+typedef struct xfs_dquot {
+	xfs_dqmarker_t	 q_lists;	/* list ptrs, q_flags (marker) */
+	xfs_dqhash_t	*q_hash;	/* the hashchain header */
+	struct xfs_mount*q_mount;	/* filesystem this relates to */
+	struct xfs_trans*q_transp;	/* trans this belongs to currently */
+	uint		 q_nrefs;	/* # active refs from inodes */
+	xfs_daddr_t	 q_blkno;	/* blkno of dquot buffer */
+	int		 q_bufoffset;	/* off of dq in buffer (# dquots) */
+	xfs_fileoff_t	 q_fileoffset;	/* offset in quotas file */
+
+	struct xfs_dquot*q_gdquot;	/* group dquot, hint only */
+	xfs_disk_dquot_t q_core;	/* actual usage & quotas */
+	xfs_dq_logitem_t q_logitem;	/* dquot log item */
+	xfs_qcnt_t	 q_res_bcount;	/* total regular nblks used+reserved */
+	xfs_qcnt_t	 q_res_icount;	/* total inos allocd+reserved */
+	xfs_qcnt_t	 q_res_rtbcount;/* total realtime blks used+reserved */
+	mutex_t		 q_qlock;	/* quota lock */
+	sema_t		 q_flock;	/* flush lock */
+	uint		 q_pincount;	/* pin count for this dquot */
+	sv_t		 q_pinwait;	/* sync var for pinning */
+#ifdef DQUOT_TRACING
+	struct ktrace	*q_trace;	/* trace header structure */
+#endif
+} xfs_dquot_t;
+
+
+#define dq_flnext	q_lists.dqm_flnext
+#define dq_flprev	q_lists.dqm_flprev
+#define dq_mplist	q_lists.dqm_mplist
+#define dq_hashlist	q_lists.dqm_hashlist
+#define dq_flags	q_lists.dqm_flags
+
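+/*
+ * Take an extra (hard) reference on a dquot. The count is not
+ * atomic, so callers must serialize, normally via the dquot lock.
+ */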
+#define XFS_DQHOLD(dqp)		((dqp)->q_nrefs++)
+
+/*
+ * Quota Accounting flags
+ */
+#define XFS_ALL_QUOTA_ACCT	(XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD	(XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD	(XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD)
+#define XFS_ALL_QUOTA_ACTV	(XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
+#define XFS_ALL_QUOTA_ACCT_ENFD (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+				 XFS_GQUOTA_ACCT|XFS_GQUOTA_ENFD)
+
+#define XFS_IS_QUOTA_RUNNING(mp)  ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+
+/*
+ * Quota Limit Enforcement flags
+ */
+#define XFS_IS_QUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_ALL_QUOTA_ENFD)
+#define XFS_IS_UQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_GQUOTA_ENFD)
+
+#ifdef DEBUG
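+/*
+ * Debug-only test for lock ownership: if the trylock succeeds,
+ * nobody held q_qlock, so drop it again and report unlocked.
+ * Note this cannot tell whether it is us or another thread that
+ * holds the lock.
+ */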
+static inline int
+XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
+{
+	if (mutex_trylock(&dqp->q_qlock)) {
+		mutex_unlock(&dqp->q_qlock);
+		return 0;
+	}
+	return 1;
+}
+#endif
+
+/*
+ * The following three routines simply manage the q_flock
+ * semaphore embedded in the dquot.  This semaphore synchronizes
+ * processes attempting to flush the in-core dquot back to disk.
+ */
+#define xfs_dqflock(dqp)	 { psema(&((dqp)->q_flock), PINOD | PRECALC);\
+				   (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
+#define xfs_dqfunlock(dqp)	 { ASSERT(valusema(&((dqp)->q_flock)) <= 0); \
+				   vsema(&((dqp)->q_flock)); \
+				   (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
+
+#define XFS_DQ_PINLOCK(dqp)	   mutex_spinlock( \
+				     &(XFS_DQ_TO_QINF(dqp)->qi_pinlock))
+#define XFS_DQ_PINUNLOCK(dqp, s)   mutex_spinunlock( \
+				     &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s)
+
+#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (valusema(&((dqp)->q_flock)) <= 0)
+#define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
+#define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
+#define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
+#define XFS_DQ_TO_QINF(dqp)	((dqp)->q_mount->m_quotainfo)
+#define XFS_DQ_TO_QIP(dqp)	(XFS_QM_ISUDQ(dqp) ? \
+				 XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
+				 XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
+
+#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
+				     (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
+				     (XFS_IS_GQUOTA_ON((d)->q_mount))))
+#ifdef DQUOT_TRACING
+/*
+ * Dquot Tracing stuff.
+ */
+#define DQUOT_TRACE_SIZE	64
+#define DQUOT_KTRACE_ENTRY	1
+
+#define xfs_dqtrace_entry_ino(a,b,ip) \
+xfs_dqtrace_entry__((a), (b), (void*)__return_address, (ip))
+#define xfs_dqtrace_entry(a,b) \
+xfs_dqtrace_entry__((a), (b), (void*)__return_address, NULL)
+extern void		xfs_dqtrace_entry__(xfs_dquot_t *dqp, char *func,
+					    void *, xfs_inode_t *);
+#else
+#define xfs_dqtrace_entry(a,b)
+#define xfs_dqtrace_entry_ino(a,b,ip)
+#endif
+#ifdef QUOTADEBUG
+extern void		xfs_qm_dqprint(xfs_dquot_t *);
+#else
+#define xfs_qm_dqprint(a)
+#endif
+
+extern xfs_dquot_t	*xfs_qm_dqinit(xfs_mount_t *, xfs_dqid_t, uint);
+extern void		xfs_qm_dqdestroy(xfs_dquot_t *);
+extern int		xfs_qm_dqflush(xfs_dquot_t *, uint);
+extern int		xfs_qm_dqpurge(xfs_dquot_t *, uint);
+extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
+extern int		xfs_qm_dqlock_nowait(xfs_dquot_t *);
+extern int		xfs_qm_dqflock_nowait(xfs_dquot_t *);
+extern void		xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
+extern void		xfs_qm_adjust_dqtimers(xfs_mount_t *,
+					       xfs_disk_dquot_t *);
+extern int		xfs_qm_dqwarn(xfs_disk_dquot_t *, uint);
+
+#endif /* __XFS_DQUOT_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dquot_item.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dquot_item.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dquot_item.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dquot_item.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,675 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_quota_priv.h>
+
+
+/*
+ * returns the number of iovecs needed to log the given dquot item.
+ */
+/* ARGSUSED */
+STATIC uint
+xfs_qm_dquot_logitem_size(
+	xfs_dq_logitem_t	*logitem)
+{
+	/*
+	 * we need only two iovecs, one for the format, one for the real thing
+	 */
+	return (2);
+}
+
+/*
+ * fills in the vector of log iovecs for the given dquot log item.
+ */
+STATIC void
+xfs_qm_dquot_logitem_format(
+	xfs_dq_logitem_t	*logitem,
+	xfs_log_iovec_t		*logvec)
+{
+	ASSERT(logitem);
+	ASSERT(logitem->qli_dquot);
+
+	logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
+	logvec->i_len  = sizeof(xfs_dq_logformat_t);
+	logvec++;
+	logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
+	logvec->i_len  = sizeof(xfs_disk_dquot_t);
+
+	ASSERT(2 == logitem->qli_item.li_desc->lid_size);
+	logitem->qli_format.qlf_size = 2;
+}
+
+/*
+ * Increment the pin count of the given dquot.
+ * This value is protected by the qi_pinlock spinlock in the quotainfo structure.
+ */
+STATIC void
+xfs_qm_dquot_logitem_pin(
+	xfs_dq_logitem_t *logitem)
+{
+	unsigned long	s;
+	xfs_dquot_t *dqp;
+
+	dqp = logitem->qli_dquot;
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	s = XFS_DQ_PINLOCK(dqp);
+	dqp->q_pincount++;
+	XFS_DQ_PINUNLOCK(dqp, s);
+}
+
+/*
+ * Decrement the pin count of the given dquot, and wake up
+ * anyone in xfs_qm_dqunpin_wait() if the count goes to 0.  The dquot
+ * must have been previously pinned with a call to xfs_qm_dquot_logitem_pin().
+ */
+STATIC void
+xfs_qm_dquot_logitem_unpin(
+	xfs_dq_logitem_t *logitem)
+{
+	unsigned long	s;
+	xfs_dquot_t *dqp;
+
+	dqp = logitem->qli_dquot;
+	ASSERT(dqp->q_pincount > 0);
+	s = XFS_DQ_PINLOCK(dqp);
+	dqp->q_pincount--;
+	if (dqp->q_pincount == 0) {
+		sv_broadcast(&dqp->q_pinwait);
+	}
+	XFS_DQ_PINUNLOCK(dqp, s);
+}
+
+/* ARGSUSED */
+STATIC void
+xfs_qm_dquot_logitem_unpin_remove(
+	xfs_dq_logitem_t *logitem,
+	xfs_trans_t	 *tp)
+{
+	xfs_qm_dquot_logitem_unpin(logitem);
+}
+
+/*
+ * Given the logitem, this writes the corresponding dquot entry to disk
+ * asynchronously. This is called with the dquot entry securely locked;
+ * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
+ * at the end.
+ */
+STATIC void
+xfs_qm_dquot_logitem_push(
+	xfs_dq_logitem_t	*logitem)
+{
+	xfs_dquot_t	*dqp;
+
+	dqp = logitem->qli_dquot;
+
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
+
+	/*
+	 * Since we were able to lock the dquot's flush lock and
+	 * we found it on the AIL, the dquot must be dirty.  This
+	 * is because the dquot is removed from the AIL while still
+	 * holding the flush lock in xfs_dqflush_done().  Thus, if
+	 * we found it in the AIL and were able to obtain the flush
+	 * lock without sleeping, then there must not have been
+	 * anyone in the process of flushing the dquot.
+	 */
+	xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+	xfs_dqunlock(dqp);
+}
+
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_qm_dquot_logitem_committed(
+	xfs_dq_logitem_t	*l,
+	xfs_lsn_t		lsn)
+{
+	/*
+	 * We always re-log the entire dquot when it becomes dirty,
+	 * so, the latest copy _is_ the only one that matters.
+	 */
+	return (lsn);
+}
+
+
+/*
+ * This is called to wait for the given dquot to be unpinned.
+ * Most of these pin/unpin routines are plagiarized from inode code.
+ */
+void
+xfs_qm_dqunpin_wait(
+	xfs_dquot_t	*dqp)
+{
+	SPLDECL(s);
+
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	if (dqp->q_pincount == 0) {
+		return;
+	}
+
+	/*
+	 * Give the log a push so we don't wait here too long.
+	 */
+	xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
+	s = XFS_DQ_PINLOCK(dqp);
+	if (dqp->q_pincount == 0) {
+		XFS_DQ_PINUNLOCK(dqp, s);
+		return;
+	}
+	sv_wait(&(dqp->q_pinwait), PINOD,
+		&(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
+}
+
+/*
+ * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
+ * the dquot is locked by us, but the flush lock isn't. So, here we are
+ * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
+ * If so, we want to push it out to help us take this item off the AIL as soon
+ * as possible.
+ *
+ * We must not be holding the AIL_LOCK at this point. Calling incore() to
+ * search the buffercache can be a time consuming thing, and AIL_LOCK is a
+ * spinlock.
+ */
+STATIC void
+xfs_qm_dquot_logitem_pushbuf(
+	xfs_dq_logitem_t    *qip)
+{
+	xfs_dquot_t	*dqp;
+	xfs_mount_t	*mp;
+	xfs_buf_t	*bp;
+	uint		dopush;
+
+	dqp = qip->qli_dquot;
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	/*
+	 * The qli_pushbuf_flag keeps others from
+	 * trying to duplicate our effort.
+	 */
+	ASSERT(qip->qli_pushbuf_flag != 0);
+	ASSERT(qip->qli_push_owner == get_thread_id());
+
+	/*
+	 * If flushlock isn't locked anymore, chances are that the
+	 * dquot flush completed and the dquot was taken off the AIL.
+	 * So, just get out.
+	 */
+	if ((valusema(&(dqp->q_flock)) > 0)  ||
+	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
+		qip->qli_pushbuf_flag = 0;
+		xfs_dqunlock(dqp);
+		return;
+	}
+	mp = dqp->q_mount;
+	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
+		    XFS_QI_DQCHUNKLEN(mp),
+		    XFS_INCORE_TRYLOCK);
+	if (bp != NULL) {
+		if (XFS_BUF_ISDELAYWRITE(bp)) {
+			dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
+				  (valusema(&(dqp->q_flock)) <= 0));
+			qip->qli_pushbuf_flag = 0;
+			xfs_dqunlock(dqp);
+
+			if (XFS_BUF_ISPINNED(bp)) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE);
+			}
+			if (dopush) {
+#ifdef XFSRACEDEBUG
+				delay_for_intr();
+				delay(300);
+#endif
+				xfs_bawrite(mp, bp);
+			} else {
+				xfs_buf_relse(bp);
+			}
+		} else {
+			qip->qli_pushbuf_flag = 0;
+			xfs_dqunlock(dqp);
+			xfs_buf_relse(bp);
+		}
+		return;
+	}
+
+	qip->qli_pushbuf_flag = 0;
+	xfs_dqunlock(dqp);
+}
+
+/*
+ * This is called to attempt to lock the dquot associated with this
+ * dquot log item.  Don't sleep on the dquot lock or the flush lock.
+ * If the flush lock is already held, indicating that the dquot has
+ * been or is in the process of being flushed, then see if we can
+ * find the dquot's buffer in the buffer cache without sleeping.  If
+ * we can and it is marked delayed write, then we want to send it out.
+ * We delay doing so until the push routine, though, to avoid sleeping
+ * in any device strategy routines.
+ */
+STATIC uint
+xfs_qm_dquot_logitem_trylock(
+	xfs_dq_logitem_t	*qip)
+{
+	xfs_dquot_t		*dqp;
+	uint			retval;
+
+	dqp = qip->qli_dquot;
+	if (dqp->q_pincount > 0)
+		return (XFS_ITEM_PINNED);
+
+	if (! xfs_qm_dqlock_nowait(dqp))
+		return (XFS_ITEM_LOCKED);
+
+	retval = XFS_ITEM_SUCCESS;
+	if (! xfs_qm_dqflock_nowait(dqp)) {
+		/*
+		 * The dquot is already being flushed.	It may have been
+		 * flushed delayed write, however, and we don't want to
+		 * get stuck waiting for that to complete.  So, we want to check
+		 * to see if we can lock the dquot's buffer without sleeping.
+		 * If we can and it is marked for delayed write, then we
+		 * hold it and send it out from the push routine.  We don't
+		 * want to do that now since we might sleep in the device
+		 * strategy routine.  We also don't want to grab the buffer lock
+		 * here because we'd like not to call into the buffer cache
+		 * while holding the AIL_LOCK.
+		 * Make sure to only return PUSHBUF if we set pushbuf_flag
+		 * ourselves.  If someone else is doing it then we don't
+		 * want to go to the push routine and duplicate their efforts.
+		 */
+		if (qip->qli_pushbuf_flag == 0) {
+			qip->qli_pushbuf_flag = 1;
+			ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
+#ifdef DEBUG
+			qip->qli_push_owner = get_thread_id();
+#endif
+			/*
+			 * The dquot is left locked.
+			 */
+			retval = XFS_ITEM_PUSHBUF;
+		} else {
+			retval = XFS_ITEM_FLUSHING;
+			xfs_dqunlock_nonotify(dqp);
+		}
+	}
+
+	ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
+	return (retval);
+}
+
+
+/*
+ * Unlock the dquot associated with the log item.
+ * Clear the fields of the dquot and dquot log item that
+ * are specific to the current transaction.  Unlike inodes, dquots
+ * are never held at commit, so we always unlock here.
+ */
+STATIC void
+xfs_qm_dquot_logitem_unlock(
+	xfs_dq_logitem_t    *ql)
+{
+	xfs_dquot_t	*dqp;
+
+	ASSERT(ql != NULL);
+	dqp = ql->qli_dquot;
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	/*
+	 * Clear the transaction pointer in the dquot
+	 */
+	dqp->q_transp = NULL;
+
+	/*
+	 * dquots are never 'held' from getting unlocked at the end of
+	 * a transaction.  Their locking and unlocking is hidden inside the
+	 * transaction layer, within trans_commit. Hence, no LI_HOLD flag
+	 * for the logitem.
+	 */
+	xfs_dqunlock(dqp);
+}
+
+
+/*
+ * The transaction with the dquot locked has aborted.  The dquot
+ * must not be dirty within the transaction.  We simply unlock just
+ * as if the transaction had been cancelled.
+ */
+STATIC void
+xfs_qm_dquot_logitem_abort(
+	xfs_dq_logitem_t    *ql)
+{
+	xfs_qm_dquot_logitem_unlock(ql);
+}
+
+/*
+ * This needs to stamp an lsn into the dquot, I think.
+ * RPCs that look at user dquots would then have to
+ * push on the dependency recorded in the dquot.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_qm_dquot_logitem_committing(
+	xfs_dq_logitem_t	*l,
+	xfs_lsn_t		lsn)
+{
+	return;
+}
+
+
+/*
+ * This is the ops vector for dquots
+ */
+struct xfs_item_ops xfs_dquot_item_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_qm_dquot_logitem_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
+					xfs_qm_dquot_logitem_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))
+					xfs_qm_dquot_logitem_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_dquot_logitem_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort,
+	.iop_pushbuf	= (void(*)(xfs_log_item_t*))
+					xfs_qm_dquot_logitem_pushbuf,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_dquot_logitem_committing
+};
+
+/*
+ * Initialize the dquot log item for a newly allocated dquot.
+ * The dquot isn't locked at this point, but it isn't on any of the lists
+ * either, so we don't care.
+ */
+void
+xfs_qm_dquot_logitem_init(
+	struct xfs_dquot *dqp)
+{
+	xfs_dq_logitem_t  *lp;
+	lp = &dqp->q_logitem;
+
+	lp->qli_item.li_type = XFS_LI_DQUOT;
+	lp->qli_item.li_ops = &xfs_dquot_item_ops;
+	lp->qli_item.li_mountp = dqp->q_mount;
+	lp->qli_dquot = dqp;
+	lp->qli_format.qlf_type = XFS_LI_DQUOT;
+	lp->qli_format.qlf_id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT);
+	lp->qli_format.qlf_blkno = dqp->q_blkno;
+	lp->qli_format.qlf_len = 1;
+	/*
+	 * This is just the offset of this dquot within its buffer
+	 * (which is currently 1 FSB and probably won't change).
+	 * Hence 32 bits for this offset should be just fine.
+	 * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
+	 * here, and recompute it at recovery time.
+	 */
+	lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
+}
+
+/*------------------  QUOTAOFF LOG ITEMS  -------------------*/
+
+/*
+ * This returns the number of iovecs needed to log the given quotaoff item.
+ * We only need 1 iovec for a quotaoff item.  It just logs the
+ * quotaoff_log_format structure.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
+{
+	return (1);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given quotaoff log item. We use only 1 iovec, and we point that
+ * at the quotaoff_log_format structure embedded in the quotaoff item.
+ * All we assert here is that the item carries the expected
+ * quotaoff type.
+ */
+STATIC void
+xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t	*qf,
+			   xfs_log_iovec_t	*log_vector)
+{
+	ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF);
+
+	log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
+	log_vector->i_len = sizeof(xfs_qoff_logitem_t);
+	qf->qql_format.qf_size = 1;
+}
+
+
+/*
+ * Pinning has no meaning for a quotaoff item, so just return.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
+{
+	return;
+}
+
+
+/*
+ * Since pinning has no meaning for a quotaoff item, unpinning does
+ * not either.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
+{
+	return;
+}
+
+/*ARGSUSED*/
+STATIC void
+xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
+{
+	return;
+}
+
+/*
+ * Quotaoff items have no locking; return XFS_ITEM_LOCKED so that
+ * the AIL push code leaves them alone.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
+{
+	return XFS_ITEM_LOCKED;
+}
+
+/*
+ * Quotaoff items have no locking, so there is nothing
+ * for us to unlock here.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf)
+{
+	return;
+}
+
+/*
+ * The quotaoff-start-item is logged only once and cannot be moved in the log,
+ * so simply return the lsn at which it's been logged.
+ */
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn)
+{
+	return (lsn);
+}
+
+/*
+ * The transaction of which this QUOTAOFF is a part has been aborted.
+ * Just clean up after ourselves.
+ * This should never happen in the case of qoffend logitems, should it? XXX
+ */
+STATIC void
+xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf)
+{
+	kmem_free(qf, sizeof(xfs_qoff_logitem_t));
+}
+
+/*
+ * There isn't much you can do to push on a quotaoff item.  It is simply
+ * stuck waiting for the log to be flushed to disk.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf)
+{
+	return;
+}
+
+
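+/*
+ * The quotaoff-end logitem has been committed: the quotaoff is now
+ * permanent in the log, so the paired quotaoff-start item can be
+ * pulled out of the AIL and both items freed.
+ */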
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_qm_qoffend_logitem_committed(
+	xfs_qoff_logitem_t *qfe,
+	xfs_lsn_t lsn)
+{
+	xfs_qoff_logitem_t	*qfs;
+	SPLDECL(s);
+
+	qfs = qfe->qql_start_lip;
+	AIL_LOCK(qfs->qql_item.li_mountp,s);
+	/*
+	 * Delete the qoff-start logitem from the AIL.
+	 * xfs_trans_delete_ail() drops the AIL lock.
+	 */
+	xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs, s);
+	kmem_free(qfs, sizeof(xfs_qoff_logitem_t));
+	kmem_free(qfe, sizeof(xfs_qoff_logitem_t));
+	return (xfs_lsn_t)-1;
+}
+
+/*
+ * XXX rcc - don't know quite what to do with this.  I think we can
+ * just ignore it.  The only time that isn't the case is if we allow
+ * the client to somehow see that quotas have been turned off, in which
+ * case we can't allow that to get back until the quotaoff hits the disk.
+ * So how would that happen?  Also, do we need different routines for
+ * quotaoff start and quotaoff end?  I suspect the answer is yes but
+ * to be sure, I need to look at the recovery code and see how quota off
+ * recovery is handled (do we roll forward or back or do something else).
+ * If we roll forwards or backwards, then we need two separate routines,
+ * one that does nothing and one that stamps in the lsn that matters
+ * (truly makes the quotaoff irrevocable).  If we do something else,
+ * then maybe we don't need two.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
+{
+	return;
+}
+
+/* ARGSUSED */
+STATIC void
+xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
+{
+	return;
+}
+
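+/*
+ * This is the ops vector for quotaoff-end log items; it differs from
+ * the quotaoff-start vector below only in its committed/committing
+ * routines.
+ */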
+struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_qm_qoff_logitem_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
+					xfs_qm_qoff_logitem_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_qoffend_logitem_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
+	.iop_pushbuf	= NULL,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_qoffend_logitem_committing
+};
+
+/*
+ * This is the ops vector shared by all quotaoff-start log items.
+ */
+struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_qm_qoff_logitem_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
+					xfs_qm_qoff_logitem_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_qoff_logitem_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
+	.iop_pushbuf	= NULL,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_qoff_logitem_committing
+};
+
+/*
+ * Allocate and initialize a quotaoff item of the correct quota type(s).
+ */
+xfs_qoff_logitem_t *
+xfs_qm_qoff_logitem_init(
+	struct xfs_mount *mp,
+	xfs_qoff_logitem_t *start,
+	uint flags)
+{
+	xfs_qoff_logitem_t	*qf;
+
+	qf = (xfs_qoff_logitem_t*)
+		kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
+
+	qf->qql_item.li_type = XFS_LI_QUOTAOFF;
+	if (start)
+		qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
+	else
+		qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
+	qf->qql_item.li_mountp = mp;
+	qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
+	qf->qql_format.qf_flags = flags;
+	qf->qql_start_lip = start;
+	return (qf);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dquot_item.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dquot_item.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_dquot_item.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_dquot_item.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DQUOT_ITEM_H__
+#define __XFS_DQUOT_ITEM_H__
+
+/*
+ * These are the structures used to lay out dquots and quotaoff
+ * records on the log. Quite similar to those of inodes.
+ */
+
+/*
+ * log format struct for dquots.
+ * The first two fields must be the type and size fitting into
+ * 32 bits: the log_recovery code assumes that.
+ */
+typedef struct xfs_dq_logformat {
+	__uint16_t		qlf_type;      /* dquot log item type */
+	__uint16_t		qlf_size;      /* size of this item */
+	xfs_dqid_t		qlf_id;	       /* usr/grp id number : 32 bits */
+	__int64_t		qlf_blkno;     /* blkno of dquot buffer */
+	__int32_t		qlf_len;       /* len of dquot buffer */
+	__uint32_t		qlf_boffset;   /* off of dquot in buffer */
+} xfs_dq_logformat_t;
+
+/*
+ * log format struct for QUOTAOFF records.
+ * The first two fields must be the type and size fitting into
+ * 32 bits: the log_recovery code assumes that.
+ * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
+ * to the first and ensures that the first logitem is taken out of the AIL
+ * only when the last one is securely committed.
+ */
+typedef struct xfs_qoff_logformat {
+	unsigned short		qf_type;	/* quotaoff log item type */
+	unsigned short		qf_size;	/* size of this item */
+	unsigned int		qf_flags;	/* USR and/or GRP */
+	char			qf_pad[12];	/* padding for future */
+} xfs_qoff_logformat_t;
+
+
+#ifdef __KERNEL__
+
+struct xfs_dquot;
+struct xfs_trans;
+struct xfs_mount;
+typedef struct xfs_dq_logitem {
+	xfs_log_item_t		 qli_item;	   /* common portion */
+	struct xfs_dquot	*qli_dquot;	   /* dquot ptr */
+	xfs_lsn_t		 qli_flush_lsn;	   /* lsn at last flush */
+	unsigned short		 qli_pushbuf_flag; /* one bit used in push_ail */
+#ifdef DEBUG
+	uint64_t		 qli_push_owner;
+#endif
+	xfs_dq_logformat_t	 qli_format;	   /* logged structure */
+} xfs_dq_logitem_t;
+
+
+typedef struct xfs_qoff_logitem {
+	xfs_log_item_t		 qql_item;	/* common portion */
+	struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
+	xfs_qoff_logformat_t	 qql_format;	/* logged structure */
+} xfs_qoff_logitem_t;
+
+
+extern void		   xfs_qm_dquot_logitem_init(struct xfs_dquot *);
+extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *,
+						    xfs_qoff_logitem_t *, uint);
+extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *,
+						   xfs_qoff_logitem_t *, uint);
+extern void		   xfs_trans_log_quotaoff_item(struct xfs_trans *,
+						       xfs_qoff_logitem_t *);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_DQUOT_ITEM_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_error.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_error.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_error.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_error.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+#ifdef DEBUG
+
+int	xfs_etrap[XFS_ERROR_NTRAP] = {
+	0,
+};
+
+int
+xfs_error_trap(int e)
+{
+	int i;
+
+	if (!e)
+		return 0;
+	for (i = 0; i < XFS_ERROR_NTRAP; i++) {
+		if (xfs_etrap[i] == 0)
+			break;
+		if (e != xfs_etrap[i])
+			continue;
+		cmn_err(CE_NOTE, "xfs_error_trap: error %d", e);
+		debug_stop_all_cpus((void *)-1LL);
+		BUG();
+		break;
+	}
+	return e;
+}
+#endif
+
+#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
+
+int	xfs_etest[XFS_NUM_INJECT_ERROR];
+int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
+char *	xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
+
+void
+xfs_error_test_init(void)
+{
+	bzero(xfs_etest, sizeof(xfs_etest));
+	bzero(xfs_etest_fsid, sizeof(xfs_etest_fsid));
+	bzero(xfs_etest_fsname, sizeof(xfs_etest_fsname));
+}
+
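+/*
+ * Called via the XFS_TEST_ERROR() macro: decide whether to inject
+ * the error tagged for this filesystem, firing roughly once every
+ * 'randfactor' calls.
+ */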
+int
+xfs_error_test(int error_tag, int *fsidp, char *expression,
+	       int line, char *file, unsigned long randfactor)
+{
+	int i;
+	int64_t fsid;
+
+	if (random() % randfactor)
+		return 0;
+
+	bcopy(fsidp, &fsid, sizeof(fsid_t));
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
+		if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
+			cmn_err(CE_WARN,
+	"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"\n",
+				expression, file, line, xfs_etest_fsname[i]);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+int
+xfs_errortag_add(int error_tag, xfs_mount_t *mp)
+{
+	int i;
+	int len;
+	int64_t fsid;
+
+	bcopy(mp->m_fixedfsid, &fsid, sizeof(fsid_t));
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
+		if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
+			cmn_err(CE_WARN, "XFS error tag #%d already on", error_tag);
+			return 0;
+		}
+	}
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
+		if (xfs_etest[i] == 0) {
+			cmn_err(CE_WARN, "Turned on XFS error tag #%d",
+				error_tag);
+			xfs_etest[i] = error_tag;
+			xfs_etest_fsid[i] = fsid;
+			len = strlen(mp->m_fsname);
+			xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
+			strcpy(xfs_etest_fsname[i], mp->m_fsname);
+			return 0;
+		}
+	}
+
+	cmn_err(CE_WARN, "error tag overflow, too many turned on");
+
+	return 1;
+}
+
+int
+xfs_errortag_clear(int error_tag, xfs_mount_t *mp)
+{
+	int i;
+	int64_t fsid;
+
+	bcopy(mp->m_fixedfsid, &fsid, sizeof(fsid_t));
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
+		if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
+			xfs_etest[i] = 0;
+			xfs_etest_fsid[i] = 0LL;
+			kmem_free(xfs_etest_fsname[i],
+				  strlen(xfs_etest_fsname[i]) + 1);
+			xfs_etest_fsname[i] = NULL;
+			cmn_err(CE_WARN, "Cleared XFS error tag #%d",
+				error_tag);
+			return 0;
+		}
+	}
+
+	cmn_err(CE_WARN, "XFS error tag %d not on", error_tag);
+
+	return 1;
+}
+
+int
+xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud)
+{
+	int i;
+	int cleared = 0;
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
+		if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
+		     xfs_etest[i] != 0) {
+			cleared = 1;
+			cmn_err(CE_WARN, "Clearing XFS error tag #%d",
+				xfs_etest[i]);
+			xfs_etest[i] = 0;
+			xfs_etest_fsid[i] = 0LL;
+			kmem_free(xfs_etest_fsname[i],
+				  strlen(xfs_etest_fsname[i]) + 1);
+			xfs_etest_fsname[i] = NULL;
+		}
+	}
+
+	if (loud || cleared)
+		cmn_err(CE_WARN,
+			"Cleared all XFS error tags for filesystem \"%s\"",
+			fsname);
+
+	return 0;
+}
+
+int
+xfs_errortag_clearall(xfs_mount_t *mp)
+{
+	int64_t fsid;
+
+	bcopy(mp->m_fixedfsid, &fsid, sizeof(fsid_t));
+
+	return xfs_errortag_clearall_umount(fsid, mp->m_fsname, 1);
+}
+#endif /* DEBUG || INDUCE_IO_ERROR */
+
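+/*
+ * Prefix the message with the filesystem name before handing it to
+ * icmn_err(). The extra 16 bytes in the length cover the constant
+ * text of the prefix and the terminating NUL.
+ */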
+static void
+xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
+{
+	char	*newfmt;
+	int	len = 16 + mp->m_fsname_len + strlen(fmt);
+
+	newfmt = kmem_alloc(len, KM_SLEEP);
+	sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
+	icmn_err(level, newfmt, ap);
+	kmem_free(newfmt, len);
+}
+
+void
+xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	xfs_fs_vcmn_err(level, mp, fmt, ap);
+	va_end(ap);
+}
+
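+/*
+ * Like xfs_fs_cmn_err(), but promote a CE_ALERT message to CE_PANIC
+ * when the caller's panic_tag is set in the global xfs_panic_mask
+ * tunable.
+ */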
+void
+xfs_cmn_err(uint64_t panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
+{
+	va_list ap;
+
+	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
+	    && (level & CE_ALERT)) {
+		level &= ~CE_ALERT;
+		level |= CE_PANIC;
+		cmn_err(CE_ALERT, "Transforming an alert into a panic.");
+	}
+	va_start(ap, fmt);
+	xfs_fs_vcmn_err(level, mp, fmt, ap);
+	va_end(ap);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_error.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_error.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_error.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_error.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ERROR_H__
+#define __XFS_ERROR_H__
+
+#define prdev(fmt,dev,args...) \
+	printk("XFS: device 0x%x - " fmt "\n", dev, ## args)
+
+#define XFS_ERECOVER	1	/* Failure to recover log */
+#define XFS_ELOGSTAT	2	/* Failure to stat log in user space */
+#define XFS_ENOLOGSPACE 3	/* Reservation too large */
+#define XFS_ENOTSUP	4	/* Operation not supported */
+#define XFS_ENOLSN	5	/* Can't find the lsn you asked for */
+#define XFS_ENOTFOUND	6
+#define XFS_ENOTXFS	7	/* Not XFS filesystem */
+
+#ifdef DEBUG
+#define XFS_ERROR_NTRAP 10
+extern int	xfs_etrap[XFS_ERROR_NTRAP];
+extern int	xfs_error_trap(int);
+#define XFS_ERROR(e)	xfs_error_trap(e)
+#else
+#define XFS_ERROR(e)	(e)
+#endif
+
+
+/*
+ * Error injection tags - the labels can be anything you want,
+ * but each tag must have its own unique number.
+ */
+
+#define XFS_ERRTAG_NOERROR				0
+#define XFS_ERRTAG_IFLUSH_1				1
+#define XFS_ERRTAG_IFLUSH_2				2
+#define XFS_ERRTAG_IFLUSH_3				3
+#define XFS_ERRTAG_IFLUSH_4				4
+#define XFS_ERRTAG_IFLUSH_5				5
+#define XFS_ERRTAG_IFLUSH_6				6
+#define XFS_ERRTAG_DA_READ_BUF				7
+#define XFS_ERRTAG_BTREE_CHECK_LBLOCK			8
+#define XFS_ERRTAG_BTREE_CHECK_SBLOCK			9
+#define XFS_ERRTAG_ALLOC_READ_AGF			10
+#define XFS_ERRTAG_IALLOC_READ_AGI			11
+#define XFS_ERRTAG_ITOBP_INOTOBP			12
+#define XFS_ERRTAG_IUNLINK				13
+#define XFS_ERRTAG_IUNLINK_REMOVE			14
+#define XFS_ERRTAG_DIR_INO_VALIDATE			15
+#define XFS_ERRTAG_BULKSTAT_READ_CHUNK			16
+#define XFS_ERRTAG_IODONE_IOERR				17
+#define XFS_ERRTAG_STRATREAD_IOERR			18
+#define XFS_ERRTAG_STRATCMPL_IOERR			19
+#define XFS_ERRTAG_DIOWRITE_IOERR			20
+#define XFS_ERRTAG_MAX					21
+
+/*
+ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
+ */
+#define XFS_RANDOM_DEFAULT				100
+#define XFS_RANDOM_IFLUSH_1				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_2				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_3				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_4				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_5				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_6				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DA_READ_BUF				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BTREE_CHECK_LBLOCK			(XFS_RANDOM_DEFAULT/4)
+#define XFS_RANDOM_BTREE_CHECK_SBLOCK			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ALLOC_READ_AGF			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IALLOC_READ_AGI			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ITOBP_INOTOBP			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK_REMOVE			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DIR_INO_VALIDATE			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BULKSTAT_READ_CHUNK			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IODONE_IOERR				(XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATREAD_IOERR			(XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATCMPL_IOERR			(XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_DIOWRITE_IOERR			(XFS_RANDOM_DEFAULT/10)
+
+#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
+extern int	xfs_error_test(int, int *, char *, int, char *, unsigned long);
+void xfs_error_test_init(void);
+
+#define XFS_NUM_INJECT_ERROR				10
+
+#ifdef __ANSI_CPP__
+#define XFS_TEST_ERROR(expr, mp, tag, rf)		\
+	((expr) || \
+	 xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \
+			 (rf)))
+#else
+#define XFS_TEST_ERROR(expr, mp, tag, rf)		\
+	((expr) || \
+	 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
+			(rf)))
+#endif /* __ANSI_CPP__ */
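+
+/*
+ * Illustrative sketch of a typical call site (hypothetical fragment):
+ * an on-disk sanity check is wrapped in XFS_TEST_ERROR so that, on a
+ * DEBUG kernel with the matching tag armed via xfs_errortag_add(),
+ * the check also "fails" roughly once per XFS_RANDOM_* evaluations:
+ *
+ *	if (XFS_TEST_ERROR(magic != XFS_DINODE_MAGIC, mp,
+ *			   XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
+ *		(error handling path under test)
+ *	}
+ *
+ * On non-DEBUG builds the macro collapses to the bare expression.
+ */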
+
+int		xfs_errortag_add(int error_tag, xfs_mount_t *mp);
+int		xfs_errortag_clear(int error_tag, xfs_mount_t *mp);
+
+int		xfs_errortag_clearall(xfs_mount_t *mp);
+int		xfs_errortag_clearall_umount(int64_t fsid, char *fsname,
+						int loud);
+#else
+#define XFS_TEST_ERROR(expr, mp, tag, rf)	(expr)
+#define xfs_errortag_add(tag, mp)		(ENOSYS)
+#define xfs_errortag_clearall(mp)		(ENOSYS)
+#endif /* (DEBUG || INDUCE_IO_ERROR) */
+
+/*
+ * XFS panic tags -- allow a call to xfs_cmn_err() to be turned into
+ *			a panic by setting xfs_panic_mask in the
+ *			stune file.
+ */
+#define		XFS_NO_PTAG			0LL
+#define		XFS_PTAG_IFLUSH			0x0000000000000001LL
+#define		XFS_PTAG_LOGRES			0x0000000000000002LL
+#define		XFS_PTAG_AILDELETE		0x0000000000000004LL
+
+struct xfs_mount;
+/* PRINTFLIKE4 */
+void		xfs_cmn_err(uint64_t panic_tag, int level, struct xfs_mount *mp,
+			    char *fmt, ...);
+/* PRINTFLIKE3 */
+void		xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...);
+
+#endif	/* __XFS_ERROR_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_extfree_item.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_extfree_item.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_extfree_item.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_extfree_item.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains the implementation of the xfs_efi_log_item
+ * and xfs_efd_log_item items.
+ */
+
+#include <xfs.h>
+
+
+kmem_zone_t	*xfs_efi_zone;
+kmem_zone_t	*xfs_efd_zone;
+
+STATIC void	xfs_efi_item_unlock(xfs_efi_log_item_t *);
+STATIC void	xfs_efi_item_abort(xfs_efi_log_item_t *);
+STATIC void	xfs_efd_item_abort(xfs_efd_log_item_t *);
+
+
+
+/*
+ * This returns the number of iovecs needed to log the given efi item.
+ * We only need 1 iovec for an efi item.  It just logs the efi_log_format
+ * structure.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_efi_item_size(xfs_efi_log_item_t *efip)
+{
+	return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given efi log item. We use only 1 iovec, and we point that
+ * at the efi_log_format structure embedded in the efi item.
+ * It is at this point that we assert that all of the extent
+ * slots in the efi item have been filled.
+ */
+STATIC void
+xfs_efi_item_format(xfs_efi_log_item_t	*efip,
+		    xfs_log_iovec_t	*log_vector)
+{
+	uint	size;
+
+	ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
+
+	efip->efi_format.efi_type = XFS_LI_EFI;
+
+	size = sizeof(xfs_efi_log_format_t);
+	size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
+	efip->efi_format.efi_size = 1;
+
+	log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
+	log_vector->i_len = size;
+	ASSERT(size >= sizeof(xfs_efi_log_format_t));
+}
+
+
+/*
+ * Pinning has no meaning for an efi item, so just return.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efi_item_pin(xfs_efi_log_item_t *efip)
+{
+	return;
+}
+
+
+/*
+ * While EFIs cannot really be pinned, the unpin operation is the
+ * last place at which the EFI is manipulated during a transaction.
+ * Here we coordinate with xfs_efi_cancel() to determine who gets to
+ * free the EFI.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
+{
+	int		nexts;
+	int		size;
+	xfs_mount_t	*mp;
+	SPLDECL(s);
+
+	mp = efip->efi_item.li_mountp;
+	AIL_LOCK(mp, s);
+	if (efip->efi_flags & XFS_EFI_CANCELED) {
+		/*
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
+
+		nexts = efip->efi_format.efi_nextents;
+		if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+			size = sizeof(xfs_efi_log_item_t);
+			size += (nexts - 1) * sizeof(xfs_extent_t);
+			kmem_free(efip, size);
+		} else {
+			kmem_zone_free(xfs_efi_zone, efip);
+		}
+	} else {
+		efip->efi_flags |= XFS_EFI_COMMITTED;
+		AIL_UNLOCK(mp, s);
+	}
+
+	return;
+}
+
+/*
+ * Like unpin, except we also have to clear the transaction descriptor
+ * pointing to the log item if we free the item.  This routine duplicates
+ * unpin because efi_flags is protected by the AIL lock.  Freeing the
+ * descriptor and then calling unpin would force us to drop the AIL
+ * lock, which would open up a race condition.
+ */
+STATIC void
+xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
+{
+	int		nexts;
+	int		size;
+	xfs_mount_t	*mp;
+	xfs_log_item_desc_t	*lidp;
+	SPLDECL(s);
+
+	mp = efip->efi_item.li_mountp;
+	AIL_LOCK(mp, s);
+	if (efip->efi_flags & XFS_EFI_CANCELED) {
+		/*
+		 * free the xaction descriptor pointing to this item
+		 */
+		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
+		xfs_trans_free_item(tp, lidp);
+		/*
+		 * pull the item off the AIL.
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
+		/*
+		 * now free the item itself
+		 */
+		nexts = efip->efi_format.efi_nextents;
+		if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+			size = sizeof(xfs_efi_log_item_t);
+			size += (nexts - 1) * sizeof(xfs_extent_t);
+			kmem_free(efip, size);
+		} else {
+			kmem_zone_free(xfs_efi_zone, efip);
+		}
+	} else {
+		efip->efi_flags |= XFS_EFI_COMMITTED;
+		AIL_UNLOCK(mp, s);
+	}
+
+	return;
+}
+
+/*
+ * Efi items have no locking or pushing.  However, since EFIs are
+ * pulled from the AIL when their corresponding EFDs are committed
+ * to disk, their situation is very similar to being pinned.  Return
+ * XFS_ITEM_PINNED so that the caller will eventually flush the log.
+ * This should help in getting the EFI out of the AIL.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
+{
+	return XFS_ITEM_PINNED;
+}
+
+/*
+ * Efi items have no locking; on unlock we need only handle the aborted
+ * case, freeing the EFI when its transaction has been aborted.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
+{
+	if (efip->efi_item.li_flags & XFS_LI_ABORTED)
+		xfs_efi_item_abort(efip);
+	return;
+}
+
+/*
+ * The EFI is logged only once and cannot be moved in the log, so
+ * simply return the lsn at which it's been logged.  The canceled
+ * flag is not paid any attention here.	 Checking for that is delayed
+ * until the EFI is unpinned.
+ */
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+{
+	return lsn;
+}
+
+/*
+ * This is called when the transaction logging the EFI is aborted.
+ * Free up the EFI and return.	No need to clean up the slot for
+ * the item in the transaction.	 That was done by the unpin code
+ * which is called prior to this routine in the abort/fs-shutdown path.
+ */
+STATIC void
+xfs_efi_item_abort(xfs_efi_log_item_t *efip)
+{
+	int	nexts;
+	int	size;
+
+	nexts = efip->efi_format.efi_nextents;
+	if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+		size = sizeof(xfs_efi_log_item_t);
+		size += (nexts - 1) * sizeof(xfs_extent_t);
+		kmem_free(efip, size);
+	} else {
+		kmem_zone_free(xfs_efi_zone, efip);
+	}
+	return;
+}
+
+/*
+ * There isn't much you can do to push on an efi item.	It is simply
+ * stuck waiting for all of its corresponding efd items to be
+ * committed to disk.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efi_item_push(xfs_efi_log_item_t *efip)
+{
+	return;
+}
+
+/*
+ * The EFI dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+{
+	return;
+}
+
+/*
+ * This is the ops vector shared by all efi log items.
+ */
+struct xfs_item_ops xfs_efi_item_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efi_item_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_efi_item_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
+					xfs_efi_item_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efi_item_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_efi_item_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efi_item_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_efi_item_abort,
+	.iop_pushbuf	= NULL,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_efi_item_committing
+};
+
+
+/*
+ * Allocate and initialize an efi item with the given number of extents.
+ */
+xfs_efi_log_item_t *
+xfs_efi_init(xfs_mount_t	*mp,
+	     uint		nextents)
+
+{
+	xfs_efi_log_item_t	*efip;
+	uint			size;
+
+	ASSERT(nextents > 0);
+	if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
+		size = (uint)(sizeof(xfs_efi_log_item_t) +
+			((nextents - 1) * sizeof(xfs_extent_t)));
+		efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+	} else {
+		efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone,
+							     KM_SLEEP);
+	}
+
+	efip->efi_item.li_type = XFS_LI_EFI;
+	efip->efi_item.li_ops = &xfs_efi_item_ops;
+	efip->efi_item.li_mountp = mp;
+	efip->efi_format.efi_nextents = nextents;
+	efip->efi_format.efi_id = (__psint_t)(void*)efip;
+
+	return (efip);
+}
+
+/*
+ * This is called by the efd item code below to release references to
+ * the given efi item.	Each efd calls this with the number of
+ * extents that it has logged, and when the sum of these reaches
+ * the total number of extents logged by this efi item we can free
+ * the efi item.
+ *
+ * Freeing the efi item requires that we remove it from the AIL.
+ * We'll use the AIL lock to protect our counters as well as
+ * the removal from the AIL.
+ */
+void
+xfs_efi_release(xfs_efi_log_item_t	*efip,
+		uint			nextents)
+{
+	xfs_mount_t	*mp;
+	int		extents_left;
+	uint		size;
+	int		nexts;
+	SPLDECL(s);
+
+	mp = efip->efi_item.li_mountp;
+	ASSERT(efip->efi_next_extent > 0);
+	ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
+
+	AIL_LOCK(mp, s);
+	ASSERT(efip->efi_next_extent >= nextents);
+	efip->efi_next_extent -= nextents;
+	extents_left = efip->efi_next_extent;
+	if (extents_left == 0) {
+		/*
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
+	} else {
+		AIL_UNLOCK(mp, s);
+	}
+
+	if (extents_left == 0) {
+		nexts = efip->efi_format.efi_nextents;
+		if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+			size = sizeof(xfs_efi_log_item_t);
+			size += (nexts - 1) * sizeof(xfs_extent_t);
+			kmem_free(efip, size);
+		} else {
+			kmem_zone_free(xfs_efi_zone, efip);
+		}
+	}
+}
+
+/*
+ * This is called when the transaction that should be committing the
+ * EFD corresponding to the given EFI is aborted.  The committed and
+ * canceled flags are used to coordinate the freeing of the EFI and
+ * the references by the transaction that committed it.
+ */
+STATIC void
+xfs_efi_cancel(
+	xfs_efi_log_item_t	*efip)
+{
+	int		nexts;
+	int		size;
+	xfs_mount_t	*mp;
+	SPLDECL(s);
+
+	mp = efip->efi_item.li_mountp;
+	AIL_LOCK(mp, s);
+	if (efip->efi_flags & XFS_EFI_COMMITTED) {
+		/*
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
+
+		nexts = efip->efi_format.efi_nextents;
+		if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+			size = sizeof(xfs_efi_log_item_t);
+			size += (nexts - 1) * sizeof(xfs_extent_t);
+			kmem_free(efip, size);
+		} else {
+			kmem_zone_free(xfs_efi_zone, efip);
+		}
+	} else {
+		efip->efi_flags |= XFS_EFI_CANCELED;
+		AIL_UNLOCK(mp, s);
+	}
+
+	return;
+}
+
+
+
+
+
+/*
+ * This returns the number of iovecs needed to log the given efd item.
+ * We only need 1 iovec for an efd item.  It just logs the efd_log_format
+ * structure.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_efd_item_size(xfs_efd_log_item_t *efdp)
+{
+	return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given efd log item. We use only 1 iovec, and we point that
+ * at the efd_log_format structure embedded in the efd item.
+ * It is at this point that we assert that all of the extent
+ * slots in the efd item have been filled.
+ */
+STATIC void
+xfs_efd_item_format(xfs_efd_log_item_t	*efdp,
+		    xfs_log_iovec_t	*log_vector)
+{
+	uint	size;
+
+	ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
+
+	efdp->efd_format.efd_type = XFS_LI_EFD;
+
+	size = sizeof(xfs_efd_log_format_t);
+	size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
+	efdp->efd_format.efd_size = 1;
+
+	log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
+	log_vector->i_len = size;
+	ASSERT(size >= sizeof(xfs_efd_log_format_t));
+}
+
+
+/*
+ * Pinning has no meaning for an efd item, so just return.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
+{
+	return;
+}
+
+
+/*
+ * Since pinning has no meaning for an efd item, unpinning does
+ * not either.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
+{
+	return;
+}
+
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp)
+{
+	return;
+}
+
+/*
+ * Efd items have no locking; return XFS_ITEM_LOCKED so the AIL
+ * push code does not try to do anything with us.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
+{
+	return XFS_ITEM_LOCKED;
+}
+
+/*
+ * Efd items have no locking; on unlock we need only handle the aborted
+ * case, freeing the EFD when its transaction has been aborted.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
+{
+	if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
+		xfs_efd_item_abort(efdp);
+	return;
+}
+
+/*
+ * When the efd item is committed to disk, all we need to do
+ * is delete our reference to our partner efi item and then
+ * free ourselves.  Since we're freeing ourselves we must
+ * return -1 to keep the transaction code from further referencing
+ * this item.
+ */
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
+{
+	uint	size;
+	int	nexts;
+
+	/*
+	 * If we got a log I/O error, it's always the case that the LR with the
+	 * EFI got unpinned and freed before the EFD got aborted.
+	 */
+	if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
+		xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
+
+	nexts = efdp->efd_format.efd_nextents;
+	if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
+		size = sizeof(xfs_efd_log_item_t);
+		size += (nexts - 1) * sizeof(xfs_extent_t);
+		kmem_free(efdp, size);
+	} else {
+		kmem_zone_free(xfs_efd_zone, efdp);
+	}
+
+	return (xfs_lsn_t)-1;
+}
+
+/*
+ * The transaction of which this EFD is a part has been aborted.
+ * Inform its companion EFI of this fact and then clean up after
+ * ourselves.  No need to clean up the slot for the item in the
+ * transaction.	 That was done by the unpin code which is called
+ * prior to this routine in the abort/fs-shutdown path.
+ */
+STATIC void
+xfs_efd_item_abort(xfs_efd_log_item_t *efdp)
+{
+	int	nexts;
+	int	size;
+
+	/*
+	 * If we got a log I/O error, it's always the case that the LR with the
+	 * EFI got unpinned and freed before the EFD got aborted. So don't
+	 * reference the EFI at all in that case.
+	 */
+	if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
+		xfs_efi_cancel(efdp->efd_efip);
+
+	nexts = efdp->efd_format.efd_nextents;
+	if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
+		size = sizeof(xfs_efd_log_item_t);
+		size += (nexts - 1) * sizeof(xfs_extent_t);
+		kmem_free(efdp, size);
+	} else {
+		kmem_zone_free(xfs_efd_zone, efdp);
+	}
+	return;
+}
+
+/*
+ * There isn't much you can do to push on an efd item.	It is simply
+ * stuck waiting for the log to be flushed to disk.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_push(xfs_efd_log_item_t *efdp)
+{
+	return;
+}
+
+/*
+ * The EFD dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_committing(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
+{
+	return;
+}
+
+/*
+ * This is the ops vector shared by all efd log items.
+ */
+struct xfs_item_ops xfs_efd_item_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efd_item_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_efd_item_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
+					xfs_efd_item_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efd_item_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_efd_item_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efd_item_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_efd_item_abort,
+	.iop_pushbuf	= NULL,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_efd_item_committing
+};
+
+
+/*
+ * Allocate and initialize an efd item with the given number of extents.
+ */
+xfs_efd_log_item_t *
+xfs_efd_init(xfs_mount_t	*mp,
+	     xfs_efi_log_item_t *efip,
+	     uint		nextents)
+
+{
+	xfs_efd_log_item_t	*efdp;
+	uint			size;
+
+	ASSERT(nextents > 0);
+	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
+		size = (uint)(sizeof(xfs_efd_log_item_t) +
+			((nextents - 1) * sizeof(xfs_extent_t)));
+		efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+	} else {
+		efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone,
+							     KM_SLEEP);
+	}
+
+	efdp->efd_item.li_type = XFS_LI_EFD;
+	efdp->efd_item.li_ops = &xfs_efd_item_ops;
+	efdp->efd_item.li_mountp = mp;
+	efdp->efd_efip = efip;
+	efdp->efd_format.efd_nextents = nextents;
+	efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
+
+	return (efdp);
+}
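+
+/*
+ * Illustrative sketch, not part of the patch proper, of the intended
+ * EFI/EFD pairing when freeing an extent across two transactions: the
+ * first transaction logs the intent, the second logs completion, and
+ * xfs_efi_release() disposes of the EFI once every logged extent has
+ * been accounted for.  The example_* function is hypothetical.
+ */
+#if 0	/* example only */
+void
+example_free_one_extent(xfs_mount_t *mp, xfs_fsblock_t bno, xfs_extlen_t len)
+{
+	xfs_efi_log_item_t	*efip;
+	xfs_efd_log_item_t	*efdp;
+
+	/* Transaction 1: log the intent to free one extent. */
+	efip = xfs_efi_init(mp, 1);
+	/* ... add efip to the transaction, log the extent, commit ... */
+
+	/* Transaction 2: free the extent and log the "done" item. */
+	efdp = xfs_efd_init(mp, efip, 1);
+	/* ... free the extent, log it in efdp, commit; the EFD's
+	 *     committed handler then calls xfs_efi_release(efip, 1). */
+}
+#endif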
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_extfree_item.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_extfree_item.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_extfree_item.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_extfree_item.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_EXTFREE_ITEM_H__
+#define __XFS_EXTFREE_ITEM_H__
+
+struct xfs_mount;
+struct kmem_zone;
+
+typedef struct xfs_extent {
+	xfs_dfsbno_t	ext_start;
+	xfs_extlen_t	ext_len;
+} xfs_extent_t;
+
+/*
+ * This is the structure used to lay out an efi log item in the
+ * log.	 The efi_extents field is a variable size array whose
+ * size is given by efi_nextents.
+ */
+typedef struct xfs_efi_log_format {
+	unsigned short		efi_type;	/* efi log item type */
+	unsigned short		efi_size;	/* size of this item */
+	uint			efi_nextents;	/* # extents to free */
+	__uint64_t		efi_id;		/* efi identifier */
+	xfs_extent_t		efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_t;
+
+/*
+ * This is the structure used to lay out an efd log item in the
+ * log.	 The efd_extents array is a variable size array whose
+ * size is given by efd_nextents.
+ */
+typedef struct xfs_efd_log_format {
+	unsigned short		efd_type;	/* efd log item type */
+	unsigned short		efd_size;	/* size of this item */
+	uint			efd_nextents;	/* # of extents freed */
+	__uint64_t		efd_efi_id;	/* id of corresponding efi */
+	xfs_extent_t		efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_t;
+
+
+#ifdef __KERNEL__
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_EFI_MAX_FAST_EXTENTS	16
+
+/*
+ * Define EFI flags.
+ */
+#define XFS_EFI_RECOVERED	0x1
+#define XFS_EFI_COMMITTED	0x2
+#define XFS_EFI_CANCELED	0x4
+
+/*
+ * This is the "extent free intention" log item.  It is used
+ * to log the fact that some extents need to be freed.  It is
+ * used in conjunction with the "extent free done" log item
+ * described below.
+ */
+typedef struct xfs_efi_log_item {
+	xfs_log_item_t		efi_item;
+	uint			efi_flags;	/* misc flags */
+	uint			efi_next_extent;
+	xfs_efi_log_format_t	efi_format;
+} xfs_efi_log_item_t;
+
+/*
+ * This is the "extent free done" log item.  It is used to log
+ * the fact that some extents earlier mentioned in an efi item
+ * have been freed.
+ */
+typedef struct xfs_efd_log_item {
+	xfs_log_item_t		efd_item;
+	xfs_efi_log_item_t	*efd_efip;
+	uint			efd_next_extent;
+	xfs_efd_log_format_t	efd_format;
+} xfs_efd_log_item_t;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_EFD_MAX_FAST_EXTENTS	16
+
+extern struct kmem_zone	*xfs_efi_zone;
+extern struct kmem_zone	*xfs_efd_zone;
+
+xfs_efi_log_item_t	*xfs_efi_init(struct xfs_mount *, uint);
+xfs_efd_log_item_t	*xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
+				      uint);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_EXTFREE_ITEM_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_fs.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_fs.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_fs.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_fs.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 1995-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307,
+ * USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef _LINUX_XFS_FS_H
+#define _LINUX_XFS_FS_H
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+
+
+/*
+ * SGI's XFS filesystem's major stuff (constants, structures)
+ */
+
+#define XFS_NAME	"xfs"
+
+/*
+ * Direct I/O attribute record used with XFS_IOC_DIOINFO
+ * d_miniosz is the minimum transfer size, the transfer size multiple,
+ * and the required file seek offset alignment.
+ */
+struct dioattr {
+	__u32		d_mem;		/* data buffer memory alignment */
+	__u32		d_miniosz;	/* min xfer size		*/
+	__u32		d_maxiosz;	/* max xfer size		*/
+};
+
+/*
+ * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR.
+ */
+struct fsxattr {
+	__u32		fsx_xflags;	/* xflags field value (get/set) */
+	__u32		fsx_extsize;	/* extsize field value (get/set)*/
+	__u32		fsx_nextents;	/* nextents field value (get)	*/
+	unsigned char	fsx_pad[16];
+};
+
+/*
+ * Flags for the bs_xflags/fsx_xflags field
+ * There should be a one-to-one correspondence between these flags and the
+ * XFS_DIFLAG_s.
+ */
+#define XFS_XFLAG_REALTIME	0x00000001
+#define XFS_XFLAG_PREALLOC	0x00000002
+#define XFS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
+#define XFS_XFLAG_ALL		\
+	( XFS_XFLAG_REALTIME|XFS_XFLAG_PREALLOC|XFS_XFLAG_HASATTR )
+
+
+/*
+ * Structure for XFS_IOC_GETBMAP.
+ * On input, fill in bmv_offset and bmv_length of the first structure
+ * to indicate the area of interest in the file, and bmv_count with the
+ * number of array elements given.  The first structure is updated on
+ * return to give the offset and length for the next call.
+ */
+struct getbmap {
+	__s64		bmv_offset;	/* file offset of segment in blocks */
+	__s64		bmv_block;	/* starting block (64-bit daddr_t)  */
+	__s64		bmv_length;	/* length of segment, blocks	    */
+	__s32		bmv_count;	/* # of entries in array incl. 1st  */
+	__s32		bmv_entries;	/* # of entries filled in (output)  */
+};
+
+/*
+ *	Structure for XFS_IOC_GETBMAPX.	 Fields bmv_offset through bmv_entries
+ *	are used exactly as in the getbmap structure.  The getbmapx structure
+ *	has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field
+ *	is only used for the first structure.  It contains input flags
+ *	specifying XFS_IOC_GETBMAPX actions.  The bmv_oflags field is filled
+ *	in by the XFS_IOC_GETBMAPX command for each returned structure after
+ *	the first.
+ */
+struct getbmapx {
+	__s64		bmv_offset;	/* file offset of segment in blocks */
+	__s64		bmv_block;	/* starting block (64-bit daddr_t)  */
+	__s64		bmv_length;	/* length of segment, blocks	    */
+	__s32		bmv_count;	/* # of entries in array incl. 1st  */
+	__s32		bmv_entries;	/* # of entries filled in (output). */
+	__s32		bmv_iflags;	/* input flags (1st structure)	    */
+	__s32		bmv_oflags;	/* output flags (after 1st structure)*/
+	__s32		bmv_unused1;	/* future use			    */
+	__s32		bmv_unused2;	/* future use			    */
+};
+
+/*	bmv_iflags values - set by XFS_IOC_GETBMAPX caller.	*/
+#define BMV_IF_ATTRFORK		0x1	/* return attr fork rather than data */
+#define BMV_IF_NO_DMAPI_READ	0x2	/* Do not generate DMAPI read event  */
+#define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */
+#define BMV_IF_VALID	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC)
+#ifdef __KERNEL__
+#define BMV_IF_EXTENDED 0x40000000	/* getbmapx if set */
+#endif
+
+/*	bmv_oflags values - returned for each non-header segment */
+#define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
+
+/*	Convert getbmap <-> getbmapx - move fields from p1 to p2. */
+#define GETBMAP_CONVERT(p1,p2) {	\
+	p2.bmv_offset = p1.bmv_offset;	\
+	p2.bmv_block = p1.bmv_block;	\
+	p2.bmv_length = p1.bmv_length;	\
+	p2.bmv_count = p1.bmv_count;	\
+	p2.bmv_entries = p1.bmv_entries;  }
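+
+/*
+ * Illustrative sketch, not part of this header: minimal userspace use
+ * of XFS_IOC_GETBMAP.  The first array element is the header describing
+ * the region of interest (offsets and lengths in 512-byte basic
+ * blocks); elements after it return the mappings.  The example_* helper
+ * is hypothetical.
+ */
+#if 0	/* example only */
+#include <stdio.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+static int example_print_extents(int fd)
+{
+	struct getbmap bmv[17];		/* header + up to 16 segments */
+	int i;
+
+	memset(bmv, 0, sizeof(bmv));
+	bmv[0].bmv_offset = 0;			/* start of file	*/
+	bmv[0].bmv_length = BTOBB(1024 * 1024);	/* first megabyte	*/
+	bmv[0].bmv_count = 17;			/* array size, incl. header */
+
+	if (ioctl(fd, XFS_IOC_GETBMAP, bmv) < 0)
+		return -1;
+	for (i = 1; i <= bmv[0].bmv_entries; i++)
+		printf("offset %lld block %lld length %lld\n",
+		       (long long)bmv[i].bmv_offset,
+		       (long long)bmv[i].bmv_block,
+		       (long long)bmv[i].bmv_length);
+	return 0;
+}
+#endif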
+
+
+/*
+ * Structure for XFS_IOC_FSSETDM.
+ * For use by backup and restore programs to set the XFS on-disk inode
+ * fields di_dmevmask and di_dmstate.  These must be set to exactly the
+ * values previously obtained via xfs_bulkstat!  (Specifically the
+ * xfs_bstat_t fields bs_dmevmask and bs_dmstate.)
+ */
+struct fsdmidata {
+	__u32		fsd_dmevmask;	/* corresponds to di_dmevmask */
+	__u16		fsd_padding;
+	__u16		fsd_dmstate;	/* corresponds to di_dmstate  */
+};
+
+/*
+ * File segment locking set data type for 64 bit access.
+ * Also used for all the RESV/FREE interfaces.
+ */
+typedef struct xfs_flock64 {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start;
+	__s64		l_len;		/* len == 0 means until end of file */
+	__s32		l_sysid;
+	pid_t		l_pid;
+	__s32		l_pad[4];	/* reserve area			    */
+} xfs_flock64_t;
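+
+/*
+ * Illustrative sketch, not part of this header: preallocating space
+ * from userspace with XFS_IOC_RESVSP64.  l_whence/l_start/l_len select
+ * the byte range, as with fcntl locks.  The example_* helper is
+ * hypothetical.
+ */
+#if 0	/* example only */
+#include <string.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+
+static int example_preallocate(int fd, __s64 offset, __s64 len)
+{
+	xfs_flock64_t fl;
+
+	memset(&fl, 0, sizeof(fl));
+	fl.l_whence = SEEK_SET;		/* offsets relative to file start */
+	fl.l_start = offset;
+	fl.l_len = len;
+	return ioctl(fd, XFS_IOC_RESVSP64, &fl);
+}
+#endif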
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY_V1
+ */
+typedef struct xfs_fsop_geom_v1 {
+	__u32		blocksize;	/* filesystem (data) block size */
+	__u32		rtextsize;	/* realtime extent size		*/
+	__u32		agblocks;	/* fsblocks in an AG		*/
+	__u32		agcount;	/* number of allocation groups	*/
+	__u32		logblocks;	/* fsblocks in the log		*/
+	__u32		sectsize;	/* (data) sector size, bytes	*/
+	__u32		inodesize;	/* inode size in bytes		*/
+	__u32		imaxpct;	/* max allowed inode space(%)	*/
+	__u64		datablocks;	/* fsblocks in data subvolume	*/
+	__u64		rtblocks;	/* fsblocks in realtime subvol	*/
+	__u64		rtextents;	/* rt extents in realtime subvol*/
+	__u64		logstart;	/* starting fsblock of the log	*/
+	unsigned char	uuid[16];	/* unique id of the filesystem	*/
+	__u32		sunit;		/* stripe unit, fsblocks	*/
+	__u32		swidth;		/* stripe width, fsblocks	*/
+	__s32		version;	/* structure version		*/
+	__u32		flags;		/* superblock version flags	*/
+	__u32		logsectsize;	/* log sector size, bytes	*/
+	__u32		rtsectsize;	/* realtime sector size, bytes	*/
+	__u32		dirblocksize;	/* directory block size, bytes	*/
+} xfs_fsop_geom_v1_t;
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY
+ */
+typedef struct xfs_fsop_geom {
+	__u32		blocksize;	/* filesystem (data) block size */
+	__u32		rtextsize;	/* realtime extent size		*/
+	__u32		agblocks;	/* fsblocks in an AG		*/
+	__u32		agcount;	/* number of allocation groups	*/
+	__u32		logblocks;	/* fsblocks in the log		*/
+	__u32		sectsize;	/* (data) sector size, bytes	*/
+	__u32		inodesize;	/* inode size in bytes		*/
+	__u32		imaxpct;	/* max allowed inode space(%)	*/
+	__u64		datablocks;	/* fsblocks in data subvolume	*/
+	__u64		rtblocks;	/* fsblocks in realtime subvol	*/
+	__u64		rtextents;	/* rt extents in realtime subvol*/
+	__u64		logstart;	/* starting fsblock of the log	*/
+	unsigned char	uuid[16];	/* unique id of the filesystem	*/
+	__u32		sunit;		/* stripe unit, fsblocks	*/
+	__u32		swidth;		/* stripe width, fsblocks	*/
+	__s32		version;	/* structure version		*/
+	__u32		flags;		/* superblock version flags	*/
+	__u32		logsectsize;	/* log sector size, bytes	*/
+	__u32		rtsectsize;	/* realtime sector size, bytes	*/
+	__u32		dirblocksize;	/* directory block size, bytes	*/
+	__u32		logsunit;	/* log stripe unit, bytes */
+} xfs_fsop_geom_t;
+
+/* Output for XFS_FS_COUNTS */
+typedef struct xfs_fsop_counts {
+	__u64	freedata;	/* free data section blocks */
+	__u64	freertx;	/* free rt extents */
+	__u64	freeino;	/* free inodes */
+	__u64	allocino;	/* total allocated inodes */
+} xfs_fsop_counts_t;
+
+/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */
+typedef struct xfs_fsop_resblks {
+	__u64  resblks;
+	__u64  resblks_avail;
+} xfs_fsop_resblks_t;
+
+#define XFS_FSOP_GEOM_VERSION	0
+
+#define XFS_FSOP_GEOM_FLAGS_ATTR	0x01	/* attributes in use	*/
+#define XFS_FSOP_GEOM_FLAGS_NLINK	0x02	/* 32-bit nlink values	*/
+#define XFS_FSOP_GEOM_FLAGS_QUOTA	0x04	/* quotas enabled	*/
+#define XFS_FSOP_GEOM_FLAGS_IALIGN	0x08	/* inode alignment	*/
+#define XFS_FSOP_GEOM_FLAGS_DALIGN	0x10	/* large data alignment */
+#define XFS_FSOP_GEOM_FLAGS_SHARED	0x20	/* read-only shared	*/
+#define XFS_FSOP_GEOM_FLAGS_EXTFLG	0x40	/* special extent flag	*/
+#define XFS_FSOP_GEOM_FLAGS_DIRV2	0x80	/* directory version 2	*/
+#define XFS_FSOP_GEOM_FLAGS_LOGV2      0x100	/* log format version 2 */
+
+
+/*
+ * Minimum and maximum sizes needed for growth checks
+ */
+#define XFS_MIN_AG_BLOCKS	64
+#define XFS_MIN_LOG_BLOCKS	512
+#define XFS_MAX_LOG_BLOCKS	(64 * 1024)
+#define XFS_MIN_LOG_BYTES	(256 * 1024)
+#define XFS_MAX_LOG_BYTES	(128 * 1024 * 1024)
+
+/*
+ * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
+ */
+typedef struct xfs_growfs_data {
+	__u64		newblocks;	/* new data subvol size, fsblocks */
+	__u32		imaxpct;	/* new inode space percentage limit */
+} xfs_growfs_data_t;
+
+typedef struct xfs_growfs_log {
+	__u32		newblocks;	/* new log size, fsblocks */
+	__u32		isint;		/* 1 if new log is internal */
+} xfs_growfs_log_t;
+
+typedef struct xfs_growfs_rt {
+	__u64		newblocks;	/* new realtime size, fsblocks */
+	__u32		extsize;	/* new realtime extent size, fsblocks */
+} xfs_growfs_rt_t;
+
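+/*
+ * Illustrative sketch, not part of this header: growing the data
+ * section from userspace.  The new size is given in filesystem blocks
+ * and may not shrink the filesystem; the kernel side rejects
+ * newblocks < sb_dblocks.  The example_* helper is hypothetical.
+ */
+#if 0	/* example only */
+#include <sys/ioctl.h>
+
+static int example_growfs_data(int fd, __u64 new_fsblocks, __u32 imaxpct)
+{
+	xfs_growfs_data_t in;
+
+	in.newblocks = new_fsblocks;	/* new data subvolume size */
+	in.imaxpct = imaxpct;		/* new inode space % limit */
+	return ioctl(fd, XFS_IOC_FSGROWFSDATA, &in);
+}
+#endif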
+
+/*
+ * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE
+ */
+typedef struct xfs_bstime {
+	time_t		tv_sec;		/* seconds		*/
+	__s32		tv_nsec;	/* and nanoseconds	*/
+} xfs_bstime_t;
+
+typedef struct xfs_bstat {
+	__u64		bs_ino;		/* inode number			*/
+	__u16		bs_mode;	/* type and mode		*/
+	__u16		bs_nlink;	/* number of links		*/
+	__u32		bs_uid;		/* user id			*/
+	__u32		bs_gid;		/* group id			*/
+	__u32		bs_rdev;	/* device value			*/
+	__s32		bs_blksize;	/* block size			*/
+	__s64		bs_size;	/* file size			*/
+	xfs_bstime_t	bs_atime;	/* access time			*/
+	xfs_bstime_t	bs_mtime;	/* modify time			*/
+	xfs_bstime_t	bs_ctime;	/* inode change time		*/
+	int64_t		bs_blocks;	/* number of blocks		*/
+	__u32		bs_xflags;	/* extended flags		*/
+	__s32		bs_extsize;	/* extent size			*/
+	__s32		bs_extents;	/* number of extents		*/
+	__u32		bs_gen;		/* generation count		*/
+	__u16		bs_projid;	/* project id			*/
+	unsigned char	bs_pad[14];	/* pad space, unused		*/
+	__u32		bs_dmevmask;	/* DMIG event mask		*/
+	__u16		bs_dmstate;	/* DMIG state info		*/
+	__u16		bs_aextents;	/* attribute number of extents	*/
+} xfs_bstat_t;
+
+/*
+ * The user-level BulkStat Request interface structure.
+ */
+typedef struct xfs_fsop_bulkreq {
+	__u64		*lastip;	/* last inode # pointer		*/
+	__s32		icount;		/* count of entries in buffer	*/
+	void		*ubuffer;	/* user buffer for inode desc.	*/
+	__s32		*ocount;	/* output count pointer		*/
+} xfs_fsop_bulkreq_t;
+
+
+/*
+ * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS).
+ */
+typedef struct xfs_inogrp {
+	__u64		xi_startino;	/* starting inode number	*/
+	__s32		xi_alloccount;	/* # bits set in allocmask	*/
+	__u64		xi_allocmask;	/* mask of allocated inodes	*/
+} xfs_inogrp_t;
+
+
+/*
+ * Error injection.
+ */
+typedef struct xfs_error_injection {
+	__s32		fd;
+	__s32		errtag;
+} xfs_error_injection_t;
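+
+/*
+ * Illustrative sketch, not part of this header: arming an injection
+ * tag from userspace on a DEBUG kernel.  The ioctl is issued against a
+ * file descriptor on the target filesystem; mirroring that fd in the
+ * struct is an assumption here, since the kernel derives the mount
+ * from the ioctl fd itself.  XFS_IOC_ERROR_CLEARALL disarms the tags.
+ * The example_* helper is hypothetical.
+ */
+#if 0	/* example only */
+#include <sys/ioctl.h>
+
+static int example_arm_errtag(int fd, int tag)
+{
+	xfs_error_injection_t inj;
+
+	inj.fd = fd;
+	inj.errtag = tag;	/* e.g. XFS_ERRTAG_IFLUSH_1 */
+	return ioctl(fd, XFS_IOC_ERROR_INJECTION, &inj);
+}
+#endif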
+
+
+/*
+ * The user-level Handle Request interface structure.
+ */
+typedef struct xfs_fsop_handlereq {
+	__u32		fd;		/* fd for FD_TO_HANDLE		*/
+	void		*path;		/* user pathname		*/
+	__u32		oflags;		/* open flags			*/
+	void		*ihandle;	/* user supplied handle		*/
+	__u32		ihandlen;	/* user supplied length		*/
+	void		*ohandle;	/* user buffer for handle	*/
+	__u32		*ohandlen;	/* user buffer length		*/
+} xfs_fsop_handlereq_t;
+
+/*
+ * Compound structures for passing args through Handle Request interfaces
+ * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle
+ * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and
+ *	     XFS_IOC_ATTRMULTI_BY_HANDLE
+ */
+
+typedef struct xfs_fsop_setdm_handlereq {
+	struct xfs_fsop_handlereq hreq; /* handle interface structure */
+	struct fsdmidata *data;		/* DMAPI data to set	      */
+} xfs_fsop_setdm_handlereq_t;
+
+typedef struct xfs_attrlist_cursor {
+	__u32	opaque[4];
+} xfs_attrlist_cursor_t;
+
+typedef struct xfs_fsop_attrlist_handlereq {
+	struct xfs_fsop_handlereq hreq; /* handle interface structure */
+	struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
+	__u32 flags;			/* flags, use ROOT/USER names */
+	__u32 buflen;			/* length of buffer supplied  */
+	void *buffer;			/* attrlist data to return    */
+} xfs_fsop_attrlist_handlereq_t;
+
+typedef struct xfs_attr_multiop {
+	__u32	am_opcode;
+	__s32	am_error;
+	void	*am_attrname;
+	void	*am_attrvalue;
+	__u32	am_length;
+	__u32	am_flags;
+} xfs_attr_multiop_t;
+
+typedef struct xfs_fsop_attrmulti_handlereq {
+	struct xfs_fsop_handlereq hreq; /* handle interface structure */
+	__u32 opcount;			/* count of following multiop */
+	struct xfs_attr_multiop *ops;	/* attr_multi data to get/set */
+} xfs_fsop_attrmulti_handlereq_t;
+
+/*
+ * File system identifier. Should be unique (at least per machine).
+ */
+typedef struct {
+	__u32 val[2];			/* file system id type */
+} xfs_fsid_t;
+
+/*
+ * File identifier.  Should be unique per filesystem on a single machine.
+ * This is typically used by a stateless file server in order to generate
+ * "file handles".
+ */
+#define MAXFIDSZ	46
+typedef struct fid {
+	__u16		fid_len;		/* length of data in bytes */
+	unsigned char	fid_data[MAXFIDSZ];	/* data (variable length)  */
+} fid_t;
+
+typedef struct xfs_fid {
+	__u16	xfs_fid_len;		/* length of remainder	*/
+	__u16	xfs_fid_pad;
+	__u32	xfs_fid_gen;		/* generation number	*/
+	__u64	xfs_fid_ino;		/* 64 bits inode number */
+} xfs_fid_t;
+
+typedef struct xfs_fid2 {
+	__u16	fid_len;	/* length of remainder */
+	__u16	fid_pad;	/* padding, must be zero */
+	__u32	fid_gen;	/* generation number */
+	__u64	fid_ino;	/* inode number */
+} xfs_fid2_t;
+
+typedef struct xfs_handle {
+	union {
+		__s64	    align;	/* force alignment of ha_fid	 */
+		xfs_fsid_t  _ha_fsid;	/* unique file system identifier */
+	} ha_u;
+	xfs_fid_t	ha_fid;		/* file system specific file ID	 */
+} xfs_handle_t;
+#define ha_fsid ha_u._ha_fsid
+
+#define XFS_HSIZE(handle)	(((char *) &(handle).ha_fid.xfs_fid_pad	 \
+				 - (char *) &(handle))			  \
+				 + (handle).ha_fid.xfs_fid_len)
+
+#define XFS_HANDLE_CMP(h1, h2)	bcmp(h1, h2, sizeof (xfs_handle_t))
+
+#define FSHSIZE		sizeof (fsid_t)
+
+
+/*
+ * ioctl commands that replace IRIX fcntl()'s
+ * For 'documentation' purposes more than anything else,
+ * the "cmd #" field reflects the IRIX fcntl number.
+ */
+#define XFS_IOC_ALLOCSP		_IOW ('X', 10, struct xfs_flock64)
+#define XFS_IOC_FREESP		_IOW ('X', 11, struct xfs_flock64)
+#define XFS_IOC_DIOINFO		_IOR ('X', 30, struct dioattr)
+#define XFS_IOC_FSGETXATTR	_IOR ('X', 31, struct fsxattr)
+#define XFS_IOC_FSSETXATTR	_IOW ('X', 32, struct fsxattr)
+#define XFS_IOC_ALLOCSP64	_IOW ('X', 36, struct xfs_flock64)
+#define XFS_IOC_FREESP64	_IOW ('X', 37, struct xfs_flock64)
+#define XFS_IOC_GETBMAP		_IOWR('X', 38, struct getbmap)
+#define XFS_IOC_FSSETDM		_IOW ('X', 39, struct fsdmidata)
+#define XFS_IOC_RESVSP		_IOW ('X', 40, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP	_IOW ('X', 41, struct xfs_flock64)
+#define XFS_IOC_RESVSP64	_IOW ('X', 42, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP64	_IOW ('X', 43, struct xfs_flock64)
+#define XFS_IOC_GETBMAPA	_IOWR('X', 44, struct getbmap)
+#define XFS_IOC_FSGETXATTRA	_IOR ('X', 45, struct fsxattr)
+/*	XFS_IOC_SETBIOSIZE ---- deprecated 46	   */
+/*	XFS_IOC_GETBIOSIZE ---- deprecated 47	   */
+#define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
+
+/*
+ * ioctl commands that replace IRIX syssgi()'s
+ */
+#define XFS_IOC_FSGEOMETRY_V1	     _IOR ('X', 100, struct xfs_fsop_geom_v1)
+#define XFS_IOC_FSBULKSTAT	     _IOWR('X', 101, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE    _IOWR('X', 102, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS	     _IOWR('X', 103, struct xfs_fsop_bulkreq)
+#define XFS_IOC_PATH_TO_FSHANDLE     _IOWR('X', 104, struct xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE	     _IOWR('X', 105, struct xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE	     _IOWR('X', 106, struct xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE	     _IOWR('X', 107, struct xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE   _IOWR('X', 108, struct xfs_fsop_handlereq)
+#define XFS_IOC_SWAPEXT		     _IOWR('X', 109, struct xfs_swapext)
+#define XFS_IOC_FSGROWFSDATA	     _IOW ('X', 110, struct xfs_growfs_data)
+#define XFS_IOC_FSGROWFSLOG	     _IOW ('X', 111, struct xfs_growfs_log)
+#define XFS_IOC_FSGROWFSRT	     _IOW ('X', 112, struct xfs_growfs_rt)
+#define XFS_IOC_FSCOUNTS	     _IOR ('X', 113, struct xfs_fsop_counts)
+#define XFS_IOC_SET_RESBLKS	     _IOR ('X', 114, struct xfs_fsop_resblks)
+#define XFS_IOC_GET_RESBLKS	     _IOR ('X', 115, struct xfs_fsop_resblks)
+#define XFS_IOC_ERROR_INJECTION	     _IOW ('X', 116, struct xfs_error_injection)
+#define XFS_IOC_ERROR_CLEARALL	     _IOW ('X', 117, struct xfs_error_injection)
+/*	XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118	 */
+#define XFS_IOC_FREEZE		     _IOWR('X', 119, int)
+#define XFS_IOC_THAW		     _IOWR('X', 120, int)
+#define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
+#define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
+#define XFS_IOC_ATTRMULTI_BY_HANDLE  _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
+#define XFS_IOC_FSGEOMETRY	     _IOR ('X', 124, struct xfs_fsop_geom)
+/*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
+
+
+/*
+ * Block I/O parameterization.  A basic block (BB) is the smallest unit
+ * of filesystem allocation, and must equal 512.  Length units given to
+ * bio routines are in BBs.
+ */
+#define BBSHIFT		9
+#define BBSIZE		(1<<BBSHIFT)
+#define BBMASK		(BBSIZE-1)
+#define BTOBB(bytes)	(((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT)
+#define BTOBBT(bytes)	((__u64)(bytes) >> BBSHIFT)
+#define BBTOB(bbs)	((bbs) << BBSHIFT)
+#define OFFTOBB(bytes)	(((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT)
+#define OFFTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT)
+#define BBTOOFF(bbs)	((__u64)(bbs) << BBSHIFT)
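+
+/*
+ * Worked example: with BBSIZE = 512, BTOBB(513) rounds up to 2 basic
+ * blocks, BTOBBT(513) truncates to 1, and BBTOB(2) gives back 1024.
+ */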
+
+#define SEEKLIMIT32	0x7fffffff
+#define BBSEEKLIMIT32	BTOBBT(SEEKLIMIT32)
+#define SEEKLIMIT	0x7fffffffffffffffLL
+#define BBSEEKLIMIT	OFFTOBBT(SEEKLIMIT)
+
+#endif	/* _LINUX_XFS_FS_H */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_fsops.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_fsops.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_fsops.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_fsops.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,598 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+/*
+ * File system operations
+ */
+
+int
+xfs_fs_geometry(
+	xfs_mount_t		*mp,
+	xfs_fsop_geom_t		*geo,
+	int			new_version)
+{
+	geo->blocksize = mp->m_sb.sb_blocksize;
+	geo->rtextsize = mp->m_sb.sb_rextsize;
+	geo->agblocks = mp->m_sb.sb_agblocks;
+	geo->agcount = mp->m_sb.sb_agcount;
+	geo->logblocks = mp->m_sb.sb_logblocks;
+	geo->sectsize = mp->m_sb.sb_sectsize;
+	geo->inodesize = mp->m_sb.sb_inodesize;
+	geo->imaxpct = mp->m_sb.sb_imax_pct;
+	geo->datablocks = mp->m_sb.sb_dblocks;
+	geo->rtblocks = mp->m_sb.sb_rblocks;
+	geo->rtextents = mp->m_sb.sb_rextents;
+	geo->logstart = mp->m_sb.sb_logstart;
+	ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid));
+	memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid));
+	if (new_version >= 2) {
+		geo->sunit = mp->m_sb.sb_unit;
+		geo->swidth = mp->m_sb.sb_width;
+	}
+	if (new_version >= 3) {
+		geo->version = XFS_FSOP_GEOM_VERSION;
+		geo->flags =
+			(XFS_SB_VERSION_HASATTR(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
+			(XFS_SB_VERSION_HASNLINK(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_NLINK : 0) |
+			(XFS_SB_VERSION_HASQUOTA(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
+			(XFS_SB_VERSION_HASALIGN(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
+			(XFS_SB_VERSION_HASDALIGN(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
+			(XFS_SB_VERSION_HASSHARED(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_SHARED : 0) |
+			(XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
+			(XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_DIRV2 : 0);
+		geo->logsectsize = mp->m_sb.sb_sectsize;	/* XXX */
+		geo->rtsectsize = mp->m_sb.sb_sectsize;		/* XXX */
+		geo->dirblocksize = mp->m_dirblksize;
+	}
+	if (new_version >= 4) {
+		geo->flags |=
+			(XFS_SB_VERSION_HASLOGV2(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_LOGV2 : 0);
+		geo->logsunit = mp->m_sb.sb_logsunit;
+	}
+	return 0;
+}
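+
+/*
+ * Illustrative sketch, not part of the patch proper: the structure
+ * filled in above is what userspace receives via XFS_IOC_FSGEOMETRY.
+ * The example_* helper is hypothetical.
+ */
+#if 0	/* example only */
+static int example_print_geometry(int fd)
+{
+	xfs_fsop_geom_t geo;
+
+	if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0)
+		return -1;
+	printf("blocksize %u, %u AGs of %u blocks each, log %u blocks\n",
+	       geo.blocksize, geo.agcount, geo.agblocks, geo.logblocks);
+	return 0;
+}
+#endif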
+
+static int
+xfs_growfs_data_private(
+	xfs_mount_t		*mp,		/* mount point for filesystem */
+	xfs_growfs_data_t	*in)		/* growfs data input struct */
+{
+	xfs_agf_t		*agf;
+	xfs_agi_t		*agi;
+	xfs_agnumber_t		agno;
+	xfs_extlen_t		agsize;
+	xfs_extlen_t		tmpsize;
+	xfs_alloc_rec_t		*arec;
+	xfs_btree_sblock_t	*block;
+	xfs_buf_t		*bp;
+	int			bsize;
+	int			bucket;
+	xfs_daddr_t		disk_addr;
+	int			dpct;
+	int			error;
+	xfs_agnumber_t		nagcount;
+	xfs_rfsblock_t		nb, nb_mod;
+	xfs_rfsblock_t		new;
+	xfs_rfsblock_t		nfree;
+	xfs_agnumber_t		oagcount;
+	int			pct;
+	xfs_sb_t		*sbp;
+	int			sectbb;
+	xfs_trans_t		*tp;
+
+	nb = in->newblocks;
+	pct = in->imaxpct;
+	if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
+		return XFS_ERROR(EINVAL);
+	dpct = pct - mp->m_sb.sb_imax_pct;
+	error = xfs_read_buf(mp, mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - 1, 1,
+		0, &bp);
+	if (error)
+		return error;
+	ASSERT(bp);
+	xfs_buf_relse(bp);
+
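+	/*
+	 * do_div() divides its 64-bit first argument in place and
+	 * returns the remainder: after the next two lines "new" holds
+	 * nb / sb_agblocks and nb_mod holds nb % sb_agblocks, so
+	 * nagcount rounds the AG count up for any partial final AG.
+	 */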
+	new = nb;	/* use new as a temporary here */
+	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
+	nagcount = new + (nb_mod != 0);
+	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
+		nagcount--;
+		nb = nagcount * mp->m_sb.sb_agblocks;
+		if (nb < mp->m_sb.sb_dblocks)
+			return XFS_ERROR(EINVAL);
+	}
+	new = in->newblocks - mp->m_sb.sb_dblocks;
+	oagcount = mp->m_sb.sb_agcount;
+	if (nagcount > oagcount) {
+		down_write(&mp->m_peraglock);
+		mp->m_perag = kmem_realloc(mp->m_perag,
+			sizeof(xfs_perag_t) * nagcount,
+			sizeof(xfs_perag_t) * oagcount,
+			KM_SLEEP);
+		bzero(&mp->m_perag[oagcount],
+			(nagcount - oagcount) * sizeof(xfs_perag_t));
+		mp->m_flags |= XFS_MOUNT_32BITINODES;
+		xfs_initialize_perag(mp, nagcount);
+		up_write(&mp->m_peraglock);
+	}
+	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
+	if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
+			XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	/* new ag's */
+	sectbb = BTOBB(mp->m_sb.sb_sectsize);
+	bsize = mp->m_sb.sb_blocksize;
+
+	nfree = 0;
+	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
+		/*
+		 * AG freelist header block
+		 */
+		disk_addr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR);
+		bp = xfs_buf_get(mp->m_ddev_targp,
+				  disk_addr,
+				  sectbb, 0);
+		agf = XFS_BUF_TO_AGF(bp);
+		bzero(agf, mp->m_sb.sb_sectsize);
+		INT_SET(agf->agf_magicnum, ARCH_CONVERT, XFS_AGF_MAGIC);
+		INT_SET(agf->agf_versionnum, ARCH_CONVERT, XFS_AGF_VERSION);
+		INT_SET(agf->agf_seqno, ARCH_CONVERT, agno);
+		if (agno == nagcount - 1)
+			agsize =
+				nb -
+				(agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
+		else
+			agsize = mp->m_sb.sb_agblocks;
+		INT_SET(agf->agf_length, ARCH_CONVERT, agsize);
+		INT_SET(agf->agf_roots[XFS_BTNUM_BNOi], ARCH_CONVERT, XFS_BNO_BLOCK(mp));
+		INT_SET(agf->agf_roots[XFS_BTNUM_CNTi], ARCH_CONVERT, XFS_CNT_BLOCK(mp));
+		INT_SET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT, 1);
+		INT_SET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT, 1);
+		INT_ZERO(agf->agf_flfirst, ARCH_CONVERT);
+		INT_SET(agf->agf_fllast, ARCH_CONVERT, XFS_AGFL_SIZE - 1);
+		INT_ZERO(agf->agf_flcount, ARCH_CONVERT);
+		tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
+		INT_SET(agf->agf_freeblks, ARCH_CONVERT, tmpsize);
+		INT_SET(agf->agf_longest, ARCH_CONVERT, tmpsize);
+		error = xfs_bwrite(mp, bp);
+		if (error) {
+			goto error0;
+		}
+		/*
+		 * AG inode header block
+		 */
+		disk_addr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+		bp = xfs_buf_get(mp->m_ddev_targp,
+				  disk_addr,
+				  sectbb, 0);
+		agi = XFS_BUF_TO_AGI(bp);
+		bzero(agi, mp->m_sb.sb_sectsize);
+		INT_SET(agi->agi_magicnum, ARCH_CONVERT, XFS_AGI_MAGIC);
+		INT_SET(agi->agi_versionnum, ARCH_CONVERT, XFS_AGI_VERSION);
+		INT_SET(agi->agi_seqno, ARCH_CONVERT, agno);
+		INT_SET(agi->agi_length, ARCH_CONVERT, agsize);
+		INT_ZERO(agi->agi_count, ARCH_CONVERT);
+		INT_SET(agi->agi_root, ARCH_CONVERT, XFS_IBT_BLOCK(mp));
+		INT_SET(agi->agi_level, ARCH_CONVERT, 1);
+		INT_ZERO(agi->agi_freecount, ARCH_CONVERT);
+		INT_SET(agi->agi_newino, ARCH_CONVERT, NULLAGINO);
+		INT_SET(agi->agi_dirino, ARCH_CONVERT, NULLAGINO);
+		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+			INT_SET(agi->agi_unlinked[bucket], ARCH_CONVERT, NULLAGINO);
+		error = xfs_bwrite(mp, bp);
+		if (error) {
+			goto error0;
+		}
+		/*
+		 * BNO btree root block
+		 */
+		disk_addr = XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp));
+		bp = xfs_buf_get(mp->m_ddev_targp,
+			disk_addr,
+			BTOBB(bsize), 0);
+		block = XFS_BUF_TO_SBLOCK(bp);
+		bzero(block, bsize);
+		INT_SET(block->bb_magic, ARCH_CONVERT, XFS_ABTB_MAGIC);
+		INT_ZERO(block->bb_level, ARCH_CONVERT);
+		INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
+		INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
+		INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
+		arec = XFS_BTREE_REC_ADDR(bsize, xfs_alloc, block, 1,
+			mp->m_alloc_mxr[0]);
+		INT_SET(arec->ar_startblock, ARCH_CONVERT, XFS_PREALLOC_BLOCKS(mp));
+		INT_SET(arec->ar_blockcount, ARCH_CONVERT, agsize - INT_GET(arec->ar_startblock, ARCH_CONVERT));
+		error = xfs_bwrite(mp, bp);
+		if (error) {
+			goto error0;
+		}
+		/*
+		 * CNT btree root block
+		 */
+		disk_addr = XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp));
+		bp = xfs_buf_get(mp->m_ddev_targp,
+			disk_addr,
+			BTOBB(bsize), 0);
+		block = XFS_BUF_TO_SBLOCK(bp);
+		bzero(block, bsize);
+		INT_SET(block->bb_magic, ARCH_CONVERT, XFS_ABTC_MAGIC);
+		INT_ZERO(block->bb_level, ARCH_CONVERT);
+		INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
+		INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
+		INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
+		arec = XFS_BTREE_REC_ADDR(bsize, xfs_alloc, block, 1,
+			mp->m_alloc_mxr[0]);
+		INT_SET(arec->ar_startblock, ARCH_CONVERT, XFS_PREALLOC_BLOCKS(mp));
+		INT_SET(arec->ar_blockcount, ARCH_CONVERT, agsize - INT_GET(arec->ar_startblock, ARCH_CONVERT));
+		nfree += INT_GET(arec->ar_blockcount, ARCH_CONVERT);
+		error = xfs_bwrite(mp, bp);
+		if (error) {
+			goto error0;
+		}
+		/*
+		 * INO btree root block
+		 */
+		disk_addr = XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp));
+		bp = xfs_buf_get(mp->m_ddev_targp,
+			disk_addr,
+			BTOBB(bsize), 0);
+		block = XFS_BUF_TO_SBLOCK(bp);
+		bzero(block, bsize);
+		INT_SET(block->bb_magic, ARCH_CONVERT, XFS_IBT_MAGIC);
+		INT_ZERO(block->bb_level, ARCH_CONVERT);
+		INT_ZERO(block->bb_numrecs, ARCH_CONVERT);
+		INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
+		INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
+		error = xfs_bwrite(mp, bp);
+		if (error) {
+			goto error0;
+		}
+	}
+	xfs_trans_agblocks_delta(tp, nfree);
+	/*
+	 * There are new blocks in the old last a.g.
+	 */
+	if (new) {
+		/*
+		 * Change the agi length.
+		 */
+		error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
+		if (error) {
+			goto error0;
+		}
+		ASSERT(bp);
+		agi = XFS_BUF_TO_AGI(bp);
+		INT_MOD(agi->agi_length, ARCH_CONVERT, new);
+		ASSERT(nagcount == oagcount
+		       || INT_GET(agi->agi_length, ARCH_CONVERT) == mp->m_sb.sb_agblocks);
+		xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
+		/*
+		 * Change agf length.
+		 */
+		error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
+		if (error) {
+			goto error0;
+		}
+		ASSERT(bp);
+		agf = XFS_BUF_TO_AGF(bp);
+		INT_MOD(agf->agf_length, ARCH_CONVERT, new);
+		ASSERT(INT_GET(agf->agf_length, ARCH_CONVERT) ==
+				INT_GET(agi->agi_length, ARCH_CONVERT));
+		/*
+		 * Free the new space.
+		 */
+		error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno,
+				INT_GET(agf->agf_length, ARCH_CONVERT) - new), new);
+		if (error) {
+			goto error0;
+		}
+	}
+	if (nagcount > oagcount)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
+	if (nb > mp->m_sb.sb_dblocks)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
+				 nb - mp->m_sb.sb_dblocks);
+	if (nfree)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
+	if (dpct)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+	error = xfs_trans_commit(tp, 0, NULL);
+	if (error) {
+		return error;
+	}
+	if (mp->m_sb.sb_imax_pct) {
+		__uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+		do_div(icount, 100);
+		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+	} else
+		mp->m_maxicount = 0;
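+	/*
+	 * Rewrite the secondary superblock in every AG so that they all
+	 * reflect the new geometry; the primary superblock has already
+	 * been committed above.
+	 */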
+	for (agno = 1; agno < nagcount; agno++) {
+		error = xfs_read_buf(mp, mp->m_ddev_targp,
+				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+				  sectbb, 0, &bp);
+		if (error) {
+			xfs_fs_cmn_err(CE_WARN, mp,
+			"error %d reading secondary superblock for ag %d\n",
+				error, agno);
+			break;
+		}
+		sbp = XFS_BUF_TO_SBP(bp);
+		xfs_xlatesb(sbp, &mp->m_sb, -1, ARCH_CONVERT, XFS_SB_ALL_BITS);
+		/*
+		 * If we get an error writing out the alternate superblocks,
+		 * just issue a warning and continue.  The real work is
+		 * already done and committed.
+		 */
+		if (!(error = xfs_bwrite(mp, bp))) {
+			continue;
+		} else {
+			xfs_fs_cmn_err(CE_WARN, mp,
+		"write error %d updating secondary superblock for ag %d\n",
+				error, agno);
+			break; /* no point in continuing */
+		}
+	}
+	return 0;
+
+ error0:
+	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	return error;
+}
+
+static int
+xfs_growfs_log_private(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_growfs_log_t	*in)	/* growfs log input struct */
+{
+	xfs_extlen_t		nb;
+
+	nb = in->newblocks;
+	if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
+		return XFS_ERROR(EINVAL);
+	if (nb == mp->m_sb.sb_logblocks &&
+	    in->isint == (mp->m_sb.sb_logstart != 0))
+		return XFS_ERROR(EINVAL);
+	/*
+	 * Moving the log is hard: it needs new interfaces to sync the
+	 * log first and to hold off all activity while moving it.  It
+	 * could allow a shorter or longer log in the same space, or
+	 * transform an internal log to an external one (or vice versa).
+	 */
+	return XFS_ERROR(ENOSYS);
+}
+
+/*
+ * Protected versions of the growfs functions: they acquire and release
+ * the grow lock on the mount point, and are exported through the ioctls
+ * XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG and XFS_IOC_FSGROWFSRT.
+ */
+
+
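+/*
+ * m_growlock is only taken conditionally (cpsema), so a concurrent grow
+ * attempt fails immediately with EWOULDBLOCK instead of blocking.
+ */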
+int
+xfs_growfs_data(
+	xfs_mount_t		*mp,
+	xfs_growfs_data_t	*in)
+{
+	int error;
+	if (!cpsema(&mp->m_growlock))
+		return XFS_ERROR(EWOULDBLOCK);
+	error = xfs_growfs_data_private(mp, in);
+	vsema(&mp->m_growlock);
+	return error;
+}
+
+int
+xfs_growfs_log(
+	xfs_mount_t		*mp,
+	xfs_growfs_log_t	*in)
+{
+	int error;
+	if (!cpsema(&mp->m_growlock))
+		return XFS_ERROR(EWOULDBLOCK);
+	error = xfs_growfs_log_private(mp, in);
+	vsema(&mp->m_growlock);
+	return error;
+}
+
+/*
+ * exported through ioctl XFS_IOC_FSCOUNTS
+ */
+
+int
+xfs_fs_counts(
+	xfs_mount_t		*mp,
+	xfs_fsop_counts_t	*cnt)
+{
+	unsigned long	s;
+
+	s = XFS_SB_LOCK(mp);
+	cnt->freedata = mp->m_sb.sb_fdblocks;
+	cnt->freertx = mp->m_sb.sb_frextents;
+	cnt->freeino = mp->m_sb.sb_ifree;
+	cnt->allocino = mp->m_sb.sb_icount;
+	XFS_SB_UNLOCK(mp, s);
+	return 0;
+}
+
+/*
+ * exported through ioctls XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
+ *
+ * xfs_reserve_blocks is called to set m_resblks
+ * in the in-core mount structure.  The number of unused reserved blocks
+ * is kept in m_resblks_avail.
+ *
+ * Reserve the requested number of blocks if available.  Otherwise reserve
+ * as many as possible to satisfy the request.  The actual number
+ * reserved is returned in outval.
+ *
+ * A null inval pointer indicates that only the current reserved-block
+ * counts should be returned; no settings are changed.
+ */
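+/*
+ * Example with hypothetical numbers: if sb_fdblocks is 100, the previous
+ * reservation was 10 and the new request is 40, then delta is 30 and 30
+ * blocks move from the free pool into the reservation.  Had only 20
+ * blocks been free, all 20 would be taken and m_resblks would end up at
+ * 30, short of the request.
+ */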
+
+int
+xfs_reserve_blocks(
+	xfs_mount_t		*mp,
+	__uint64_t		*inval,
+	xfs_fsop_resblks_t	*outval)
+{
+	__int64_t		lcounter, delta; /* signed: checks below rely on negative values */
+	__uint64_t		request;
+	unsigned long s;
+
+	/* If inval is null, report current values and return */
+
+	if (inval == (__uint64_t *)NULL) {
+		outval->resblks = mp->m_resblks;
+		outval->resblks_avail = mp->m_resblks_avail;
+		return(0);
+	}
+
+	request = *inval;
+	s = XFS_SB_LOCK(mp);
+
+	/*
+	 * If our previous reservation was larger than the current value,
+	 * then move any unused blocks back to the free pool.
+	 */
+
+	if (mp->m_resblks > request) {
+		lcounter = mp->m_resblks_avail - request;
+		if (lcounter > 0) {		/* release unused blocks */
+			mp->m_sb.sb_fdblocks += lcounter;
+			mp->m_resblks_avail -= lcounter;
+		}
+		mp->m_resblks = request;
+	} else {
+		delta = request - mp->m_resblks;
+		lcounter = mp->m_sb.sb_fdblocks;
+		lcounter -= delta;
+		if (lcounter < 0) {
+			/* We can't satisfy the request, just get what we can */
+			mp->m_resblks += mp->m_sb.sb_fdblocks;
+			mp->m_resblks_avail += mp->m_sb.sb_fdblocks;
+			mp->m_sb.sb_fdblocks = 0;
+		} else {
+			mp->m_sb.sb_fdblocks = lcounter;
+			mp->m_resblks = request;
+			mp->m_resblks_avail += delta;
+		}
+	}
+
+	outval->resblks = mp->m_resblks;
+	outval->resblks_avail = mp->m_resblks_avail;
+	XFS_SB_UNLOCK(mp, s);
+	return(0);
+}
+
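+/*
+ * Force a small synchronous transaction into the log: lock the root
+ * inode, log its (unchanged) core and commit.  This pushes a record to
+ * the on-disk log without modifying any real metadata, e.g. to help
+ * cover an otherwise idle log.
+ */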
+void
+xfs_fs_log_dummy(xfs_mount_t *mp)
+{
+	xfs_trans_t *tp;
+	xfs_inode_t *ip;
+
+
+	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+	atomic_inc(&mp->m_active_trans);
+	if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) {
+		xfs_trans_cancel(tp, 0);
+		return;
+	}
+
+	ip = mp->m_rootip;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_commit(tp, XFS_TRANS_SYNC, NULL);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+}
+
+int
+xfs_fs_freeze(
+	xfs_mount_t	*mp)
+{
+	vfs_t		*vfsp;
+	/*REFERENCED*/
+	int		error;
+
+	vfsp = XFS_MTOVFS(mp);
+
+	/* Stop new writers */
+	xfs_start_freeze(mp, XFS_FREEZE_WRITE);
+
+	/* Flush the refcache */
+	xfs_refcache_purge_mp(mp);
+
+	/* Flush delalloc and delwri data */
+	VFS_SYNC(vfsp, SYNC_DELWRI|SYNC_WAIT, NULL, error);
+
+	/* Pause transaction subsystem */
+	xfs_start_freeze(mp, XFS_FREEZE_TRANS);
+
+	/* Flush any remaining inodes into buffers */
+	VFS_SYNC(vfsp, SYNC_ATTR|SYNC_WAIT, NULL, error);
+
+	/* Push all buffers out to disk */
+	xfs_binval(mp->m_ddev_targp);
+	if (mp->m_rtdev_targp) {
+		xfs_binval(mp->m_rtdev_targp);
+	}
+
+	/* Push the superblock and write an unmount record */
+	xfs_log_unmount_write(mp);
+	xfs_unmountfs_writesb(mp);
+
+	return 0;
+}
+
+
+int
+xfs_fs_thaw(
+	xfs_mount_t	*mp)
+{
+	xfs_finish_freeze(mp);
+	return 0;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_fsops.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_fsops.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_fsops.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_fsops.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_FSOPS_H__
+#define __XFS_FSOPS_H__
+
+int
+xfs_fs_geometry(
+	xfs_mount_t		*mp,
+	xfs_fsop_geom_t		*geo,
+	int			new_version);
+
+int
+xfs_growfs_data(
+	xfs_mount_t		*mp,
+	xfs_growfs_data_t	*in);
+
+int
+xfs_growfs_log(
+	xfs_mount_t		*mp,
+	xfs_growfs_log_t	*in);
+
+int
+xfs_fs_counts(
+	xfs_mount_t		*mp,
+	xfs_fsop_counts_t	*cnt);
+
+int
+xfs_reserve_blocks(
+	xfs_mount_t		*mp,
+	__uint64_t		*inval,
+	xfs_fsop_resblks_t	*outval);
+
+int
+xfs_fs_freeze(
+	xfs_mount_t		*mp);
+
+int
+xfs_fs_thaw(
+	xfs_mount_t		*mp);
+
+#endif	/* __XFS_FSOPS_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_ialloc.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_ialloc.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_ialloc.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_ialloc.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1336 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Log specified fields for the inode given by bp and off.
+ */
+STATIC void
+xfs_ialloc_log_di(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*bp,		/* inode buffer */
+	int		off,		/* index of inode in buffer */
+	int		fields)		/* bitmask of fields to log */
+{
+	int			first;		/* first byte number */
+	int			ioffset;	/* off in bytes */
+	int			last;		/* last byte number */
+	xfs_mount_t		*mp;		/* mount point structure */
+	static const short	offsets[] = {	/* field offsets */
+						/* keep in sync with bits */
+		offsetof(xfs_dinode_core_t, di_magic),
+		offsetof(xfs_dinode_core_t, di_mode),
+		offsetof(xfs_dinode_core_t, di_version),
+		offsetof(xfs_dinode_core_t, di_format),
+		offsetof(xfs_dinode_core_t, di_onlink),
+		offsetof(xfs_dinode_core_t, di_uid),
+		offsetof(xfs_dinode_core_t, di_gid),
+		offsetof(xfs_dinode_core_t, di_nlink),
+		offsetof(xfs_dinode_core_t, di_projid),
+		offsetof(xfs_dinode_core_t, di_pad),
+		offsetof(xfs_dinode_core_t, di_atime),
+		offsetof(xfs_dinode_core_t, di_mtime),
+		offsetof(xfs_dinode_core_t, di_ctime),
+		offsetof(xfs_dinode_core_t, di_size),
+		offsetof(xfs_dinode_core_t, di_nblocks),
+		offsetof(xfs_dinode_core_t, di_extsize),
+		offsetof(xfs_dinode_core_t, di_nextents),
+		offsetof(xfs_dinode_core_t, di_anextents),
+		offsetof(xfs_dinode_core_t, di_forkoff),
+		offsetof(xfs_dinode_core_t, di_aformat),
+		offsetof(xfs_dinode_core_t, di_dmevmask),
+		offsetof(xfs_dinode_core_t, di_dmstate),
+		offsetof(xfs_dinode_core_t, di_flags),
+		offsetof(xfs_dinode_core_t, di_gen),
+		offsetof(xfs_dinode_t, di_next_unlinked),
+		offsetof(xfs_dinode_t, di_u),
+		offsetof(xfs_dinode_t, di_a),
+		sizeof(xfs_dinode_t)
+	};
+
+
+	ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
+	ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
+	mp = tp->t_mountp;
+	/*
+	 * Get the inode-relative first and last bytes for these fields
+	 */
+	xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
+	/*
+	 * Convert to buffer offsets and log it.
+	 */
+	ioffset = off << mp->m_sb.sb_inodelog;
+	first += ioffset;
+	last += ioffset;
+	xfs_trans_log_buf(tp, bp, first, last);
+}
+
+/*
+ * Allocation group level functions.
+ */
+
+/*
+ * Allocate new inodes in the allocation group specified by agbp.
+ * Return 0 for success, else error code.
+ */
+STATIC int				/* error code or 0 */
+xfs_ialloc_ag_alloc(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*agbp,		/* alloc group buffer */
+	int		*alloc)
+{
+	xfs_agi_t	*agi;		/* allocation group header */
+	xfs_alloc_arg_t args;		/* allocation argument structure */
+	int		blks_per_cluster;  /* fs blocks per inode cluster */
+	xfs_btree_cur_t *cur;		/* inode btree cursor */
+	xfs_daddr_t	d;		/* disk addr of buffer */
+	int		error;
+	xfs_buf_t	*fbuf;		/* new free inodes' buffer */
+	xfs_dinode_t	*free;		/* new free inode structure */
+	int		i;		/* inode counter */
+	int		j;		/* block counter */
+	int		nbufs;		/* num bufs of new inodes */
+	xfs_agino_t	newino;		/* new first inode's number */
+	xfs_agino_t	newlen;		/* new number of inodes */
+	int		ninodes;	/* num inodes per buf */
+	xfs_agino_t	thisino;	/* current inode number, for loop */
+	int		version;	/* inode version number to use */
+	static xfs_timestamp_t ztime;	/* zero xfs timestamp */
+	int		isaligned;	/* inode allocation at stripe unit */
+					/* boundary */
+	xfs_dinode_core_t dic;		/* a dinode_core to copy to new */
+					/* inodes */
+
+	args.tp = tp;
+	args.mp = tp->t_mountp;
+
+	/*
+	 * Locking will ensure that we don't have two callers in here
+	 * at one time.
+	 */
+	newlen = XFS_IALLOC_INODES(args.mp);
+	if (args.mp->m_maxicount &&
+	    args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
+		return XFS_ERROR(ENOSPC);
+	args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
+	/*
+	 * Set the alignment for the allocation.
+	 * If stripe alignment is turned on then align at stripe unit
+	 * boundary.
+	 * If the cluster size is smaller than a filesystem block
+	 * then we're doing I/O for inodes in filesystem block size pieces,
+	 * so don't need alignment anyway.
+	 */
+	isaligned = 0;
+	if (args.mp->m_sinoalign) {
+		ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
+		args.alignment = args.mp->m_dalign;
+		isaligned = 1;
+	} else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
+	    args.mp->m_sb.sb_inoalignmt >=
+	    XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
+		args.alignment = args.mp->m_sb.sb_inoalignmt;
+	else
+		args.alignment = 1;
+	agi = XFS_BUF_TO_AGI(agbp);
+	/*
+	 * Need to figure out where to allocate the inode blocks.
+	 * Ideally they should be spaced out through the a.g.
+	 * For now, just allocate blocks up front.
+	 */
+	args.agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
+				    args.agbno);
+	/*
+	 * Allocate a fixed-size extent of inodes.
+	 */
+	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	args.mod = args.total = args.wasdel = args.isfl = args.userdata =
+		args.minalignslop = 0;
+	args.prod = 1;
+	/*
+	 * Allow space for the inode btree to split.
+	 */
+	args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
+	if ((error = xfs_alloc_vextent(&args)))
+		return error;
+
+	/*
+	 * If stripe alignment is turned on, then try again with cluster
+	 * alignment.
+	 */
+	if (isaligned && args.fsbno == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp,
+				INT_GET(agi->agi_seqno, ARCH_CONVERT), args.agbno);
+		if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
+			args.mp->m_sb.sb_inoalignmt >=
+			XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
+				args.alignment = args.mp->m_sb.sb_inoalignmt;
+		else
+			args.alignment = 1;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+	}
+
+	if (args.fsbno == NULLFSBLOCK) {
+		*alloc = 0;
+		return 0;
+	}
+	ASSERT(args.len == args.minlen);
+	/*
+	 * Convert the results.
+	 */
+	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+	/*
+	 * Loop over the new block(s), filling in the inodes.
+	 * For small block sizes, manipulate the inodes in buffers
+	 * which are multiples of the block size.
+	 */
+	if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
+		blks_per_cluster = 1;
+		nbufs = (int)args.len;
+		ninodes = args.mp->m_sb.sb_inopblock;
+	} else {
+		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
+				   args.mp->m_sb.sb_blocksize;
+		nbufs = (int)args.len / blks_per_cluster;
+		ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
+	}
+	/*
+	 * Figure out what version number to use in the inodes we create.
+	 * If the superblock version has caught up to the one that supports
+	 * the new inode format, then use the new inode version.  Otherwise
+	 * use the old version so that old kernels will continue to be
+	 * able to use the file system.
+	 */
+	if (XFS_SB_VERSION_HASNLINK(&args.mp->m_sb))
+		version = XFS_DINODE_VERSION_2;
+	else
+		version = XFS_DINODE_VERSION_1;
+	for (j = 0; j < nbufs; j++) {
+		/*
+		 * Get the block.
+		 */
+		d = XFS_AGB_TO_DADDR(args.mp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
+				     args.agbno + (j * blks_per_cluster));
+		fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
+					 args.mp->m_bsize * blks_per_cluster,
+					 XFS_BUF_LOCK);
+		ASSERT(fbuf);
+		ASSERT(!XFS_BUF_GETERROR(fbuf));
+		/*
+		 * Loop over the inodes in this buffer.
+		 */
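+		/*
+		 * A single template dinode core is built once, then
+		 * copied into each inode in the buffer and logged.
+		 */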
+		INT_SET(dic.di_magic, ARCH_CONVERT, XFS_DINODE_MAGIC);
+		INT_ZERO(dic.di_mode, ARCH_CONVERT);
+		INT_SET(dic.di_version, ARCH_CONVERT, version);
+		INT_ZERO(dic.di_format, ARCH_CONVERT);
+		INT_ZERO(dic.di_onlink, ARCH_CONVERT);
+		INT_ZERO(dic.di_uid, ARCH_CONVERT);
+		INT_ZERO(dic.di_gid, ARCH_CONVERT);
+		INT_ZERO(dic.di_nlink, ARCH_CONVERT);
+		INT_ZERO(dic.di_projid, ARCH_CONVERT);
+		bzero(&(dic.di_pad[0]), sizeof(dic.di_pad));
+		INT_SET(dic.di_atime.t_sec, ARCH_CONVERT, ztime.t_sec);
+		INT_SET(dic.di_atime.t_nsec, ARCH_CONVERT, ztime.t_nsec);
+
+		INT_SET(dic.di_mtime.t_sec, ARCH_CONVERT, ztime.t_sec);
+		INT_SET(dic.di_mtime.t_nsec, ARCH_CONVERT, ztime.t_nsec);
+
+		INT_SET(dic.di_ctime.t_sec, ARCH_CONVERT, ztime.t_sec);
+		INT_SET(dic.di_ctime.t_nsec, ARCH_CONVERT, ztime.t_nsec);
+
+		INT_ZERO(dic.di_size, ARCH_CONVERT);
+		INT_ZERO(dic.di_nblocks, ARCH_CONVERT);
+		INT_ZERO(dic.di_extsize, ARCH_CONVERT);
+		INT_ZERO(dic.di_nextents, ARCH_CONVERT);
+		INT_ZERO(dic.di_anextents, ARCH_CONVERT);
+		INT_ZERO(dic.di_forkoff, ARCH_CONVERT);
+		INT_ZERO(dic.di_aformat, ARCH_CONVERT);
+		INT_ZERO(dic.di_dmevmask, ARCH_CONVERT);
+		INT_ZERO(dic.di_dmstate, ARCH_CONVERT);
+		INT_ZERO(dic.di_flags, ARCH_CONVERT);
+		INT_ZERO(dic.di_gen, ARCH_CONVERT);
+
+		for (i = 0; i < ninodes; i++) {
+			free = XFS_MAKE_IPTR(args.mp, fbuf, i);
+			bcopy(&dic, &(free->di_core), sizeof(xfs_dinode_core_t));
+			INT_SET(free->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
+			xfs_ialloc_log_di(tp, fbuf, i,
+				XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
+		}
+		xfs_trans_inode_alloc_buf(tp, fbuf);
+	}
+	INT_MOD(agi->agi_count, ARCH_CONVERT, newlen);
+	INT_MOD(agi->agi_freecount, ARCH_CONVERT, newlen);
+	down_read(&args.mp->m_peraglock);
+	args.mp->m_perag[INT_GET(agi->agi_seqno, ARCH_CONVERT)].pagi_freecount += newlen;
+	up_read(&args.mp->m_peraglock);
+	INT_SET(agi->agi_newino, ARCH_CONVERT, newino);
+	/*
+	 * Insert records describing the new inode chunk into the btree.
+	 */
+	cur = xfs_btree_init_cursor(args.mp, tp, agbp,
+			INT_GET(agi->agi_seqno, ARCH_CONVERT),
+			XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+	for (thisino = newino;
+	     thisino < newino + newlen;
+	     thisino += XFS_INODES_PER_CHUNK) {
+		if ((error = xfs_inobt_lookup_eq(cur, thisino,
+				XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+			return error;
+		}
+		ASSERT(i == 0);
+		if ((error = xfs_inobt_insert(cur, &i))) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+			return error;
+		}
+		ASSERT(i == 1);
+	}
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	/*
+	 * Log allocation group header fields
+	 */
+	xfs_ialloc_log_agi(tp, agbp,
+		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
+	/*
+	 * Modify/log superblock values for inode count and inode free count.
+	 */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
+	*alloc = 1;
+	return 0;
+}
+
+/*
+ * Select an allocation group to look for a free inode in, based on the parent
+ * inode and the mode.  Return the allocation group buffer.
+ */
+STATIC xfs_buf_t *			/* allocation group buffer */
+xfs_ialloc_ag_select(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	parent,		/* parent directory inode number */
+	mode_t		mode,		/* bits set to indicate file type */
+	int		okalloc)	/* ok to allocate more space */
+{
+	xfs_buf_t	*agbp;		/* allocation group header buffer */
+	xfs_agnumber_t	agcount;	/* number of ag's in the filesystem */
+	xfs_agnumber_t	agno;		/* current ag number */
+	int		flags;		/* alloc buffer locking flags */
+	xfs_extlen_t	ineed;		/* blocks needed for inode allocation */
+	xfs_extlen_t	longest = 0;	/* longest extent available */
+	xfs_mount_t	*mp;		/* mount point structure */
+	int		needspace;	/* file mode implies space allocated */
+	xfs_perag_t	*pag;		/* per allocation group data */
+	xfs_agnumber_t	pagno;		/* parent (starting) ag number */
+
+	/*
+	 * Files of these types need at least one block if length > 0
+	 * (and they won't fit in the inode, but that's hard to figure out).
+	 */
+	needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
+	mp = tp->t_mountp;
+	agcount = mp->m_maxagi;
+	if (S_ISDIR(mode))
+		pagno = atomicIncWithWrap((int *)&mp->m_agirotor, agcount);
+	else {
+		pagno = XFS_INO_TO_AGNO(mp, parent);
+		if (pagno >= agcount)
+			pagno = 0;
+	}
+	ASSERT(pagno < agcount);
+	/*
+	 * Loop through allocation groups, looking for one with a little
+	 * free space in it.  Note we don't look for free inodes, exactly;
+	 * if there are no free inodes, we also require enough free blocks
+	 * in the AG to allocate a fresh chunk of them.
+	 */
+	agno = pagno;
+	flags = XFS_ALLOC_FLAG_TRYLOCK;
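+	/*
+	 * The first pass around the AGs uses trylock semantics; if that
+	 * fails everywhere, a second pass (flags cleared below) retries
+	 * each AG with blocking locks.
+	 */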
+	down_read(&mp->m_peraglock);
+	for (;;) {
+		pag = &mp->m_perag[agno];
+		if (!pag->pagi_init) {
+			if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
+				agbp = NULL;
+				goto nextag;
+			}
+		} else
+			agbp = NULL;
+
+		if (!pag->pagi_inodeok) {
+			atomicIncWithWrap((int *)&mp->m_agirotor, agcount);
+			goto unlock_nextag;
+		}
+
+		/*
+		 * Is there enough free space for the file plus a block
+		 * of inodes (if we need to allocate some)?
+		 */
+		ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp);
+		if (ineed && !pag->pagf_init) {
+			if (agbp == NULL &&
+			    xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
+				agbp = NULL;
+				goto nextag;
+			}
+			(void)xfs_alloc_pagf_init(mp, tp, agno, flags);
+		}
+		if (!ineed || pag->pagf_init) {
+			if (ineed && !(longest = pag->pagf_longest))
+				longest = pag->pagf_flcount > 0;
+			if (!ineed ||
+			    (pag->pagf_freeblks >= needspace + ineed &&
+			     longest >= ineed &&
+			     okalloc)) {
+				if (agbp == NULL &&
+				    xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
+					agbp = NULL;
+					goto nextag;
+				}
+				up_read(&mp->m_peraglock);
+				return agbp;
+			}
+		}
+unlock_nextag:
+		if (agbp)
+			xfs_trans_brelse(tp, agbp);
+nextag:
+		/*
+		 * No point in iterating over the rest, if we're shutting
+		 * down.
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			up_read(&mp->m_peraglock);
+			return (xfs_buf_t *)0;
+		}
+		agno++;
+		if (agno >= agcount)
+			agno = 0;
+		if (agno == pagno) {
+			if (flags == 0) {
+				up_read(&mp->m_peraglock);
+				return (xfs_buf_t *)0;
+			}
+			flags = 0;
+		}
+	}
+}
+
+/*
+ * Visible inode allocation functions.
+ */
+
+/*
+ * Allocate an inode on disk.
+ * Mode is used to tell whether the new inode will need space, and whether
+ * it is a directory.
+ *
+ * The arguments IO_agbp and alloc_done are defined to work within
+ * the constraint of one allocation per transaction.
+ * xfs_dialloc() is designed to be called twice if it has to do an
+ * allocation to make more free inodes.	 On the first call,
+ * IO_agbp should be set to NULL. If an inode is available,
+ * i.e., xfs_dialloc() did not need to do an allocation, an inode
+ * number is returned.	In this case, IO_agbp would be set to the
+ * current ag_buf and alloc_done set to false.
+ * If an allocation needed to be done, xfs_dialloc would return
+ * the current ag_buf in IO_agbp and set alloc_done to true.
+ * The caller should then commit the current transaction, allocate a new
+ * transaction, and call xfs_dialloc() again, passing in the previous
+ * value of IO_agbp.  IO_agbp should be held across the transactions.
+ * Since the agbp is locked across the two calls, the second call is
+ * guaranteed to have a free inode available.
+ *
+ * Once we successfully pick an inode its number is returned and the
+ * on-disk data structures are updated.	 The inode itself is not read
+ * in, since doing so would break ordering constraints with xfs_reclaim.
+ */
+int
+xfs_dialloc(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	parent,		/* parent inode (directory) */
+	mode_t		mode,		/* mode bits for new inode */
+	int		okalloc,	/* ok to allocate more space */
+	xfs_buf_t	**IO_agbp,	/* in/out ag header's buffer */
+	boolean_t	*alloc_done,	/* true if we needed to replenish
+					   inode freelist */
+	xfs_ino_t	*inop)		/* inode number allocated */
+{
+	xfs_agnumber_t	agcount;	/* number of allocation groups */
+	xfs_buf_t	*agbp;		/* allocation group header's buffer */
+	xfs_agnumber_t	agno;		/* allocation group number */
+	xfs_agi_t	*agi;		/* allocation group header structure */
+	xfs_btree_cur_t *cur;		/* inode allocation btree cursor */
+	int		error;		/* error return value */
+	int		i;		/* result code */
+	int		ialloced;	/* inode allocation status */
+	int		noroom = 0;	/* no space for inode blk allocation */
+	xfs_ino_t	ino;		/* fs-relative inode to be returned */
+	/* REFERENCED */
+	int		j;		/* result code */
+	xfs_mount_t	*mp;		/* file system mount structure */
+	int		offset;		/* index of inode in chunk */
+	xfs_agino_t	pagino;		/* parent's a.g. relative inode # */
+	xfs_agnumber_t	pagno;		/* parent's allocation group number */
+	xfs_inobt_rec_t rec;		/* inode allocation record */
+	xfs_agnumber_t	tagno;		/* testing allocation group number */
+	xfs_btree_cur_t *tcur;		/* temp cursor */
+	xfs_inobt_rec_t trec;		/* temp inode allocation record */
+
+
+	if (*IO_agbp == NULL) {
+		/*
+		 * We do not have an agbp, so select an initial allocation
+		 * group for inode allocation.
+		 */
+		agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+		/*
+		 * Couldn't find an allocation group satisfying the
+		 * criteria, give up.
+		 */
+		if (!agbp) {
+			*inop = NULLFSINO;
+			return 0;
+		}
+		agi = XFS_BUF_TO_AGI(agbp);
+		ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+	} else {
+		/*
+		 * Continue where we left off before.  In this case, we
+		 * know that the allocation group has free inodes.
+		 */
+		agbp = *IO_agbp;
+		agi = XFS_BUF_TO_AGI(agbp);
+		ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+		ASSERT(INT_GET(agi->agi_freecount, ARCH_CONVERT) > 0);
+	}
+	mp = tp->t_mountp;
+	agcount = mp->m_sb.sb_agcount;
+	agno = INT_GET(agi->agi_seqno, ARCH_CONVERT);
+	tagno = agno;
+	pagno = XFS_INO_TO_AGNO(mp, parent);
+	pagino = XFS_INO_TO_AGINO(mp, parent);
+
+	/*
+	 * If we have already hit the ceiling of inode blocks then clear
+	 * okalloc so we scan all available agi structures for a free
+	 * inode.
+	 */
+
+	if (mp->m_maxicount &&
+	    mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
+		noroom = 1;
+		okalloc = 0;
+	}
+
+	/*
+	 * Loop until we find an allocation group that either has free inodes
+	 * or in which we can allocate some inodes.  Iterate through the
+	 * allocation groups upward, wrapping at the end.
+	 */
+	*alloc_done = B_FALSE;
+	while (INT_ISZERO(agi->agi_freecount, ARCH_CONVERT)) {
+		/*
+		 * Don't do anything if we're not supposed to allocate
+		 * any blocks, just go on to the next ag.
+		 */
+		if (okalloc) {
+			/*
+			 * Try to allocate some new inodes in the allocation
+			 * group.
+			 */
+			if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) {
+				xfs_trans_brelse(tp, agbp);
+				if (error == ENOSPC) {
+					*inop = NULLFSINO;
+					return 0;
+				} else
+					return error;
+			}
+			if (ialloced) {
+				/*
+				 * We successfully allocated some inodes, return
+				 * the current context to the caller so that it
+				 * can commit the current transaction and call
+				 * us again where we left off.
+				 */
+				ASSERT(INT_GET(agi->agi_freecount, ARCH_CONVERT) > 0);
+				*alloc_done = B_TRUE;
+				*IO_agbp = agbp;
+				*inop = NULLFSINO;
+				return 0;
+			}
+		}
+		/*
+		 * If it failed, give up on this ag.
+		 */
+		xfs_trans_brelse(tp, agbp);
+		/*
+		 * Go on to the next ag: get its ag header.
+		 */
+nextag:
+		if (++tagno == agcount)
+			tagno = 0;
+		if (tagno == agno) {
+			*inop = NULLFSINO;
+			return noroom ? ENOSPC : 0;
+		}
+		down_read(&mp->m_peraglock);
+		if (mp->m_perag[tagno].pagi_inodeok == 0) {
+			up_read(&mp->m_peraglock);
+			goto nextag;
+		}
+		error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
+		up_read(&mp->m_peraglock);
+		if (error)
+			goto nextag;
+		agi = XFS_BUF_TO_AGI(agbp);
+		ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+	}
+	/*
+	 * Here with an allocation group that has a free inode.
+	 * Reset agno since we may have chosen a new ag in the
+	 * loop above.
+	 */
+	agno = tagno;
+	*IO_agbp = NULL;
+	cur = xfs_btree_init_cursor(mp, tp, agbp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
+				    XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+	/*
+	 * If pagino is 0 (this is the root inode allocation) use newino.
+	 * This must work because we've just allocated some.
+	 */
+	if (!pagino)
+		pagino = INT_GET(agi->agi_newino, ARCH_CONVERT);
+#ifdef DEBUG
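+	/*
+	 * Sanity check on a single-level btree: sum the free counts of
+	 * every record and verify the total matches the AGI header (the
+	 * two may legitimately diverge during a forced shutdown).
+	 */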
+	if (cur->bc_nlevels == 1) {
+		int	freecount = 0;
+
+		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		do {
+			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+					&rec.ir_freecount, &rec.ir_free, &i, ARCH_NOCONVERT)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			freecount += rec.ir_freecount;
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				goto error0;
+		} while (i == 1);
+
+		ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
+		       XFS_FORCED_SHUTDOWN(mp));
+	}
+#endif
+	/*
+	 * If in the same a.g. as the parent, try to get near the parent.
+	 */
+	if (pagno == agno) {
+		if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
+			goto error0;
+		if (i != 0 &&
+		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+			    &rec.ir_freecount, &rec.ir_free, &j, ARCH_NOCONVERT)) == 0 &&
+		    j == 1 &&
+		    rec.ir_freecount > 0) {
+			/*
+			 * Found a free inode in the same chunk
+			 * as parent, done.
+			 */
+		}
+		/*
+		 * In the same a.g. as parent, but parent's chunk is full.
+		 */
+		else {
+			int	doneleft;	/* done, to the left */
+			int	doneright;	/* done, to the right */
+
+			if (error)
+				goto error0;
+			ASSERT(i == 1);
+			ASSERT(j == 1);
+			/*
+			 * Duplicate the cursor, search left & right
+			 * simultaneously.
+			 */
+			if ((error = xfs_btree_dup_cursor(cur, &tcur)))
+				goto error0;
+			/*
+			 * Search left with tcur, back up 1 record.
+			 */
+			if ((error = xfs_inobt_decrement(tcur, 0, &i)))
+				goto error1;
+			doneleft = !i;
+			if (!doneleft) {
+				if ((error = xfs_inobt_get_rec(tcur,
+						&trec.ir_startino,
+						&trec.ir_freecount,
+						&trec.ir_free, &i, ARCH_NOCONVERT)))
+					goto error1;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
+			}
+			/*
+			 * Search right with cur, go forward 1 record.
+			 */
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				goto error1;
+			doneright = !i;
+			if (!doneright) {
+				if ((error = xfs_inobt_get_rec(cur,
+						&rec.ir_startino,
+						&rec.ir_freecount,
+						&rec.ir_free, &i, ARCH_NOCONVERT)))
+					goto error1;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
+			}
+			/*
+			 * Loop until we find the closest inode chunk
+			 * with a free one.
+			 */
+			while (!doneleft || !doneright) {
+				int	useleft;  /* using left inode
+						     chunk this time */
+
+				/*
+				 * Figure out which block is closer,
+				 * if both are valid.
+				 */
+				if (!doneleft && !doneright)
+					useleft =
+						pagino -
+						(trec.ir_startino +
+						 XFS_INODES_PER_CHUNK - 1) <
+						 rec.ir_startino - pagino;
+				else
+					useleft = !doneleft;
+				/*
+				 * If checking the left, does it have
+				 * free inodes?
+				 */
+				if (useleft && trec.ir_freecount) {
+					/*
+					 * Yes, set it up as the chunk to use.
+					 */
+					rec = trec;
+					xfs_btree_del_cursor(cur,
+						XFS_BTREE_NOERROR);
+					cur = tcur;
+					break;
+				}
+				/*
+				 * If checking the right, does it have
+				 * free inodes?
+				 */
+				if (!useleft && rec.ir_freecount) {
+					/*
+					 * Yes, it's already set up.
+					 */
+					xfs_btree_del_cursor(tcur,
+						XFS_BTREE_NOERROR);
+					break;
+				}
+				/*
+				 * If used the left, get another one
+				 * further left.
+				 */
+				if (useleft) {
+					if ((error = xfs_inobt_decrement(tcur, 0,
+							&i)))
+						goto error1;
+					doneleft = !i;
+					if (!doneleft) {
+						if ((error = xfs_inobt_get_rec(
+							    tcur,
+							    &trec.ir_startino,
+							    &trec.ir_freecount,
+							    &trec.ir_free, &i, ARCH_NOCONVERT)))
+							goto error1;
+						XFS_WANT_CORRUPTED_GOTO(i == 1,
+							error1);
+					}
+				}
+				/*
+				 * If used the right, get another one
+				 * further right.
+				 */
+				else {
+					if ((error = xfs_inobt_increment(cur, 0,
+							&i)))
+						goto error1;
+					doneright = !i;
+					if (!doneright) {
+						if ((error = xfs_inobt_get_rec(
+							    cur,
+							    &rec.ir_startino,
+							    &rec.ir_freecount,
+							    &rec.ir_free, &i, ARCH_NOCONVERT)))
+							goto error1;
+						XFS_WANT_CORRUPTED_GOTO(i == 1,
+							error1);
+					}
+				}
+			}
+			ASSERT(!doneleft || !doneright);
+		}
+	}
+	/*
+	 * In a different a.g. from the parent.
+	 * See if the most recently allocated block has any free.
+	 */
+	else if (INT_GET(agi->agi_newino, ARCH_CONVERT) != NULLAGINO) {
+		if ((error = xfs_inobt_lookup_eq(cur,
+				INT_GET(agi->agi_newino, ARCH_CONVERT), 0, 0, &i)))
+			goto error0;
+		if (i == 1 &&
+		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+			    &rec.ir_freecount, &rec.ir_free, &j, ARCH_NOCONVERT)) == 0 &&
+		    j == 1 &&
+		    rec.ir_freecount > 0) {
+			/*
+			 * The last chunk allocated in the group still has
+			 * a free inode.
+			 */
+		}
+		/*
+		 * None left in the last group, search the whole a.g.
+		 */
+		else {
+			if (error)
+				goto error0;
+			if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+				goto error0;
+			ASSERT(i == 1);
+			for (;;) {
+				if ((error = xfs_inobt_get_rec(cur,
+						&rec.ir_startino,
+						&rec.ir_freecount, &rec.ir_free,
+						&i, ARCH_NOCONVERT)))
+					goto error0;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+				if (rec.ir_freecount > 0)
+					break;
+				if ((error = xfs_inobt_increment(cur, 0, &i)))
+					goto error0;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			}
+		}
+	}
+	offset = XFS_IALLOC_FIND_FREE(&rec.ir_free);
+	ASSERT(offset >= 0);
+	ASSERT(offset < XFS_INODES_PER_CHUNK);
+	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+	XFS_INOBT_CLR_FREE(&rec, offset, ARCH_NOCONVERT);
+	rec.ir_freecount--;
+	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
+			rec.ir_free)))
+		goto error0;
+	INT_MOD(agi->agi_freecount, ARCH_CONVERT, -1);
+	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+	down_read(&mp->m_peraglock);
+	mp->m_perag[tagno].pagi_freecount--;
+	up_read(&mp->m_peraglock);
+#ifdef DEBUG
+	if (cur->bc_nlevels == 1) {
+		int	freecount = 0;
+
+		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+			goto error0;
+		do {
+			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+					&rec.ir_freecount, &rec.ir_free, &i, ARCH_NOCONVERT)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			freecount += rec.ir_freecount;
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				goto error0;
+		} while (i == 1);
+		ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
+		       XFS_FORCED_SHUTDOWN(mp));
+	}
+#endif
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+	*inop = ino;
+	return 0;
+error1:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
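+/*
+ * Sketch of the two-call protocol described above; transaction setup,
+ * reservation and error handling are elided:
+ *
+ *	agbp = NULL;
+ *	error = xfs_dialloc(tp, parent, mode, okalloc, &agbp,
+ *			    &alloc_done, &ino);
+ *	if (!error && alloc_done) {
+ *		... commit tp, start a fresh transaction ntp ...
+ *		error = xfs_dialloc(ntp, parent, mode, okalloc, &agbp,
+ *				    &alloc_done, &ino);
+ *	}
+ */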
+
+/*
+ * Free disk inode.  Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int
+xfs_difree(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	inode)		/* inode to be freed */
+{
+	/* REFERENCED */
+	xfs_agblock_t	agbno;	/* block number containing inode */
+	xfs_buf_t	*agbp;	/* buffer containing allocation group header */
+	xfs_agino_t	agino;	/* inode number relative to allocation group */
+	xfs_agnumber_t	agno;	/* allocation group number */
+	xfs_agi_t	*agi;	/* allocation group header */
+	xfs_btree_cur_t *cur;	/* inode btree cursor */
+	int		error;	/* error return value */
+	int		i;	/* result code */
+	xfs_mount_t	*mp;	/* mount structure for filesystem */
+	int		off;	/* offset of inode in inode chunk */
+	xfs_inobt_rec_t rec;	/* btree record */
+
+	mp = tp->t_mountp;
+
+	/*
+	 * Break up inode number into its components.
+	 */
+	agno = XFS_INO_TO_AGNO(mp, inode);
+	if (agno >= mp->m_sb.sb_agcount)  {
+		cmn_err(CE_WARN,
+			"xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s.  Returning EINVAL.",
+			agno, mp->m_sb.sb_agcount, mp->m_fsname);
+		ASSERT(0);
+		return XFS_ERROR(EINVAL);
+	}
+	agino = XFS_INO_TO_AGINO(mp, inode);
+	if (inode != XFS_AGINO_TO_INO(mp, agno, agino))	 {
+		cmn_err(CE_WARN,
+			"xfs_difree: inode != XFS_AGINO_TO_INO() (%d != %d) on %s.  Returning EINVAL.",
+			inode, XFS_AGINO_TO_INO(mp, agno, agino), mp->m_fsname);
+		ASSERT(0);
+		return XFS_ERROR(EINVAL);
+	}
+	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	if (agbno >= mp->m_sb.sb_agblocks)  {
+		cmn_err(CE_WARN,
+			"xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s.  Returning EINVAL.",
+			agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
+		ASSERT(0);
+		return XFS_ERROR(EINVAL);
+	}
+	/*
+	 * Get the allocation group header.
+	 */
+	down_read(&mp->m_peraglock);
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	up_read(&mp->m_peraglock);
+	if (error) {
+		cmn_err(CE_WARN,
+			"xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s.	Returning error.",
+			error, mp->m_fsname);
+		return error;
+	}
+	agi = XFS_BUF_TO_AGI(agbp);
+	ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+	ASSERT(agbno < INT_GET(agi->agi_length, ARCH_CONVERT));
+	/*
+	 * Initialize the cursor.
+	 */
+	cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
+		(xfs_inode_t *)0, 0);
+#ifdef DEBUG
+	if (cur->bc_nlevels == 1) {
+		int freecount = 0;
+
+		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+			goto error0;
+		do {
+			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+					&rec.ir_freecount, &rec.ir_free, &i, ARCH_NOCONVERT)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			freecount += rec.ir_freecount;
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				goto error0;
+		} while (i == 1);
+		ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
+		       XFS_FORCED_SHUTDOWN(mp));
+	}
+#endif
+	/*
+	 * Look for the entry describing this inode.
+	 */
+	if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
+		cmn_err(CE_WARN,
+			"xfs_difree: xfs_inobt_lookup_le returned()  an error %d on %s.	 Returning error.",
+			error, mp->m_fsname);
+		goto error0;
+	}
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
+			&rec.ir_free, &i, ARCH_NOCONVERT))) {
+		cmn_err(CE_WARN,
+			"xfs_difree: xfs_inobt_get_rec()  returned an error %d on %s.  Returning error.",
+			error, mp->m_fsname);
+		goto error0;
+	}
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	/*
+	 * Get the offset in the inode chunk.
+	 */
+	off = agino - rec.ir_startino;
+	ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
+	ASSERT(!XFS_INOBT_IS_FREE(&rec, off, ARCH_NOCONVERT));
+	/*
+	 * Mark the inode free & increment the count.
+	 */
+	XFS_INOBT_SET_FREE(&rec, off, ARCH_NOCONVERT);
+	rec.ir_freecount++;
+	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
+		cmn_err(CE_WARN,
+			"xfs_difree: xfs_inobt_update()	 returned an error %d on %s.  Returning error.",
+			error, mp->m_fsname);
+		goto error0;
+	}
+	/*
+	 * Change the inode free counts and log the ag/sb changes.
+	 */
+	INT_MOD(agi->agi_freecount, ARCH_CONVERT, 1);
+	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+	down_read(&mp->m_peraglock);
+	mp->m_perag[agno].pagi_freecount++;
+	up_read(&mp->m_peraglock);
+#ifdef DEBUG
+	if (cur->bc_nlevels == 1) {
+		int freecount = 0;
+
+		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+			goto error0;
+		do {
+			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+					&rec.ir_freecount, &rec.ir_free, &i, ARCH_NOCONVERT)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			freecount += rec.ir_freecount;
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				goto error0;
+		} while (i == 1);
+		ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
+		       XFS_FORCED_SHUTDOWN(mp));
+	}
+#endif
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Return the location of the inode in bno/off, for mapping it into a buffer.
+ */
+/*ARGSUSED*/
+int
+xfs_dilocate(
+	xfs_mount_t	*mp,	/* file system mount structure */
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_ino_t	ino,	/* inode to locate */
+	xfs_fsblock_t	*bno,	/* output: block containing inode */
+	int		*len,	/* output: num blocks in inode cluster */
+	int		*off,	/* output: index in block of inode */
+	uint		flags)	/* flags concerning inode lookup */
+{
+	xfs_agblock_t	agbno;	/* block number of inode in the alloc group */
+	xfs_buf_t	*agbp;	/* agi buffer */
+	xfs_agino_t	agino;	/* inode number within alloc group */
+	xfs_agnumber_t	agno;	/* allocation group number */
+	int		blks_per_cluster; /* num blocks per inode cluster */
+	xfs_agblock_t	chunk_agbno;	/* first block in inode chunk */
+	xfs_agino_t	chunk_agino;	/* first agino in inode chunk */
+	__int32_t	chunk_cnt;	/* count of free inodes in chunk */
+	xfs_inofree_t	chunk_free;	/* mask of free inodes in chunk */
+	xfs_agblock_t	cluster_agbno;	/* first block in inode cluster */
+	xfs_btree_cur_t *cur;	/* inode btree cursor */
+	int		error;	/* error code */
+	int		i;	/* temp state */
+	int		offset; /* index of inode in its buffer */
+	int		offset_agbno;	/* blks from chunk start to inode */
+
+	ASSERT(ino != NULLFSINO);
+	/*
+	 * Split up the inode number into its parts.
+	 */
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agino = XFS_INO_TO_AGINO(mp, ino);
+	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
+	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+#ifdef DEBUG
+		if (agno >= mp->m_sb.sb_agcount) {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+					"xfs_dilocate: agno (%d) >= "
+					"mp->m_sb.sb_agcount (%d)",
+					agno,  mp->m_sb.sb_agcount);
+		}
+		if (agbno >= mp->m_sb.sb_agblocks) {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+					"xfs_dilocate: agbno (0x%llx) >= "
+					"mp->m_sb.sb_agblocks (0x%lx)",
+					(unsigned long long) agbno,
+					(unsigned long) mp->m_sb.sb_agblocks);
+		}
+		if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+					"xfs_dilocate: ino (0x%llx) != "
+					"XFS_AGINO_TO_INO(mp, agno, agino) "
+					"(0x%llx)",
+					ino, XFS_AGINO_TO_INO(mp, agno, agino));
+		}
+#endif /* DEBUG */
+		return XFS_ERROR(EINVAL);
+	}
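+	/*
+	 * Fast paths: if inode clusters are no larger than one filesystem
+	 * block, or the caller didn't ask for a cluster lookup, the
+	 * inode's own block is the answer and no btree search is needed.
+	 */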
+	if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) ||
+	    !(flags & XFS_IMAP_LOOKUP)) {
+		offset = XFS_INO_TO_OFFSET(mp, ino);
+		ASSERT(offset < mp->m_sb.sb_inopblock);
+		*bno = XFS_AGB_TO_FSB(mp, agno, agbno);
+		*off = offset;
+		*len = 1;
+		return 0;
+	}
+	blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
+	if (*bno != NULLFSBLOCK) {
+		offset = XFS_INO_TO_OFFSET(mp, ino);
+		ASSERT(offset < mp->m_sb.sb_inopblock);
+		cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno);
+		*off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+			offset;
+		*len = blks_per_cluster;
+		return 0;
+	}
+	if (mp->m_inoalign_mask) {
+		offset_agbno = agbno & mp->m_inoalign_mask;
+		chunk_agbno = agbno - offset_agbno;
+	} else {
+		down_read(&mp->m_peraglock);
+		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+		up_read(&mp->m_peraglock);
+		if (error) {
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+					"xfs_ialloc_read_agi() returned "
+					"error %d, agno %d",
+					error, agno);
+#endif /* DEBUG */
+			return error;
+		}
+		cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
+			(xfs_inode_t *)0, 0);
+		if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+					"xfs_inobt_lookup_le() failed");
+#endif /* DEBUG */
+			goto error0;
+		}
+		if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
+				&chunk_free, &i, ARCH_NOCONVERT))) {
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+					"xfs_inobt_get_rec() failed");
+#endif /* DEBUG */
+			goto error0;
+		}
+		if (i == 0) {
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+					"xfs_inobt_get_rec() failed");
+#endif /* DEBUG */
+			error = XFS_ERROR(EINVAL);
+		}
+		xfs_trans_brelse(tp, agbp);
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		if (error)
+			return error;
+		chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
+		offset_agbno = agbno - chunk_agbno;
+	}
+	ASSERT(agbno >= chunk_agbno);
+	cluster_agbno = chunk_agbno +
+		((offset_agbno / blks_per_cluster) * blks_per_cluster);
+	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+		XFS_INO_TO_OFFSET(mp, ino);
+	*bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno);
+	*off = offset;
+	*len = blks_per_cluster;
+	return 0;
+error0:
+	xfs_trans_brelse(tp, agbp);
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+	xfs_mount_t	*mp)		/* file system mount structure */
+{
+	int		level;
+	uint		maxblocks;
+	uint		maxleafents;
+	int		minleafrecs;
+	int		minnoderecs;
+
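+	/*
+	 * One inobt record covers one chunk of inodes, so the worst-case
+	 * number of leaf entries is the AG inode space divided by the
+	 * chunk size; dividing repeatedly by the minimum records per
+	 * block bounds the tree height.  With (hypothetical) numbers of
+	 * 2^26 leaf entries and at least 125 records per block, maxblocks
+	 * goes 536871 -> 4295 -> 35 -> 1, for 4 levels.
+	 */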
+	maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
+		XFS_INODES_PER_CHUNK_LOG;
+	minleafrecs = mp->m_alloc_mnr[0];
+	minnoderecs = mp->m_alloc_mnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	mp->m_in_maxlevels = level;
+}
+
+/*
+ * Log specified fields for the ag hdr (inode section)
+ */
+void
+xfs_ialloc_log_agi(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*bp,		/* allocation group header buffer */
+	int		fields)		/* bitmask of fields to log */
+{
+	int			first;		/* first byte number */
+	int			last;		/* last byte number */
+	static const short	offsets[] = {	/* field starting offsets */
+					/* keep in sync with bit definitions */
+		offsetof(xfs_agi_t, agi_magicnum),
+		offsetof(xfs_agi_t, agi_versionnum),
+		offsetof(xfs_agi_t, agi_seqno),
+		offsetof(xfs_agi_t, agi_length),
+		offsetof(xfs_agi_t, agi_count),
+		offsetof(xfs_agi_t, agi_root),
+		offsetof(xfs_agi_t, agi_level),
+		offsetof(xfs_agi_t, agi_freecount),
+		offsetof(xfs_agi_t, agi_newino),
+		offsetof(xfs_agi_t, agi_dirino),
+		offsetof(xfs_agi_t, agi_unlinked),
+		sizeof(xfs_agi_t)
+	};
+#ifdef DEBUG
+	xfs_agi_t		*agi;	/* allocation group header */
+
+	agi = XFS_BUF_TO_AGI(bp);
+	ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) ==
+		XFS_AGI_MAGIC);
+#endif
+	/*
+	 * Compute byte offsets for the first and last fields.
+	 */
+	xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
+	/*
+	 * Log the allocation group inode header buffer.
+	 */
+	xfs_trans_log_buf(tp, bp, first, last);
+}
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int
+xfs_ialloc_read_agi(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_buf_t	**bpp)		/* allocation group hdr buf */
+{
+	xfs_agi_t	*agi;		/* allocation group header */
+	int		agi_ok;		/* agi is consistent */
+	xfs_buf_t	*bp;		/* allocation group hdr buf */
+	xfs_daddr_t	d;		/* disk block address */
+	int		error;
+#ifdef DEBUG
+	int		i;
+#endif
+	xfs_perag_t	*pag;		/* per allocation group data */
+
+
+	ASSERT(agno != NULLAGNUMBER);
+	d = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 1, 0, &bp)))
+		return error;
+	ASSERT(bp && !XFS_BUF_GETERROR(bp));
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	agi = XFS_BUF_TO_AGI(bp);
+	agi_ok =
+		INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
+		XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
+	if (XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+			XFS_RANDOM_IALLOC_READ_AGI)) {
+		xfs_trans_brelse(tp, bp);
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	pag = &mp->m_perag[agno];
+	if (!pag->pagi_init) {
+		pag->pagi_freecount = INT_GET(agi->agi_freecount, ARCH_CONVERT);
+		pag->pagi_init = 1;
+	} else {
+		/*
+		 * It's possible for these to be out of sync if
+		 * we are in the middle of a forced shutdown.
+		 */
+		ASSERT(pag->pagi_freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT)
+			|| XFS_FORCED_SHUTDOWN(mp));
+	}
+#ifdef DEBUG
+	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+		ASSERT(!INT_ISZERO(agi->agi_unlinked[i], ARCH_CONVERT));
+#endif
+	XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
+	*bpp = bp;
+	return 0;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_ialloc.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_ialloc.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_ialloc.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_ialloc.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_IALLOC_H__
+#define __XFS_IALLOC_H__
+
+struct xfs_buf;
+struct xfs_dinode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Allocation parameters for inode allocation.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_INODES)
+int xfs_ialloc_inodes(struct xfs_mount *mp);
+#define XFS_IALLOC_INODES(mp)	xfs_ialloc_inodes(mp)
+#else
+#define XFS_IALLOC_INODES(mp)	((mp)->m_ialloc_inos)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_BLOCKS)
+xfs_extlen_t xfs_ialloc_blocks(struct xfs_mount *mp);
+#define XFS_IALLOC_BLOCKS(mp)	xfs_ialloc_blocks(mp)
+#else
+#define XFS_IALLOC_BLOCKS(mp)	((mp)->m_ialloc_blks)
+#endif
+
+/*
+ * For small block file systems, move inodes in clusters of this size.
+ * When we don't have a lot of memory, however, we go a bit smaller
+ * to reduce the number of AGI and ialloc btree blocks we need to keep
+ * around for xfs_dilocate().  We choose which one to use in
+ * xfs_mount_int().
+ */
+#define XFS_INODE_BIG_CLUSTER_SIZE	8192
+#define XFS_INODE_SMALL_CLUSTER_SIZE	4096
+#define XFS_INODE_CLUSTER_SIZE(mp)	(mp)->m_inode_cluster_size
+
+/*
+ * Make an inode pointer out of the buffer/offset.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MAKE_IPTR)
+struct xfs_dinode *xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o);
+#define XFS_MAKE_IPTR(mp,b,o)		xfs_make_iptr(mp,b,o)
+#else
+#define XFS_MAKE_IPTR(mp,b,o) \
+	((xfs_dinode_t *)(xfs_buf_offset(b, (o) << (mp)->m_sb.sb_inodelog)))
+#endif
+
+/*
+ * Find a free (set) bit in the inode bitmask.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_FIND_FREE)
+int xfs_ialloc_find_free(xfs_inofree_t *fp);
+#define XFS_IALLOC_FIND_FREE(fp)	xfs_ialloc_find_free(fp)
+#else
+#define XFS_IALLOC_FIND_FREE(fp)	xfs_lowbit64(*(fp))
+#endif
+
+
+#ifdef __KERNEL__
+
+/*
+ * Prototypes for visible xfs_ialloc.c routines.
+ */
+
+/*
+ * Allocate an inode on disk.
+ * Mode is used to tell whether the new inode will need space, and whether
+ * it is a directory.
+ *
+ * To work within the constraint of one allocation per transaction,
+ * xfs_dialloc() is designed to be called twice if it has to do an
+ * allocation to make more free inodes.	 If an inode is
+ * available without an allocation, agbp would be set to the current
+ * agbp and alloc_done set to false.
+ * If an allocation needed to be done, agbp would be set to the
+ * inode header of the allocation group and alloc_done set to true.
+ * The caller should then commit the current transaction and allocate a new
+ * transaction.	 xfs_dialloc() should then be called again with
+ * the agbp value returned from the previous call.
+ *
+ * Once we successfully pick an inode its number is returned and the
+ * on-disk data structures are updated.	 The inode itself is not read
+ * in, since doing so would break ordering constraints with xfs_reclaim.
+ *
+ * *agbp should be set to NULL on the first call, *alloc_done set to FALSE.
+ */
+int					/* error */
+xfs_dialloc(
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_ino_t	parent,		/* parent inode (directory) */
+	mode_t		mode,		/* mode bits for new inode */
+	int		okalloc,	/* ok to allocate more space */
+	struct xfs_buf	**agbp,		/* buf for a.g. inode header */
+	boolean_t	*alloc_done,	/* an allocation was done to replenish
+					   the free inodes */
+	xfs_ino_t	*inop);		/* inode number allocated */
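+
+/*
+ * A caller follows the protocol above roughly like this (an
+ * illustrative sketch only; error handling and the real transaction
+ * plumbing are omitted):
+ *
+ *	agbp = NULL;
+ *	alloc_done = B_FALSE;
+ *	error = xfs_dialloc(tp, parent, mode, okalloc, &agbp,
+ *			    &alloc_done, &ino);
+ *	if (!error && alloc_done) {
+ *		(commit tp, get a new transaction)
+ *		error = xfs_dialloc(tp, parent, mode, okalloc, &agbp,
+ *				    &alloc_done, &ino);
+ *	}
+ */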
+
+/*
+ * Free a disk inode.  Carefully avoids touching the incore inode; all
+ * incore manipulations are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int					/* error */
+xfs_difree(
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_ino_t	inode);		/* inode to be freed */
+
+/*
+ * Return the location of the inode in bno/len/off,
+ * for mapping it into a buffer.
+ */
+int
+xfs_dilocate(
+	struct xfs_mount *mp,		/* file system mount structure */
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_ino_t	ino,		/* inode to locate */
+	xfs_fsblock_t	*bno,		/* output: block containing inode */
+	int		*len,		/* output: num blocks in cluster*/
+	int		*off,		/* output: index in block of inode */
+	uint		flags);		/* flags for inode btree lookup */
+
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+	struct xfs_mount *mp);		/* file system mount structure */
+
+/*
+ * Log specified fields for the ag hdr (inode section)
+ */
+void
+xfs_ialloc_log_agi(
+	struct xfs_trans *tp,		/* transaction pointer */
+	struct xfs_buf	*bp,		/* allocation group header buffer */
+	int		fields);	/* bitmask of fields to log */
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int					/* error */
+xfs_ialloc_read_agi(
+	struct xfs_mount *mp,		/* file system mount structure */
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	struct xfs_buf	**bpp);		/* allocation group hdr buf */
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_IALLOC_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_ialloc_btree.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_ialloc_btree.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_ialloc_btree.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_ialloc_btree.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,2104 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Inode allocation management for XFS.
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Prototypes for internal functions.
+ */
+
+STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
+STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
+STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
+		xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
+STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
+
+/*
+ * Internal functions.
+ */
+
+#ifdef _NOTYET_
+/*
+ * Single level of the xfs_inobt_delete record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * *stat is set to 0 for failure, 1 for done, 2 to go on to the next
+ * level; the function's return value is an errno.
+ */
+STATIC int				/* error */
+xfs_inobt_delrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level removing record from */
+	int			*stat)	/* fail/done/go-on */
+{
+	xfs_buf_t		*agbp;	/* buffer for a.g. inode header */
+	xfs_daddr_t		agfbno; /* disk addr of the ag's agf block */
+	xfs_buf_t		*agfbp; /* bp of agf block of freed block */
+	xfs_agi_t		*agi;	/* allocation group inode header */
+	xfs_inobt_block_t	*block; /* btree block record/key lives in */
+	xfs_agblock_t		bno;	/* btree block number */
+	xfs_buf_t		*bp;	/* buffer for block */
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_inobt_key_t		key;	/* kp points here if block is level 0 */
+	xfs_inobt_key_t		*kp;	/* pointer to btree keys */
+	xfs_agblock_t		lbno;	/* left block's block number */
+	xfs_buf_t		*lbp;	/* left block's buffer pointer */
+	xfs_inobt_block_t	*left;	/* left btree block */
+	xfs_inobt_key_t		*lkp;	/* left block key pointer */
+	xfs_inobt_ptr_t		*lpp;	/* left block address pointer */
+	int			lrecs;	/* number of records in left block */
+	xfs_inobt_rec_t		*lrp;	/* left block record pointer */
+	xfs_inobt_ptr_t		*pp;	/* pointer to btree addresses */
+	int			ptr;	/* index in btree block for this rec */
+	xfs_agblock_t		rbno;	/* right block's block number */
+	xfs_buf_t		*rbp;	/* right block's buffer pointer */
+	xfs_inobt_block_t	*right; /* right btree block */
+	xfs_inobt_key_t		*rkp;	/* right block key pointer */
+	xfs_inobt_rec_t		*rp;	/* pointer to btree records */
+	xfs_inobt_ptr_t		*rpp;	/* right block address pointer */
+	int			rrecs;	/* number of records in right block */
+	xfs_inobt_rec_t		*rrp;	/* right block record pointer */
+	xfs_btree_cur_t		*tcur;	/* temporary btree cursor */
+
+
+	/*
+	 * Get the index of the entry being deleted, check for nothing there.
+	 */
+	ptr = cur->bc_ptrs[level];
+	if (ptr == 0) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Get the buffer & block containing the record or key/ptr.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+		return error;
+#endif
+	/*
+	 * Fail if we're off the end of the block.
+	 */
+	if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * It's a nonleaf.  Excise the key and ptr being deleted, by
+	 * sliding the entries past them down one.
+	 * Log the changed areas of the block.
+	 */
+	if (level > 0) {
+		kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
+		pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = ptr; i < INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+			ovbcopy(&kp[ptr], &kp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*kp));
+			ovbcopy(&pp[ptr], &pp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*pp));
+			xfs_inobt_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+			xfs_inobt_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+		}
+	}
+	/*
+	 * It's a leaf.	 Excise the record being deleted, by sliding the
+	 * entries past it down one.  Log the changed areas of the block.
+	 */
+	else {
+		rp = XFS_INOBT_REC_ADDR(block, 1, cur);
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+			ovbcopy(&rp[ptr], &rp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*rp));
+			xfs_inobt_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+		}
+		/*
+		 * If it's the first record in the block, we'll need a key
+		 * structure to pass up to the next level (updkey).
+		 */
+		if (ptr == 1) {
+			INT_COPY(key.ir_startino, rp->ir_startino, ARCH_CONVERT);
+			kp = &key;
+		}
+	}
+	/*
+	 * Decrement and log the number of entries in the block.
+	 */
+	INT_MOD(block->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
+	/*
+	 * Is this the root level?  If so, we're almost done.
+	 */
+	if (level == cur->bc_nlevels - 1) {
+		/*
+		 * If this is the root level,
+		 * and there's only one entry left,
+		 * and it's NOT the leaf level,
+		 * then we can get rid of this level.
+		 */
+		if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == 1 && level > 0) {
+			agbp = cur->bc_private.i.agbp;
+			agi = XFS_BUF_TO_AGI(agbp);
+			/*
+			 * pp is still set to the first pointer in the block.
+			 * Make it the new root of the btree.
+			 */
+			bno = INT_GET(agi->agi_root, ARCH_CONVERT);
+			INT_COPY(agi->agi_root, *pp, ARCH_CONVERT);
+			INT_MOD(agi->agi_level, ARCH_CONVERT, -1);
+			/*
+			 * Free the block.
+			 */
+			if ((error = xfs_free_extent(cur->bc_tp, bno, 1)))
+				return error;
+			xfs_trans_binval(cur->bc_tp, bp);
+			xfs_ialloc_log_agi(cur->bc_tp, agbp,
+				XFS_AGI_ROOT | XFS_AGI_LEVEL);
+			/*
+			 * Update the cursor so there's one fewer level.
+			 */
+			cur->bc_bufs[level] = NULL;
+			cur->bc_nlevels--;
+			/*
+			 * To ensure that the freed block is not used for
+			 * user data until this transaction is permanent,
+			 * we lock the agf buffer for this ag until the
+			 * transaction record makes it to the on-disk log.
+			 */
+			agfbno = XFS_AG_DADDR(cur->bc_mp,
+					      cur->bc_private.i.agno,
+					      XFS_AGF_DADDR);
+			if ((error = xfs_trans_read_buf(cur->bc_mp, cur->bc_tp,
+					cur->bc_mp->m_ddev_targp, agfbno, 1, 0,
+					&agfbp)))
+				return error;
+			ASSERT(!XFS_BUF_GETERROR(agfbp));
+			xfs_trans_bhold_until_committed(cur->bc_tp, agfbp);
+		} else if (level > 0 &&
+			   (error = xfs_inobt_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * If we deleted the leftmost entry in the block, update the
+	 * key values above us in the tree.
+	 */
+	if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
+		return error;
+	/*
+	 * If the number of records remaining in the block is at least
+	 * the minimum, we're done.
+	 */
+	if (INT_GET(block->bb_numrecs, ARCH_CONVERT) >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
+		if (level > 0 &&
+		    (error = xfs_inobt_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * Otherwise, we have to move some records around to keep the
+	 * tree balanced.  Look at the left and right sibling blocks to
+	 * see if we can re-balance by moving only one record.
+	 */
+	rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+	lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
+	bno = NULLAGBLOCK;
+	ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
+	/*
+	 * Duplicate the cursor so our btree manipulations here won't
+	 * disrupt the next level up.
+	 */
+	if ((error = xfs_btree_dup_cursor(cur, &tcur)))
+		return error;
+	/*
+	 * If there's a right sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (rbno != NULLAGBLOCK) {
+		/*
+		 * Move the temp cursor to the last entry in the next block.
+		 * Actually any entry but the first would suffice.
+		 */
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_inobt_increment(tcur, level, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Grab a pointer to the block.
+		 */
+		rbp = tcur->bc_bufs[level];
+		right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+			goto error0;
+#endif
+		/*
+		 * Grab the current block number, for future use.
+		 */
+		bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
+		/*
+		 * If right block is full enough so that removing one entry
+		 * won't make it too empty, and left-shifting an entry out
+		 * of right to us works, we're done.
+		 */
+		if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
+		     XFS_INOBT_BLOCK_MINRECS(level, cur)) {
+			if ((error = xfs_inobt_lshift(tcur, level, &i)))
+				goto error0;
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_INOBT_BLOCK_MINRECS(level, cur));
+				xfs_btree_del_cursor(tcur,
+						     XFS_BTREE_NOERROR);
+				if (level > 0 &&
+				    (error = xfs_inobt_decrement(cur, level,
+						&i)))
+					return error;
+				*stat = 1;
+				return 0;
+			}
+		}
+		/*
+		 * Otherwise, grab the number of records in right for
+		 * future reference, and fix up the temp cursor to point
+		 * to our block again (last record).
+		 */
+		rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
+		if (lbno != NULLAGBLOCK) {
+			xfs_btree_firstrec(tcur, level);
+			if ((error = xfs_inobt_decrement(tcur, level, &i)))
+				goto error0;
+		}
+	}
+	/*
+	 * If there's a left sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (lbno != NULLAGBLOCK) {
+		/*
+		 * Move the temp cursor to the first entry in the
+		 * previous block.
+		 */
+		xfs_btree_firstrec(tcur, level);
+		if ((error = xfs_inobt_decrement(tcur, level, &i)))
+			goto error0;
+		xfs_btree_firstrec(tcur, level);
+		/*
+		 * Grab a pointer to the block.
+		 */
+		lbp = tcur->bc_bufs[level];
+		left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+			goto error0;
+#endif
+		/*
+		 * Grab the current block number, for future use.
+		 */
+		bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
+		/*
+		 * If left block is full enough so that removing one entry
+		 * won't make it too empty, and right-shifting an entry out
+		 * of left to us works, we're done.
+		 */
+		if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
+		     XFS_INOBT_BLOCK_MINRECS(level, cur)) {
+			if ((error = xfs_inobt_rshift(tcur, level, &i)))
+				goto error0;
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_INOBT_BLOCK_MINRECS(level, cur));
+				xfs_btree_del_cursor(tcur,
+						     XFS_BTREE_NOERROR);
+				if (level == 0)
+					cur->bc_ptrs[0]++;
+				*stat = 1;
+				return 0;
+			}
+		}
+		/*
+		 * Otherwise, grab the number of records in left for
+		 * future reference.
+		 */
+		lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	/*
+	 * Delete the temp cursor, we're done with it.
+	 */
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	/*
+	 * If here, we need to do a join to keep the tree balanced.
+	 */
+	ASSERT(bno != NULLAGBLOCK);
+	/*
+	 * See if we can join with the left neighbor block.
+	 */
+	if (lbno != NULLAGBLOCK &&
+	    lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * Set "right" to be the starting block,
+		 * "left" to be the left neighbor.
+		 */
+		rbno = bno;
+		right = block;
+		rbp = bp;
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.i.agno, lbno, 0, &lbp,
+				XFS_INO_BTREE_REF)))
+			return error;
+		left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+		if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+			return error;
+	}
+	/*
+	 * If that won't work, see if we can join with the right neighbor block.
+	 */
+	else if (rbno != NULLAGBLOCK &&
+		 rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+		  XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * Set "left" to be the starting block,
+		 * "right" to be the right neighbor.
+		 */
+		lbno = bno;
+		left = block;
+		lbp = bp;
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.i.agno, rbno, 0, &rbp,
+				XFS_INO_BTREE_REF)))
+			return error;
+		right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+		if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+			return error;
+	}
+	/*
+	 * Otherwise, we can't fix the imbalance.
+	 * Just return.	 This is probably a logic error, but it's not fatal.
+	 */
+	else {
+		if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * We're now going to join "left" and "right" by moving all the stuff
+	 * in "right" to "left" and deleting "right".
+	 */
+	if (level > 0) {
+		/*
+		 * It's a non-leaf.  Move keys and pointers.
+		 */
+		lkp = XFS_INOBT_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		lpp = XFS_INOBT_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
+		rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		bcopy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lkp));
+		bcopy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lpp));
+		xfs_inobt_log_keys(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_inobt_log_ptrs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	} else {
+		/*
+		 * It's a leaf.	 Move records.
+		 */
+		lrp = XFS_INOBT_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
+		bcopy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lrp));
+		xfs_inobt_log_recs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	}
+	/*
+	 * Fix up the number of records in the surviving block.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	/*
+	 * Fix up the right block pointer in the surviving block, and log it.
+	 */
+	INT_COPY(left->bb_rightsib, right->bb_rightsib, ARCH_CONVERT);
+	xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+	/*
+	 * If there is a right sibling now, make it point to the
+	 * remaining block.
+	 */
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		xfs_inobt_block_t	*rrblock;
+		xfs_buf_t		*rrbp;
+
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.i.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
+				&rrbp, XFS_INO_BTREE_REF)))
+			return error;
+		rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
+		if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
+			return error;
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
+		xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
+	}
+	/*
+	 * Free the deleting block.
+	 */
+	if ((error = xfs_free_extent(cur->bc_tp, rbno, 1)))
+		return error;
+	xfs_trans_binval(cur->bc_tp, rbp);
+	/*
+	 * To ensure that the freed block is not used for
+	 * user data until this transaction is permanent,
+	 * we lock the agf buffer for this ag until the
+	 * transaction record makes it to the on-disk log.
+	 */
+	agfbno = XFS_AG_DADDR(cur->bc_mp, cur->bc_private.i.agno,
+			      XFS_AGF_DADDR);
+	if ((error = xfs_trans_read_buf(cur->bc_mp, cur->bc_tp,
+			cur->bc_mp->m_ddev_targp, agfbno, 1, 0, &agfbp)))
+		return error;
+	ASSERT(!XFS_BUF_GETERROR(agfbp));
+	xfs_trans_bhold_until_committed(cur->bc_tp, agfbp);
+	/*
+	 * If we joined with the left neighbor, set the buffer in the
+	 * cursor to the left block, and fix up the index.
+	 */
+	if (bp != lbp) {
+		cur->bc_bufs[level] = lbp;
+		cur->bc_ptrs[level] += INT_GET(left->bb_numrecs, ARCH_CONVERT);
+		cur->bc_ra[level] = 0;
+	}
+	/*
+	 * If we joined with the right neighbor and there's a level above
+	 * us, increment the cursor at that level.
+	 */
+	else if (level + 1 < cur->bc_nlevels &&
+		 (error = xfs_inobt_increment(cur, level + 1, &i))) {
+		return error;
+	}
+	/*
+	 * Readjust the ptr at this level if it's not a leaf, since it's
+	 * still pointing at the deletion point, which makes the cursor
+	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
+	 * We can't use decrement because it would change the next level up.
+	 */
+	if (level > 0)
+		cur->bc_ptrs[level]--;
+	/*
+	 * Return value means the next level up has something to do.
+	 */
+	*stat = 2;
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+#endif	/* _NOTYET_ */
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int				/* error */
+xfs_inobt_insrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to insert record at */
+	xfs_agblock_t		*bnop,	/* i/o: block number inserted */
+	xfs_inobt_rec_t		*recp,	/* i/o: record data inserted */
+	xfs_btree_cur_t		**curp, /* output: new cursor replacing cur */
+	int			*stat)	/* success/failure */
+{
+	xfs_inobt_block_t	*block; /* btree block record/key lives in */
+	xfs_buf_t		*bp;	/* buffer for block */
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_inobt_key_t		key;	/* key value being inserted */
+	xfs_inobt_key_t		*kp=NULL;	/* pointer to btree keys */
+	xfs_agblock_t		nbno;	/* block number of allocated block */
+	xfs_btree_cur_t		*ncur;	/* new cursor to be used at next lvl */
+	xfs_inobt_key_t		nkey;	/* new key value, from split */
+	xfs_inobt_rec_t		nrec;	/* new record value, for caller */
+	int			optr;	/* old ptr value */
+	xfs_inobt_ptr_t		*pp;	/* pointer to btree addresses */
+	int			ptr;	/* index in btree block for this rec */
+	xfs_inobt_rec_t		*rp=NULL;	/* pointer to btree records */
+
+	/*
+	 * If we made it to the root level, allocate a new root block
+	 * and we're done.
+	 */
+	if (level >= cur->bc_nlevels) {
+		error = xfs_inobt_newroot(cur, &i);
+		*bnop = NULLAGBLOCK;
+		*stat = i;
+		return error;
+	}
+	/*
+	 * Make a key out of the record data to be inserted, and save it.
+	 */
+	key.ir_startino = recp->ir_startino; /* INT_: direct copy */
+	optr = ptr = cur->bc_ptrs[level];
+	/*
+	 * If we're off the left edge, return failure.
+	 */
+	if (ptr == 0) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Get pointers to the btree buffer and block.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+		return error;
+	/*
+	 * Check that the new entry is being inserted in the right place.
+	 */
+	if (ptr <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		if (level == 0) {
+			rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
+			xfs_btree_check_rec(cur->bc_btnum, recp, rp);
+		} else {
+			kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
+			xfs_btree_check_key(cur->bc_btnum, &key, kp);
+		}
+	}
+#endif
+	nbno = NULLAGBLOCK;
+	ncur = (xfs_btree_cur_t *)0;
+	/*
+	 * If the block is full, we can't insert the new entry until we
+	 * make the block un-full.
+	 */
+	if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * First, try shifting an entry to the right neighbor.
+		 */
+		if ((error = xfs_inobt_rshift(cur, level, &i)))
+			return error;
+		if (i) {
+			/* nothing */
+		}
+		/*
+		 * Next, try shifting an entry to the left neighbor.
+		 */
+		else {
+			if ((error = xfs_inobt_lshift(cur, level, &i)))
+				return error;
+			if (i) {
+				optr = ptr = cur->bc_ptrs[level];
+			} else {
+				/*
+				 * Next, try splitting the current block
+				 * in half. If this works we have to
+				 * re-set our variables because
+				 * we could be in a different block now.
+				 */
+				if ((error = xfs_inobt_split(cur, level, &nbno,
+						&nkey, &ncur, &i)))
+					return error;
+				if (i) {
+					bp = cur->bc_bufs[level];
+					block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+					if ((error = xfs_btree_check_sblock(cur,
+							block, level, bp)))
+						return error;
+#endif
+					ptr = cur->bc_ptrs[level];
+					nrec.ir_startino = nkey.ir_startino; /* INT_: direct copy */
+				} else {
+					/*
+					 * Otherwise the insert fails.
+					 */
+					*stat = 0;
+					return 0;
+				}
+			}
+		}
+	}
+	/*
+	 * At this point we know there's room for our new entry in the block
+	 * we're pointing at.
+	 */
+	if (level > 0) {
+		/*
+		 * It's a non-leaf entry.  Make a hole for the new data
+		 * in the key and ptr regions of the block.
+		 */
+		kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
+		pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = INT_GET(block->bb_numrecs, ARCH_CONVERT); i >= ptr; i--) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		ovbcopy(&kp[ptr - 1], &kp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*kp));
+		ovbcopy(&pp[ptr - 1], &pp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*pp));
+		/*
+		 * Now stuff the new data in, bump numrecs and log the new data.
+		 */
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
+			return error;
+#endif
+		kp[ptr - 1] = key; /* INT_: struct copy */
+		INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
+		INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
+		xfs_inobt_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+		xfs_inobt_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+	} else {
+		/*
+		 * It's a leaf entry.  Make a hole for the new record.
+		 */
+		rp = XFS_INOBT_REC_ADDR(block, 1, cur);
+		ovbcopy(&rp[ptr - 1], &rp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*rp));
+		/*
+		 * Now stuff the new record in, bump numrecs
+		 * and log the new data.
+		 */
+		rp[ptr - 1] = *recp; /* INT_: struct copy */
+		INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
+		xfs_inobt_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+	}
+	/*
+	 * Log the new number of records in the btree header.
+	 */
+	xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
+#ifdef DEBUG
+	/*
+	 * Check that the key/record is in the right place, now.
+	 */
+	if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		if (level == 0)
+			xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
+				rp + ptr);
+		else
+			xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
+				kp + ptr);
+	}
+#endif
+	/*
+	 * If we inserted at the start of a block, update the parents' keys.
+	 */
+	if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
+		return error;
+	/*
+	 * Return the new block number, if any.
+	 * If there is one, give back a record value and a cursor too.
+	 */
+	*bnop = nbno;
+	if (nbno != NULLAGBLOCK) {
+		*recp = nrec; /* INT_: struct copy */
+		*curp = ncur;
+	}
+	*stat = 1;
+	return 0;
+}
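+
+/*
+ * The full insert drives this routine one level at a time, roughly
+ * (an outline only; the cursor switching via *curp is omitted):
+ *
+ *	level = 0;
+ *	do {
+ *		error = xfs_inobt_insrec(cur, level++, &nbno, &rec,
+ *				&ncur, &i);
+ *	} while (!error && i && nbno != NULLAGBLOCK);
+ *
+ * Each split hands back the new block number and its first record,
+ * which are then inserted one level up, until no split occurs.
+ */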
+
+/*
+ * Log header fields from a btree block.
+ */
+STATIC void
+xfs_inobt_log_block(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			fields) /* mask of fields: XFS_BB_... */
+{
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	static const short	offsets[] = {	/* table of offsets */
+		offsetof(xfs_inobt_block_t, bb_magic),
+		offsetof(xfs_inobt_block_t, bb_level),
+		offsetof(xfs_inobt_block_t, bb_numrecs),
+		offsetof(xfs_inobt_block_t, bb_leftsib),
+		offsetof(xfs_inobt_block_t, bb_rightsib),
+		sizeof(xfs_inobt_block_t)
+	};
+
+	xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
+	xfs_trans_log_buf(tp, bp, first, last);
+}
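+
+/*
+ * For example, the XFS_BB_NUMRECS | XFS_BB_RIGHTSIB mask used elsewhere
+ * in this file makes xfs_btree_offsets() return first ==
+ * offsetof(xfs_inobt_block_t, bb_numrecs) and last == the final byte of
+ * bb_rightsib, so the one contiguous range logged also covers the
+ * bb_leftsib bytes sitting between the two fields.
+ */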
+
+/*
+ * Log keys from a btree block (nonleaf).
+ */
+STATIC void
+xfs_inobt_log_keys(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			kfirst, /* index of first key to log */
+	int			klast)	/* index of last key to log */
+{
+	xfs_inobt_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	xfs_inobt_key_t		*kp;	/* key pointer in btree block */
+	int			last;	/* last byte offset logged */
+
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+	kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_inobt_log_ptrs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			pfirst, /* index of first pointer to log */
+	int			plast)	/* index of last pointer to log */
+{
+	xfs_inobt_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	xfs_inobt_ptr_t		*pp;	/* block-pointer pointer in btree blk */
+
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+	pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Log records from a btree block (leaf).
+ */
+STATIC void
+xfs_inobt_log_recs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			rfirst, /* index of first record to log */
+	int			rlast)	/* index of last record to log */
+{
+	xfs_inobt_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	xfs_inobt_rec_t		*rp;	/* record pointer for btree block */
+
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+	rp = XFS_INOBT_REC_ADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Look up the record.  The cursor is made to point to it, based on dir.
+ * *stat is set to 0 if no such record is found, 1 for success; the
+ * return value is an errno.
+ */
+STATIC int				/* error */
+xfs_inobt_lookup(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_lookup_t		dir,	/* <=, ==, or >= */
+	int			*stat)	/* success/failure */
+{
+	xfs_agblock_t		agbno;	/* a.g. relative btree block number */
+	xfs_agnumber_t		agno;	/* allocation group number */
+	xfs_inobt_block_t	*block=NULL;	/* current btree block */
+	int			diff;	/* difference for the current key */
+	int			error;	/* error return value */
+	int			keyno=0;	/* current key number */
+	int			level;	/* level in the btree */
+	xfs_mount_t		*mp;	/* file system mount point */
+
+	/*
+	 * Get the allocation group header, and the root block number.
+	 */
+	mp = cur->bc_mp;
+	{
+		xfs_agi_t	*agi;	/* a.g. inode header */
+
+		agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
+		agno = INT_GET(agi->agi_seqno, ARCH_CONVERT);
+		agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
+	}
+	/*
+	 * Iterate over each level in the btree, starting at the root.
+	 * For each level above the leaves, find the key we need, based
+	 * on the lookup record, then follow the corresponding block
+	 * pointer down to the next level.
+	 */
+	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+		xfs_buf_t	*bp;	/* buffer pointer for btree block */
+		xfs_daddr_t		d;	/* disk address of btree block */
+
+		/*
+		 * Get the disk address we're looking for.
+		 */
+		d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+		/*
+		 * If the old buffer at this level is for a different block,
+		 * throw it away, otherwise just use it.
+		 */
+		bp = cur->bc_bufs[level];
+		if (bp && XFS_BUF_ADDR(bp) != d)
+			bp = (xfs_buf_t *)0;
+		if (!bp) {
+			/*
+			 * Need to get a new buffer.  Read it, then
+			 * set it in the cursor, releasing the old one.
+			 */
+			if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+					agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
+				return error;
+			xfs_btree_setbuf(cur, level, bp);
+			/*
+			 * Point to the btree block, now that we have the buffer
+			 */
+			block = XFS_BUF_TO_INOBT_BLOCK(bp);
+			if ((error = xfs_btree_check_sblock(cur, block, level,
+					bp)))
+				return error;
+		} else
+			block = XFS_BUF_TO_INOBT_BLOCK(bp);
+		/*
+		 * If we already had a key match at a higher level, we know
+		 * we need to use the first entry in this block.
+		 */
+		if (diff == 0)
+			keyno = 1;
+		/*
+		 * Otherwise we need to search this block.  Do a binary search.
+		 */
+		else {
+			int		high;	/* high entry number */
+			xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
+			xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
+			int		low;	/* low entry number */
+
+			/*
+			 * Get a pointer to keys or records.
+			 */
+			if (level > 0)
+				kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
+			else
+				krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
+			/*
+			 * Set low and high entry numbers, 1-based.
+			 */
+			low = 1;
+			if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
+				/*
+				 * If the block is empty, the tree must
+				 * be an empty leaf.
+				 */
+				ASSERT(level == 0 && cur->bc_nlevels == 1);
+				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+				*stat = 0;
+				return 0;
+			}
+			/*
+			 * Binary search the block.
+			 */
+			while (low <= high) {
+				xfs_agino_t	startino;	/* key value */
+
+				/*
+				 * keyno is average of low and high.
+				 */
+				keyno = (low + high) >> 1;
+				/*
+				 * Get startino.
+				 */
+				if (level > 0) {
+					xfs_inobt_key_t *kkp;
+
+					kkp = kkbase + keyno - 1;
+					startino = INT_GET(kkp->ir_startino, ARCH_CONVERT);
+				} else {
+					xfs_inobt_rec_t *krp;
+
+					krp = krbase + keyno - 1;
+					startino = INT_GET(krp->ir_startino, ARCH_CONVERT);
+				}
+				/*
+				 * Compute difference to get next direction.
+				 */
+				diff = (int)startino - cur->bc_rec.i.ir_startino;
+				/*
+				 * Less than, move right.
+				 */
+				if (diff < 0)
+					low = keyno + 1;
+				/*
+				 * Greater than, move left.
+				 */
+				else if (diff > 0)
+					high = keyno - 1;
+				/*
+				 * Equal, we're done.
+				 */
+				else
+					break;
+			}
+		}
+		/*
+		 * If there are more levels, set up for the next level
+		 * by getting the block number and filling in the cursor.
+		 */
+		if (level > 0) {
+			/*
+			 * If we moved left, need the previous key number,
+			 * unless there isn't one.
+			 */
+			if (diff > 0 && --keyno < 1)
+				keyno = 1;
+			agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, keyno, cur), ARCH_CONVERT);
+#ifdef DEBUG
+			if ((error = xfs_btree_check_sptr(cur, agbno, level)))
+				return error;
+#endif
+			cur->bc_ptrs[level] = keyno;
+		}
+	}
+	/*
+	 * Done with the search.
+	 * See if we need to adjust the results.
+	 */
+	if (dir != XFS_LOOKUP_LE && diff < 0) {
+		keyno++;
+		/*
+		 * If this is a GE search and we went off the end of the
+		 * block, but it's not the last block, we're in the wrong
+		 * block.
+		 */
+		if (dir == XFS_LOOKUP_GE &&
+		    keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
+		    INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			int	i;
+
+			cur->bc_ptrs[0] = keyno;
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				return error;
+			ASSERT(i == 1);
+			*stat = 1;
+			return 0;
+		}
+	}
+	else if (dir == XFS_LOOKUP_LE && diff > 0)
+		keyno--;
+	cur->bc_ptrs[0] = keyno;
+	/*
+	 * Return if we succeeded or not.
+	 */
+	if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT))
+		*stat = 0;
+	else
+		*stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
+	return 0;
+}
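+
+/*
+ * To illustrate the direction argument: with leaf records starting at
+ * inodes 64 and 192, a lookup for 100 leaves the cursor at the record
+ * for 64 under XFS_LOOKUP_LE, at the record for 192 under
+ * XFS_LOOKUP_GE, and fails (*stat == 0) under XFS_LOOKUP_EQ.
+ */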
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int				/* error */
+xfs_inobt_lshift(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to shift record on */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+#ifdef DEBUG
+	int			i;	/* loop index */
+#endif
+	xfs_inobt_key_t		key;	/* key value for leaf level upward */
+	xfs_buf_t		*lbp;	/* buffer for left neighbor block */
+	xfs_inobt_block_t	*left;	/* left neighbor btree block */
+	xfs_inobt_key_t		*lkp=NULL;	/* key pointer for left block */
+	xfs_inobt_ptr_t		*lpp;	/* address pointer for left block */
+	xfs_inobt_rec_t		*lrp=NULL;	/* record pointer for left block */
+	int			nrec;	/* new number of left block entries */
+	xfs_buf_t		*rbp;	/* buffer for right (current) block */
+	xfs_inobt_block_t	*right; /* right (current) btree block */
+	xfs_inobt_key_t		*rkp=NULL;	/* key pointer for right block */
+	xfs_inobt_ptr_t		*rpp=NULL;	/* address pointer for right block */
+	xfs_inobt_rec_t		*rrp=NULL;	/* record pointer for right block */
+
+	/*
+	 * Set up variables for this block as "right".
+	 */
+	rbp = cur->bc_bufs[level];
+	right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+		return error;
+#endif
+	/*
+	 * If we've got no left sibling then we can't shift an entry left.
+	 */
+	if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	if (cur->bc_ptrs[level] <= 1) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Set up the left neighbor as "left".
+	 */
+	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.i.agno, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, &lbp,
+			XFS_INO_BTREE_REF)))
+		return error;
+	left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+	/*
+	 * If it's full, it can't take another entry.
+	 */
+	if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
+		*stat = 0;
+		return 0;
+	}
+	nrec = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
+	/*
+	 * If non-leaf, copy a key and a ptr to the left block.
+	 */
+	if (level > 0) {
+		lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
+		rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
+		*lkp = *rkp;
+		xfs_inobt_log_keys(cur, lbp, nrec, nrec);
+		lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
+		rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, INT_GET(*rpp, ARCH_CONVERT), level)))
+			return error;
+#endif
+		*lpp = *rpp; /* INT_: no-change copy */
+		xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
+	}
+	/*
+	 * If leaf, copy a record to the left block.
+	 */
+	else {
+		lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
+		rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
+		*lrp = *rrp;
+		xfs_inobt_log_recs(cur, lbp, nrec, nrec);
+	}
+	/*
+	 * Bump and log left's numrecs, decrement and log right's numrecs.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, +1);
+	xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
+#ifdef DEBUG
+	if (level > 0)
+		xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
+	else
+		xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
+#endif
+	INT_MOD(right->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
+	/*
+	 * Slide the contents of right down one entry.
+	 */
+	if (level > 0) {
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
+					level)))
+				return error;
+		}
+#endif
+		ovbcopy(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		ovbcopy(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+		xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	} else {
+		ovbcopy(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
+		rkp = &key;
+	}
+	/*
+	 * Update the parent key values of right.
+	 */
+	if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
+		return error;
+	/*
+	 * Slide the cursor value left one.
+	 */
+	cur->bc_ptrs[level]--;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int				/* error */
+xfs_inobt_newroot(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*stat)	/* success/failure */
+{
+	xfs_agi_t		*agi;	/* a.g. inode header */
+	xfs_alloc_arg_t		args;	/* allocation argument structure */
+	xfs_inobt_block_t	*block; /* one half of the old root block */
+	xfs_buf_t		*bp;	/* buffer containing block */
+	int			error;	/* error return value */
+	xfs_inobt_key_t		*kp;	/* btree key pointer */
+	xfs_agblock_t		lbno;	/* left block number */
+	xfs_buf_t		*lbp;	/* left buffer pointer */
+	xfs_inobt_block_t	*left;	/* left btree block */
+	xfs_buf_t		*nbp;	/* new (root) buffer */
+	xfs_inobt_block_t	*new;	/* new (root) btree block */
+	int			nptr;	/* new value for key index, 1 or 2 */
+	xfs_inobt_ptr_t		*pp;	/* btree address pointer */
+	xfs_agblock_t		rbno;	/* right block number */
+	xfs_buf_t		*rbp;	/* right buffer pointer */
+	xfs_inobt_block_t	*right; /* right btree block */
+	xfs_inobt_rec_t		*rp;	/* btree record pointer */
+
+	ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp));
+
+	/*
+	 * Get a block & a buffer.
+	 */
+	agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno,
+		INT_GET(agi->agi_root, ARCH_CONVERT));
+	args.mod = args.minleft = args.alignment = args.total = args.wasdel =
+		args.isfl = args.userdata = args.minalignslop = 0;
+	args.minlen = args.maxlen = args.prod = 1;
+	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	if ((error = xfs_alloc_vextent(&args)))
+		return error;
+	/*
+	 * None available, we fail.
+	 */
+	if (args.fsbno == NULLFSBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
+	new = XFS_BUF_TO_INOBT_BLOCK(nbp);
+	/*
+	 * Set the root data in the a.g. inode structure.
+	 */
+	INT_SET(agi->agi_root, ARCH_CONVERT, args.agbno);
+	INT_MOD(agi->agi_level, ARCH_CONVERT, 1);
+	xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp,
+		XFS_AGI_ROOT | XFS_AGI_LEVEL);
+	/*
+	 * At the previous root level there are now two blocks: the old
+	 * root, and the new block generated when it was split.
+	 * We don't know which one the cursor is pointing at, so we
+	 * set up variables "left" and "right" for each case.
+	 */
+	bp = cur->bc_bufs[cur->bc_nlevels - 1];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
+		return error;
+#endif
+	if (INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		/*
+		 * Our block is left, pick up the right block.
+		 */
+		lbp = bp;
+		lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
+		left = block;
+		rbno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
+				rbno, 0, &rbp, XFS_INO_BTREE_REF)))
+			return error;
+		bp = rbp;
+		right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+		if ((error = xfs_btree_check_sblock(cur, right,
+				cur->bc_nlevels - 1, rbp)))
+			return error;
+		nptr = 1;
+	} else {
+		/*
+		 * Our block is right, pick up the left block.
+		 */
+		rbp = bp;
+		rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
+		right = block;
+		lbno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
+				lbno, 0, &lbp, XFS_INO_BTREE_REF)))
+			return error;
+		bp = lbp;
+		left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+		if ((error = xfs_btree_check_sblock(cur, left,
+				cur->bc_nlevels - 1, lbp)))
+			return error;
+		nptr = 2;
+	}
+	/*
+	 * Fill in the new block's btree header and log it.
+	 */
+	INT_SET(new->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
+	INT_SET(new->bb_level, ARCH_CONVERT, (__uint16_t)cur->bc_nlevels);
+	INT_SET(new->bb_numrecs, ARCH_CONVERT, 2);
+	INT_SET(new->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
+	INT_SET(new->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
+	xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
+	ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
+	/*
+	 * Fill in the key data in the new root.
+	 */
+	kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
+	if (INT_GET(left->bb_level, ARCH_CONVERT) > 0) {
+		kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); /* INT_: struct copy */
+		kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); /* INT_: struct copy */
+	} else {
+		rp = XFS_INOBT_REC_ADDR(left, 1, cur);
+		INT_COPY(kp[0].ir_startino, rp->ir_startino, ARCH_CONVERT);
+		rp = XFS_INOBT_REC_ADDR(right, 1, cur);
+		INT_COPY(kp[1].ir_startino, rp->ir_startino, ARCH_CONVERT);
+	}
+	xfs_inobt_log_keys(cur, nbp, 1, 2);
+	/*
+	 * Fill in the pointer data in the new root.
+	 */
+	pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
+	INT_SET(pp[0], ARCH_CONVERT, lbno);
+	INT_SET(pp[1], ARCH_CONVERT, rbno);
+	xfs_inobt_log_ptrs(cur, nbp, 1, 2);
+	/*
+	 * Fix up the cursor.
+	 */
+	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+	cur->bc_ptrs[cur->bc_nlevels] = nptr;
+	cur->bc_nlevels++;
+	*stat = 1;
+	return 0;
+}
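+
+/*
+ * Pictorially, roughly:
+ *
+ *	before			after
+ *
+ *	[left] <-> [right]	     [new]
+ *				    /	   \
+ *			      [left] <-> [right]
+ *
+ * where one of "left"/"right" is the old root and the other is the
+ * block it was split into; agi_root and agi_level now describe "new".
+ */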
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int				/* error */
+xfs_inobt_rshift(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to shift record on */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_inobt_key_t		key;	/* key value for leaf level upward */
+	xfs_buf_t		*lbp;	/* buffer for left (current) block */
+	xfs_inobt_block_t	*left;	/* left (current) btree block */
+	xfs_inobt_key_t		*lkp;	/* key pointer for left block */
+	xfs_inobt_ptr_t		*lpp;	/* address pointer for left block */
+	xfs_inobt_rec_t		*lrp;	/* record pointer for left block */
+	xfs_buf_t		*rbp;	/* buffer for right neighbor block */
+	xfs_inobt_block_t	*right; /* right neighbor btree block */
+	xfs_inobt_key_t		*rkp;	/* key pointer for right block */
+	xfs_inobt_ptr_t		*rpp;	/* address pointer for right block */
+	xfs_inobt_rec_t		*rrp=NULL;	/* record pointer for right block */
+	xfs_btree_cur_t		*tcur;	/* temporary cursor */
+
+	/*
+	 * Set up variables for this block as "left".
+	 */
+	lbp = cur->bc_bufs[level];
+	left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+#endif
+	/*
+	 * If we've got no right sibling then we can't shift an entry right.
+	 */
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Set up the right neighbor as "right".
+	 */
+	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.i.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rbp,
+			XFS_INO_BTREE_REF)))
+		return error;
+	right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+	if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+		return error;
+	/*
+	 * If it's full, it can't take another entry.
+	 */
+	if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Make a hole at the start of the right neighbor block, then
+	 * copy the last left block entry to the hole.
+	 */
+	if (level > 0) {
+		lkp = XFS_INOBT_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		lpp = XFS_INOBT_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
+		rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		ovbcopy(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		ovbcopy(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, INT_GET(*lpp, ARCH_CONVERT), level)))
+			return error;
+#endif
+		*rkp = *lkp; /* INT_: no change copy */
+		*rpp = *lpp; /* INT_: no change copy */
+		xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+	} else {
+		lrp = XFS_INOBT_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
+		ovbcopy(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		*rrp = *lrp;
+		xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
+		rkp = &key;
+	}
+	/*
+	 * Decrement and log left's numrecs, bump and log right's numrecs.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
+	INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+#ifdef DEBUG
+	if (level > 0)
+		xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
+	else
+		xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
+#endif
+	xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
+	/*
+	 * Using a temporary cursor, update the parent key values of the
+	 * block on the right.
+	 */
+	if ((error = xfs_btree_dup_cursor(cur, &tcur)))
+		return error;
+	xfs_btree_lastrec(tcur, level);
+	if ((error = xfs_inobt_increment(tcur, level, &i)) ||
+	    (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
+		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+		return error;
+	}
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and its first record (to be inserted into parent).
+ */
+STATIC int				/* error */
+xfs_inobt_split(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to split */
+	xfs_agblock_t		*bnop,	/* output: block number allocated */
+	xfs_inobt_key_t		*keyp,	/* output: first key of new block */
+	xfs_btree_cur_t		**curp, /* output: new cursor */
+	int			*stat)	/* success/failure */
+{
+	xfs_alloc_arg_t		args;	/* allocation argument structure */
+	int			error;	/* error return value */
+	int			i;	/* loop index/record number */
+	xfs_agblock_t		lbno;	/* left (current) block number */
+	xfs_buf_t		*lbp;	/* buffer for left block */
+	xfs_inobt_block_t	*left;	/* left (current) btree block */
+	xfs_inobt_key_t		*lkp;	/* left btree key pointer */
+	xfs_inobt_ptr_t		*lpp;	/* left btree address pointer */
+	xfs_inobt_rec_t		*lrp;	/* left btree record pointer */
+	xfs_buf_t		*rbp;	/* buffer for right block */
+	xfs_inobt_block_t	*right; /* right (new) btree block */
+	xfs_inobt_key_t		*rkp;	/* right btree key pointer */
+	xfs_inobt_ptr_t		*rpp;	/* right btree address pointer */
+	xfs_inobt_rec_t		*rrp;	/* right btree record pointer */
+
+	/*
+	 * Set up left block (current one).
+	 */
+	lbp = cur->bc_bufs[level];
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
+	/*
+	 * Allocate the new block.
+	 * If we can't do it, we're toast.  Give up.
+	 */
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno);
+	args.mod = args.minleft = args.alignment = args.total = args.wasdel =
+		args.isfl = args.userdata = args.minalignslop = 0;
+	args.minlen = args.maxlen = args.prod = 1;
+	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	if ((error = xfs_alloc_vextent(&args)))
+		return error;
+	if (args.fsbno == NULLFSBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
+	/*
+	 * Set up the new block as "right".
+	 */
+	right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+	/*
+	 * "Left" is the current (according to the cursor) block.
+	 */
+	left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+#endif
+	/*
+	 * Fill in the btree header for the new block.
+	 */
+	INT_SET(right->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
+	right->bb_level = left->bb_level; /* INT_: direct copy */
+	INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
+	/*
+	 * If there's an odd number of entries now, arrange for each block
+	 * to hold the same number of entries after the pending insert.
+	 */
+	if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
+	    cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
+		INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+	i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
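+	/*
+	 * Worked example (illustrative): with 9 records in left, right
+	 * normally takes records 6-9 (i == 6).  If the cursor is at entry
+	 * 5 or earlier the insert will land in the left block, so right
+	 * takes records 5-9 instead (i == 5); either way both blocks hold
+	 * 5 entries once the pending record goes in.
+	 */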
+	/*
+	 * For non-leaf blocks, copy keys and addresses over to the new block.
+	 */
+	if (level > 0) {
+		lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
+		lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
+		rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
+		rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		bcopy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		bcopy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+		xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		*keyp = *rkp;
+	}
+	/*
+	 * For leaf blocks, copy records over to the new block.
+	 */
+	else {
+		lrp = XFS_INOBT_REC_ADDR(left, i, cur);
+		rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
+		bcopy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		keyp->ir_startino = rrp->ir_startino; /* INT_: direct copy */
+	}
+	/*
+	 * Find the left block number by looking in the buffer.
+	 * Adjust numrecs, sibling pointers.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
+	right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
+	INT_SET(left->bb_rightsib, ARCH_CONVERT, args.agbno);
+	INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
+	xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
+	xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+	/*
+	 * If there's a block to the new block's right, make that block
+	 * point back to right instead of to left.
+	 */
+	if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		xfs_inobt_block_t	*rrblock;	/* rr btree block */
+		xfs_buf_t		*rrbp;		/* buffer for rrblock */
+
+		if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
+				INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, &rrbp,
+				XFS_INO_BTREE_REF)))
+			return error;
+		rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
+		if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
+			return error;
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, args.agbno);
+		xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
+	}
+	/*
+	 * If the cursor is really in the right block, move it there.
+	 * If it's just pointing past the last entry in left, then we'll
+	 * insert there, so don't change anything in that case.
+	 */
+	if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
+		xfs_btree_setbuf(cur, level, rbp);
+		cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	/*
+	 * If there are more levels, we'll need another cursor which refers
+	 * to the right block, no matter where this cursor was.
+	 */
+	if (level + 1 < cur->bc_nlevels) {
+		if ((error = xfs_btree_dup_cursor(cur, curp)))
+			return error;
+		(*curp)->bc_ptrs[level + 1]++;
+	}
+	*bnop = args.agbno;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int				/* error */
+xfs_inobt_updkey(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_inobt_key_t		*keyp,	/* new key value to update to */
+	int			level)	/* starting level for update */
+{
+	int			ptr;	/* index of key in block */
+
+	/*
+	 * Go up the tree from this level toward the root.
+	 * At each level, update the key value to the value input.
+	 * Stop when we reach a level where the cursor isn't pointing
+	 * at the first entry in the block.
+	 */
+	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+		xfs_buf_t		*bp;	/* buffer for block */
+		xfs_inobt_block_t	*block; /* btree block */
+#ifdef DEBUG
+		int			error;	/* error return value */
+#endif
+		xfs_inobt_key_t		*kp;	/* ptr to btree block keys */
+
+		bp = cur->bc_bufs[level];
+		block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+			return error;
+#endif
+		ptr = cur->bc_ptrs[level];
+		kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
+		*kp = *keyp;
+		xfs_inobt_log_keys(cur, bp, ptr, ptr);
+	}
+	return 0;
+}
+
+/*
+ * Externally visible routines.
+ */
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_inobt_decrement(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat)	/* success/failure */
+{
+	xfs_inobt_block_t	*block; /* btree block */
+	int			error;
+	int			lev;	/* btree level */
+
+	ASSERT(level < cur->bc_nlevels);
+	/*
+	 * Read-ahead to the left at this level.
+	 */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+	/*
+	 * Decrement the ptr at this level.  If we're still in the block
+	 * then we're done.
+	 */
+	if (--cur->bc_ptrs[level] > 0) {
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * Get a pointer to the btree block.
+	 */
+	block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level,
+			cur->bc_bufs[level])))
+		return error;
+#endif
+	/*
+	 * If we just went off the left edge of the tree, return failure.
+	 */
+	if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * March up the tree decrementing pointers.
+	 * Stop when we don't go off the left edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		if (--cur->bc_ptrs[lev] > 0)
+			break;
+		/*
+		 * Read-ahead the left block, we're going to read it
+		 * in the next loop.
+		 */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+	}
+	/*
+	 * If we went off the root then we are seriously confused.
+	 */
+	ASSERT(lev < cur->bc_nlevels);
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
+		xfs_agblock_t	agbno;	/* block number of btree block */
+		xfs_buf_t	*bp;	/* buffer containing btree block */
+
+		agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.i.agno, agbno, 0, &bp,
+				XFS_INO_BTREE_REF)))
+			return error;
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_INOBT_BLOCK(bp);
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
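+		/*
+		 * We came in from the right sibling, so point at the
+		 * last entry in each block on the way down.
+		 */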
+		cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+	}
+	*stat = 1;
+	return 0;
+}
+
+#ifdef _NOTYET_
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int					/* error */
+xfs_inobt_delete(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	int		*stat)		/* success/failure */
+{
+	int		error;
+	int		i;		/* result code */
+	int		level;		/* btree level */
+
+	/*
+	 * Go up the tree, starting at leaf level.
+	 * If 2 is returned then a join was done; go to the next level.
+	 * Otherwise we are done.
+	 */
+	for (level = 0, i = 2; i == 2; level++) {
+		if ((error = xfs_inobt_delrec(cur, level, &i)))
+			return error;
+	}
+	if (i == 0) {
+		for (level = 1; level < cur->bc_nlevels; level++) {
+			if (cur->bc_ptrs[level] == 0) {
+				if ((error = xfs_inobt_decrement(cur, level, &i)))
+					return error;
+				break;
+			}
+		}
+	}
+	*stat = i;
+	return 0;
+}
+#endif	/* _NOTYET_ */
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_inobt_get_rec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_agino_t		*ino,	/* output: starting inode of chunk */
+	__int32_t		*fcnt,	/* output: number of free inodes */
+	xfs_inofree_t		*free,	/* output: free inode mask */
+	int			*stat,	/* output: success/failure */
+	xfs_arch_t		arch)	/* input: architecture */
+{
+	xfs_inobt_block_t	*block; /* btree block */
+	xfs_buf_t		*bp;	/* buffer containing btree block */
+#ifdef DEBUG
+	int			error;	/* error return value */
+#endif
+	int			ptr;	/* record number */
+	xfs_inobt_rec_t		*rec;	/* record data */
+
+	bp = cur->bc_bufs[0];
+	ptr = cur->bc_ptrs[0];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
+		return error;
+#endif
+	/*
+	 * Off the right end or left end, return failure.
+	 */
+	if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Point to the record and extract its data.
+	 */
+	rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
+	ASSERT(arch == ARCH_NOCONVERT || arch == ARCH_CONVERT);
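+	/*
+	 * Note the inversion here: a caller passing ARCH_NOCONVERT
+	 * wants values needing no further conversion, so the on-disk
+	 * fields are converted to host order; a caller passing
+	 * ARCH_CONVERT gets the on-disk representation copied through.
+	 */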
+	if (arch == ARCH_NOCONVERT) {
+	    *ino = INT_GET(rec->ir_startino, ARCH_CONVERT);
+	    *fcnt = INT_GET(rec->ir_freecount, ARCH_CONVERT);
+	    *free = INT_GET(rec->ir_free, ARCH_CONVERT);
+	} else {
+	    INT_COPY(*ino, rec->ir_startino, ARCH_CONVERT);
+	    INT_COPY(*fcnt, rec->ir_freecount, ARCH_CONVERT);
+	    INT_COPY(*free, rec->ir_free, ARCH_CONVERT);
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_inobt_increment(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat)	/* success/failure */
+{
+	xfs_inobt_block_t	*block; /* btree block */
+	xfs_buf_t		*bp;	/* buffer containing btree block */
+	int			error;	/* error return value */
+	int			lev;	/* btree level */
+
+	ASSERT(level < cur->bc_nlevels);
+	/*
+	 * Read-ahead to the right at this level.
+	 */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+	/*
+	 * Get a pointer to the btree block.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+		return error;
+#endif
+	/*
+	 * Increment the ptr at this level.  If we're still in the block
+	 * then we're done.
+	 */
+	if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * If we just went off the right edge of the tree, return failure.
+	 */
+	if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * March up the tree incrementing pointers.
+	 * Stop when we don't go off the right edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		bp = cur->bc_bufs[lev];
+		block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
+#endif
+		if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
+			break;
+		/*
+		 * Read-ahead the right block, we're going to read it
+		 * in the next loop.
+		 */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+	}
+	/*
+	 * If we went off the root then we are seriously confused.
+	 */
+	ASSERT(lev < cur->bc_nlevels);
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
+	     lev > level; ) {
+		xfs_agblock_t	agbno;	/* block number of btree block */
+
+		agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.i.agno, agbno, 0, &bp,
+				XFS_INO_BTREE_REF)))
+			return error;
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_INOBT_BLOCK(bp);
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
+		cur->bc_ptrs[lev] = 1;
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Insert the current record at the point referenced by cur.
+ * The cursor may be inconsistent on return if splits have been done.
+ */
+int					/* error */
+xfs_inobt_insert(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	int		*stat)		/* success/failure */
+{
+	int		error;		/* error return value */
+	int		i;		/* result value, 0 for failure */
+	int		level;		/* current level number in btree */
+	xfs_agblock_t	nbno;		/* new block number (split result) */
+	xfs_btree_cur_t *ncur;		/* new cursor (split result) */
+	xfs_inobt_rec_t nrec;		/* record being inserted this level */
+	xfs_btree_cur_t *pcur;		/* previous level's cursor */
+
+	level = 0;
+	nbno = NULLAGBLOCK;
+	INT_SET(nrec.ir_startino, ARCH_CONVERT, cur->bc_rec.i.ir_startino);
+	INT_SET(nrec.ir_freecount, ARCH_CONVERT, cur->bc_rec.i.ir_freecount);
+	INT_SET(nrec.ir_free, ARCH_CONVERT, cur->bc_rec.i.ir_free);
+	ncur = (xfs_btree_cur_t *)0;
+	pcur = cur;
+	/*
+	 * Loop going up the tree, starting at the leaf level.
+	 * Stop when we don't get a split block, that must mean that
+	 * the insert is finished with this level.
+	 */
+	do {
+		/*
+		 * Insert nrec/nbno into this level of the tree.
+		 * Note if we fail, nbno will be null.
+		 */
+		if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
+				&i))) {
+			if (pcur != cur)
+				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+			return error;
+		}
+		/*
+		 * See if the cursor we just used is trash.
+		 * Can't trash the caller's cursor, but otherwise we should
+		 * if ncur is a new cursor or we're about to be done.
+		 */
+		if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
+			cur->bc_nlevels = pcur->bc_nlevels;
+			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+		}
+		/*
+		 * If we got a new cursor, switch to it.
+		 */
+		if (ncur) {
+			pcur = ncur;
+			ncur = (xfs_btree_cur_t *)0;
+		}
+	} while (nbno != NULLAGBLOCK);
+	*stat = i;
+	return 0;
+}
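+
+/*
+ * Usage note: the xfs_inobt_lookup_*() helpers below fill in
+ * cur->bc_rec.i, which xfs_inobt_insert() above reads as the record
+ * to insert; the record travels in the cursor, not as an argument.
+ */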
+
+/*
+ * Lookup the record equal to ino in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_eq(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agino_t	ino,		/* starting inode of chunk */
+	__int32_t	fcnt,		/* free inode count */
+	xfs_inofree_t	free,		/* free inode mask */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.i.ir_startino = ino;
+	cur->bc_rec.i.ir_freecount = fcnt;
+	cur->bc_rec.i.ir_free = free;
+	return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to ino
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_ge(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agino_t	ino,		/* starting inode of chunk */
+	__int32_t	fcnt,		/* free inode count */
+	xfs_inofree_t	free,		/* free inode mask */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.i.ir_startino = ino;
+	cur->bc_rec.i.ir_freecount = fcnt;
+	cur->bc_rec.i.ir_free = free;
+	return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to ino
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_le(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agino_t	ino,		/* starting inode of chunk */
+	__int32_t	fcnt,		/* free inode count */
+	xfs_inofree_t	free,		/* free inode mask */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.i.ir_startino = ino;
+	cur->bc_rec.i.ir_freecount = fcnt;
+	cur->bc_rec.i.ir_free = free;
+	return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur, to the value given
+ * by [ino, fcnt, free].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+int					/* error */
+xfs_inobt_update(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	__int32_t		fcnt,	/* free inode count */
+	xfs_inofree_t		free)	/* free inode mask */
+{
+	xfs_inobt_block_t	*block; /* btree block to update */
+	xfs_buf_t		*bp;	/* buffer containing btree block */
+	int			error;	/* error return value */
+	int			ptr;	/* current record number (updating) */
+	xfs_inobt_rec_t		*rp;	/* pointer to updated record */
+
+	/*
+	 * Pick up the current block.
+	 */
+	bp = cur->bc_bufs[0];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
+		return error;
+#endif
+	/*
+	 * Get the address of the rec to be updated.
+	 */
+	ptr = cur->bc_ptrs[0];
+	rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
+	/*
+	 * Fill in the new contents and log them.
+	 */
+	INT_SET(rp->ir_startino, ARCH_CONVERT, ino);
+	INT_SET(rp->ir_freecount, ARCH_CONVERT, fcnt);
+	INT_SET(rp->ir_free, ARCH_CONVERT, free);
+	xfs_inobt_log_recs(cur, bp, ptr, ptr);
+	/*
+	 * Updating first record in leaf. Pass new key value up to our parent.
+	 */
+	if (ptr == 1) {
+		xfs_inobt_key_t key;	/* key containing [ino] */
+
+		INT_SET(key.ir_startino, ARCH_CONVERT, ino);
+		if ((error = xfs_inobt_updkey(cur, &key, 1)))
+			return error;
+	}
+	return 0;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_ialloc_btree.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_ialloc_btree.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_ialloc_btree.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_ialloc_btree.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_IALLOC_BTREE_H__
+#define __XFS_IALLOC_BTREE_H__
+
+/*
+ * Inode map on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_btree_sblock;
+struct xfs_mount;
+
+/*
+ * There is a btree for the inode map per allocation group.
+ */
+#define XFS_IBT_MAGIC	0x49414254	/* 'IABT' */
+
+typedef __uint64_t	xfs_inofree_t;
+#define XFS_INODES_PER_CHUNK	(NBBY * sizeof(xfs_inofree_t))
+#define XFS_INODES_PER_CHUNK_LOG	(XFS_NBBYLOG + 3)
+#define XFS_INOBT_ALL_FREE	((xfs_inofree_t)-1)
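+/*
+ * With a 64-bit xfs_inofree_t this is NBBY * 8 = 64 inodes per chunk,
+ * one bit each in ir_free; the log form is XFS_NBBYLOG + 3 (log2 of
+ * 8 bits per byte plus log2 of the 8-byte mask) = 6.
+ */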
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_MASKN)
+xfs_inofree_t xfs_inobt_maskn(int i, int n);
+#define XFS_INOBT_MASKN(i,n)		xfs_inobt_maskn(i,n)
+#else
+#define XFS_INOBT_MASKN(i,n)	\
+	((((n) >= XFS_INODES_PER_CHUNK ? \
+		(xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i))
+#endif
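+/*
+ * Informally: XFS_INOBT_MASKN(i,n) is a mask of n bits starting at
+ * bit i, e.g. XFS_INOBT_MASKN(2,3) == 0x1c; for n ==
+ * XFS_INODES_PER_CHUNK the subtraction wraps (xfs_inofree_t)0 - 1
+ * to all ones, covering the whole chunk.
+ */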
+
+/*
+ * Data record structure
+ */
+typedef struct xfs_inobt_rec
+{
+	xfs_agino_t	ir_startino;	/* starting inode number */
+	__int32_t	ir_freecount;	/* count of free inodes (set bits) */
+	xfs_inofree_t	ir_free;	/* free inode mask */
+} xfs_inobt_rec_t;
+
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key
+{
+	xfs_agino_t	ir_startino;	/* starting inode number */
+} xfs_inobt_key_t;
+
+typedef xfs_agblock_t xfs_inobt_ptr_t;	/* btree pointer type */
+					/* btree block header type */
+typedef struct xfs_btree_sblock xfs_inobt_block_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_INOBT_BLOCK)
+xfs_inobt_block_t *xfs_buf_to_inobt_block(struct xfs_buf *bp);
+#define XFS_BUF_TO_INOBT_BLOCK(bp)	xfs_buf_to_inobt_block(bp)
+#else
+#define XFS_BUF_TO_INOBT_BLOCK(bp) ((xfs_inobt_block_t *)(XFS_BUF_PTR(bp)))
+#endif
+
+/*
+ * Bit manipulations for ir_free.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_MASK)
+xfs_inofree_t xfs_inobt_mask(int i);
+#define XFS_INOBT_MASK(i)		xfs_inobt_mask(i)
+#else
+#define XFS_INOBT_MASK(i)		((xfs_inofree_t)1 << (i))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_IS_FREE)
+int xfs_inobt_is_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch);
+#define XFS_INOBT_IS_FREE(rp,i,arch)	xfs_inobt_is_free(rp,i,arch)
+#else
+#define XFS_INOBT_IS_FREE(rp,i,arch)	((INT_GET((rp)->ir_free, arch) \
+					 & XFS_INOBT_MASK(i)) != 0)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_SET_FREE)
+void xfs_inobt_set_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch);
+#define XFS_INOBT_SET_FREE(rp,i,arch)	xfs_inobt_set_free(rp,i,arch)
+#else
+#define XFS_INOBT_SET_FREE(rp,i,arch)	(INT_MOD_EXPR((rp)->ir_free, arch, |= XFS_INOBT_MASK(i)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_CLR_FREE)
+void xfs_inobt_clr_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch);
+#define XFS_INOBT_CLR_FREE(rp,i,arch)	xfs_inobt_clr_free(rp,i,arch)
+#else
+#define XFS_INOBT_CLR_FREE(rp,i,arch)	(INT_MOD_EXPR((rp)->ir_free, arch, &= ~XFS_INOBT_MASK(i)))
+#endif
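+/*
+ * For example, XFS_INOBT_CLR_FREE(rp, 5, ARCH_CONVERT) marks the
+ * sixth inode of the chunk allocated by clearing bit 5 of ir_free.
+ */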
+
+/*
+ * Real block structures have a size equal to the disk block size.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_SIZE)
+int xfs_inobt_block_size(int lev, struct xfs_btree_cur *cur);
+#define XFS_INOBT_BLOCK_SIZE(lev,cur)	xfs_inobt_block_size(lev,cur)
+#else
+#define XFS_INOBT_BLOCK_SIZE(lev,cur)	(1 << (cur)->bc_blocklog)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_MAXRECS)
+int xfs_inobt_block_maxrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_INOBT_BLOCK_MAXRECS(lev,cur)	xfs_inobt_block_maxrecs(lev,cur)
+#else
+#define XFS_INOBT_BLOCK_MAXRECS(lev,cur)	\
+	((cur)->bc_mp->m_inobt_mxr[lev != 0])
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_MINRECS)
+int xfs_inobt_block_minrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_INOBT_BLOCK_MINRECS(lev,cur)	xfs_inobt_block_minrecs(lev,cur)
+#else
+#define XFS_INOBT_BLOCK_MINRECS(lev,cur)	\
+	((cur)->bc_mp->m_inobt_mnr[lev != 0])
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_IS_LAST_REC)
+int xfs_inobt_is_last_rec(struct xfs_btree_cur *cur);
+#define XFS_INOBT_IS_LAST_REC(cur)	xfs_inobt_is_last_rec(cur)
+#else
+#define XFS_INOBT_IS_LAST_REC(cur)	\
+	((cur)->bc_ptrs[0] == \
+		INT_GET(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs, ARCH_CONVERT))
+#endif
+
+/*
+ * Maximum number of inode btree levels.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IN_MAXLEVELS)
+int xfs_in_maxlevels(struct xfs_mount *mp);
+#define XFS_IN_MAXLEVELS(mp)		xfs_in_maxlevels(mp)
+#else
+#define XFS_IN_MAXLEVELS(mp)		((mp)->m_in_maxlevels)
+#endif
+
+/*
+ * Block numbers in the AG.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IBT_BLOCK)
+xfs_agblock_t xfs_ibt_block(struct xfs_mount *mp);
+#define XFS_IBT_BLOCK(mp)		xfs_ibt_block(mp)
+#else
+#define XFS_IBT_BLOCK(mp)	((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_PREALLOC_BLOCKS)
+xfs_agblock_t xfs_prealloc_blocks(struct xfs_mount *mp);
+#define XFS_PREALLOC_BLOCKS(mp)		xfs_prealloc_blocks(mp)
+#else
+#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+#endif
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_REC_ADDR)
+xfs_inobt_rec_t *
+xfs_inobt_rec_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_INOBT_REC_ADDR(bb,i,cur)	xfs_inobt_rec_addr(bb,i,cur)
+#else
+#define XFS_INOBT_REC_ADDR(bb,i,cur)	\
+	XFS_BTREE_REC_ADDR(XFS_INOBT_BLOCK_SIZE(0,cur), xfs_inobt, bb, i, \
+		XFS_INOBT_BLOCK_MAXRECS(0, cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_KEY_ADDR)
+xfs_inobt_key_t *
+xfs_inobt_key_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_INOBT_KEY_ADDR(bb,i,cur)	xfs_inobt_key_addr(bb,i,cur)
+#else
+#define XFS_INOBT_KEY_ADDR(bb,i,cur)	\
+	XFS_BTREE_KEY_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, i, \
+		XFS_INOBT_BLOCK_MAXRECS(1, cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_PTR_ADDR)
+xfs_inobt_ptr_t *
+xfs_inobt_ptr_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_INOBT_PTR_ADDR(bb,i,cur)	xfs_inobt_ptr_addr(bb,i,cur)
+#else
+#define XFS_INOBT_PTR_ADDR(bb,i,cur)	\
+	XFS_BTREE_PTR_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, i, \
+		XFS_INOBT_BLOCK_MAXRECS(1, cur))
+#endif
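+/*
+ * Layout note (informal): leaf blocks hold an array of records after
+ * the block header; non-leaf blocks hold an array of keys followed by
+ * an array of block pointers, each dimensioned by
+ * XFS_INOBT_BLOCK_MAXRECS(1, cur), which is how the KEY and PTR
+ * address macros locate entry i.
+ */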
+
+/*
+ * Prototypes for externally visible routines.
+ */
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_inobt_decrement(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat); /* success/failure */
+
+#ifdef _NOTYET_
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int					/* error */
+xfs_inobt_delete(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			*stat); /* success/failure */
+#endif	/* _NOTYET_ */
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_inobt_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		*ino,	/* output: starting inode of chunk */
+	__int32_t		*fcnt,	/* output: number of free inodes */
+	xfs_inofree_t		*free,	/* output: free inode mask */
+	int			*stat,	/* output: success/failure */
+	xfs_arch_t		arch);	/* input: architecture */
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_inobt_increment(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat); /* success/failure */
+
+/*
+ * Insert the current record at the point referenced by cur.
+ * The cursor may be inconsistent on return if splits have been done.
+ */
+int					/* error */
+xfs_inobt_insert(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the record equal to ino in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_eq(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	__int32_t		fcnt,	/* free inode count */
+	xfs_inofree_t		free,	/* free inode mask */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the first record greater than or equal to ino
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_ge(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	__int32_t		fcnt,	/* free inode count */
+	xfs_inofree_t		free,	/* free inode mask */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the first record less than or equal to ino
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_le(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	__int32_t		fcnt,	/* free inode count */
+	xfs_inofree_t		free,	/* free inode mask */
+	int			*stat); /* success/failure */
+
+/*
+ * Update the record referred to by cur, to the value given
+ * by [ino, fcnt, free].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+int					/* error */
+xfs_inobt_update(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	__int32_t		fcnt,	/* free inode count */
+	xfs_inofree_t		free);	/* free inode mask */
+
+#endif	/* __XFS_IALLOC_BTREE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_iget.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_iget.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_iget.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_iget.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1040 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Initialize the inode hash table for the newly mounted file system.
+ *
+ * mp -- this is the mount point structure for the file system being
+ *	 initialized
+ */
+void
+xfs_ihash_init(xfs_mount_t *mp)
+{
+	int	i;
+
+	mp->m_ihsize = XFS_BUCKETS(mp);
+	mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize
+				      * sizeof(xfs_ihash_t), KM_SLEEP);
+	ASSERT(mp->m_ihash != NULL);
+	for (i = 0; i < mp->m_ihsize; i++) {
+		rwlock_init(&(mp->m_ihash[i].ih_lock));
+	}
+}
+
+/*
+ * Free up structures allocated by xfs_ihash_init, at unmount time.
+ */
+void
+xfs_ihash_free(xfs_mount_t *mp)
+{
+	kmem_free(mp->m_ihash, mp->m_ihsize*sizeof(xfs_ihash_t));
+	mp->m_ihash = NULL;
+}
+
+/*
+ * Initialize the inode cluster hash table for the newly mounted file system.
+ *
+ * mp -- this is the mount point structure for the file system being
+ *	 initialized
+ */
+void
+xfs_chash_init(xfs_mount_t *mp)
+{
+	int	i;
+
+	/*
+	 * m_chash size is based on m_ihash
+	 * with a minimum of 37 entries
+	 */
+	mp->m_chsize = (XFS_BUCKETS(mp)) /
+			 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
+	if (mp->m_chsize < 37) {
+		mp->m_chsize = 37;
+	}
+	mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
+						 * sizeof(xfs_chash_t),
+						 KM_SLEEP);
+	ASSERT(mp->m_chash != NULL);
+
+	for (i = 0; i < mp->m_chsize; i++) {
+		spinlock_init(&mp->m_chash[i].ch_lock, "xfshash");
+	}
+}
+
+/*
+ * Free up structures allocated by xfs_chash_init, at unmount time.
+ */
+void
+xfs_chash_free(xfs_mount_t *mp)
+{
+	int	i;
+
+	for (i = 0; i < mp->m_chsize; i++) {
+		spinlock_destroy(&mp->m_chash[i].ch_lock);
+	}
+
+	kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t));
+	mp->m_chash = NULL;
+}
+
+
+static inline void
+xfs_iget_vnode_init(
+	xfs_mount_t	*mp,
+	vnode_t		*vp,
+	xfs_inode_t	*ip)
+{
+	vp->v_vfsp  = XFS_MTOVFS(mp);
+	vp->v_type  = IFTOVT(ip->i_d.di_mode);
+
+	/* If we have a real type for an on-disk inode, we can set ops(&unlock)
+	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
+	 */
+	if (vp->v_type != VNON) {
+		linvfs_set_inode_ops(LINVFS_GET_IP(vp));
+	}
+}
+
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the hash table for the file system
+ * represented by the mount point parameter mp.	 Each bucket of
+ * the hash table is guarded by an individual semaphore.
+ *
+ * If the inode is found in the hash table, its corresponding vnode
+ * is obtained with a call to vn_get().	 This call takes care of
+ * coordination with the reclamation of the inode and vnode.  Note
+ * that the vmap structure is filled in while holding the hash lock.
+ * This gives us the state of the inode/vnode when we found it and
+ * is used for coordination in vn_get().
+ *
+ * If it is not in core, read it in from the file system's device and
+ * add the inode into the hash table.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.	 It points
+ *	 to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.	This is
+ *	 simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *	  within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *		 for xfs_ilock() for a list of valid values.
+ * bno -- the block number starting the buffer containing the inode,
+ *	  if known (as by bulkstat), else 0.
+ */
+int
+xfs_iget_core(
+	vnode_t		*vp,
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	uint		lock_flags,
+	xfs_inode_t	**ipp,
+	xfs_daddr_t	bno)
+{
+	xfs_ihash_t	*ih;
+	xfs_inode_t	*ip;
+	xfs_inode_t	*iq;
+	vnode_t		*inode_vp;
+	ulong		version;
+	int		error;
+	/* REFERENCED */
+	int		newnode;
+	xfs_chash_t	*ch;
+	xfs_chashlist_t *chl, *chlnew;
+	SPLDECL(s);
+
+
+	ih = XFS_IHASH(mp, ino);
+
+again:
+	read_lock(&ih->ih_lock);
+
+	for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
+		if (ip->i_ino == ino) {
+
+			inode_vp = XFS_ITOV_NULL(ip);
+
+			if (inode_vp == NULL) {
+				/* If IRECLAIM is set this inode is
+				 * on its way out of the system,
+				 * we need to pause and try again.
+				 */
+				if (ip->i_flags & XFS_IRECLAIM) {
+					read_unlock(&ih->ih_lock);
+					delay(1);
+					XFS_STATS_INC(xfsstats.xs_ig_frecycle);
+
+					goto again;
+				}
+
+				xfs_iget_vnode_init(mp, vp, ip);
+
+				vn_trace_exit(vp, "xfs_iget.alloc",
+					(inst_t *)__return_address);
+
+				bhv_desc_init(&(ip->i_bhv_desc), ip, vp,
+							&xfs_vnodeops);
+				vn_bhv_insert_initial(VN_BHV_HEAD(vp),
+							&(ip->i_bhv_desc));
+
+				XFS_STATS_INC(xfsstats.xs_ig_found);
+
+				read_unlock(&ih->ih_lock);
+				goto finish_inode;
+
+			} else if (vp != inode_vp) {
+				struct inode *inode = LINVFS_GET_IP(inode_vp);
+
+				/* The inode is being torn down, pause and
+				 * try again.
+				 */
+				if (inode->i_state & (I_FREEING | I_CLEAR)) {
+					read_unlock(&ih->ih_lock);
+					delay(1);
+					XFS_STATS_INC(xfsstats.xs_ig_frecycle);
+
+					goto again;
+				}
+/* Chances are the other vnode (the one in the inode) is being torn
+ * down right now, and we landed on top of it. Question is, what do
+ * we do? Unhook the old inode and hook up the new one?
+ */
+				cmn_err(CE_PANIC,
+			"xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
+						inode_vp, vp);
+				BUG();
+			}
+
+			read_unlock(&ih->ih_lock);
+
+			XFS_STATS_INC(xfsstats.xs_ig_found);
+
+			/*
+			 * Make sure the vnode and the inode are hooked up
+			 */
+			xfs_iget_vnode_init(mp, vp, ip);
+
+finish_inode:
+			if (lock_flags != 0) {
+				xfs_ilock(ip, lock_flags);
+			}
+
+			newnode = (ip->i_d.di_mode == 0);
+			if (newnode) {
+				xfs_iocore_inode_reinit(ip);
+			}
+			vn_trace_exit(vp, "xfs_iget.found",
+						(inst_t *)__return_address);
+			goto return_ip;
+		}
+	}
+
+	/*
+	 * Inode cache miss: save the hash chain version stamp and unlock
+	 * the chain, so we don't deadlock in vn_alloc.
+	 */
+	XFS_STATS_INC(xfsstats.xs_ig_missed);
+
+	version = ih->ih_version;
+
+	read_unlock(&ih->ih_lock);
+
+	/*
+	 * Read the disk inode attributes into a new inode structure and get
+	 * a new vnode for it.	Initialize the inode lock so we can idestroy
+	 * it soon if it's a dup.  This should also initialize i_dev, i_ino,
+	 * i_bno, i_mount, and i_index.
+	 */
+	error = xfs_iread(mp, tp, ino, &ip, bno);
+	if (error) {
+		return error;
+	}
+
+	/*
+	 * Vnode provided by vn_initialize.
+	 */
+
+	xfs_iget_vnode_init(mp, vp, ip);
+
+	vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address);
+
+	if (vp->v_fbhv == NULL) {
+		bhv_desc_init(&(ip->i_bhv_desc), ip, vp, &xfs_vnodeops);
+		vn_bhv_insert_initial(VN_BHV_HEAD(vp), &(ip->i_bhv_desc));
+	}
+
+	xfs_inode_lock_init(ip, vp);
+	xfs_iocore_inode_init(ip);
+
+	if (lock_flags != 0) {
+		xfs_ilock(ip, lock_flags);
+	}
+
+	/*
+	 * Put ip on its hash chain, unless someone else hashed a duplicate
+	 * after we released the hash lock.
+	 */
+	write_lock(&ih->ih_lock);
+
+	if (ih->ih_version != version) {
+		for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) {
+			if (iq->i_ino == ino) {
+				write_unlock(&ih->ih_lock);
+				xfs_idestroy(ip);
+
+				XFS_STATS_INC(xfsstats.xs_ig_dup);
+				goto again;
+			}
+		}
+	}
+
+	/*
+	 * These values _must_ be set before releasing ihlock!
+	 */
+	ip->i_hash = ih;
+	if ((iq = ih->ih_next)) {
+		iq->i_prevp = &ip->i_next;
+	}
+	ip->i_next = iq;
+	ip->i_prevp = &ih->ih_next;
+	ih->ih_next = ip;
+	ip->i_udquot = ip->i_gdquot = NULL;
+	ih->ih_version++;
+
+	write_unlock(&ih->ih_lock);
+
+	/*
+	 * put ip on its cluster's hash chain
+	 */
+	ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL &&
+	       ip->i_cnext == NULL);
+
+	chlnew = NULL;
+	ch = XFS_CHASH(mp, ip->i_blkno);
+ chlredo:
+	s = mutex_spinlock(&ch->ch_lock);
+	for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
+		if (chl->chl_blkno == ip->i_blkno) {
+
+			/* insert this inode into the doubly-linked list
+			 * where chl points */
+			if ((iq = chl->chl_ip)) {
+				ip->i_cprev = iq->i_cprev;
+				iq->i_cprev->i_cnext = ip;
+				iq->i_cprev = ip;
+				ip->i_cnext = iq;
+			} else {
+				ip->i_cnext = ip;
+				ip->i_cprev = ip;
+			}
+			chl->chl_ip = ip;
+			ip->i_chash = chl;
+			break;
+		}
+	}
+
+	/* no hash list found for this block; add a new hash list */
+	if (chl == NULL)  {
+		if (chlnew == NULL) {
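+			/*
+			 * The KM_SLEEP allocation below may block, so
+			 * drop the spinlock first and rescan from
+			 * chlredo in case another thread added the
+			 * list for this block meanwhile.
+			 */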
+			mutex_spinunlock(&ch->ch_lock, s);
+			ASSERT(xfs_chashlist_zone != NULL);
+			chlnew = (xfs_chashlist_t *)
+					kmem_zone_alloc(xfs_chashlist_zone,
+						KM_SLEEP);
+			ASSERT(chlnew != NULL);
+			goto chlredo;
+		} else {
+			ip->i_cnext = ip;
+			ip->i_cprev = ip;
+			ip->i_chash = chlnew;
+			chlnew->chl_ip = ip;
+			chlnew->chl_blkno = ip->i_blkno;
+			chlnew->chl_next = ch->ch_list;
+			ch->ch_list = chlnew;
+			chlnew = NULL;
+		}
+	} else {
+		if (chlnew != NULL) {
+			kmem_zone_free(xfs_chashlist_zone, chlnew);
+		}
+	}
+
+	mutex_spinunlock(&ch->ch_lock, s);
+
+
+	/*
+	 * Link ip to its mount and thread it on the mount's inode list.
+	 */
+	XFS_MOUNT_ILOCK(mp);
+	if ((iq = mp->m_inodes)) {
+		ASSERT(iq->i_mprev->i_mnext == iq);
+		ip->i_mprev = iq->i_mprev;
+		iq->i_mprev->i_mnext = ip;
+		iq->i_mprev = ip;
+		ip->i_mnext = iq;
+	} else {
+		ip->i_mnext = ip;
+		ip->i_mprev = ip;
+	}
+	mp->m_inodes = ip;
+
+	XFS_MOUNT_IUNLOCK(mp);
+
+	newnode = 1;
+
+ return_ip:
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
+
+	ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
+	       ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
+
+	*ipp = ip;
+
+	/* Update the linux inode */
+	error = vn_revalidate(vp, ATTR_COMM|ATTR_LAZY);
+
+	return 0;
+}
+
+
+/*
+ * The 'normal' internal xfs_iget: if needed, it will
+ * 'allocate', or 'get', the vnode.
+ */
+int
+xfs_iget(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	uint		lock_flags,
+	xfs_inode_t	**ipp,
+	xfs_daddr_t	bno)
+{
+	struct inode	*inode;
+	vnode_t		*vp = NULL;
+	int		error;
+
+retry:
+	XFS_STATS_INC(xfsstats.xs_ig_attempts);
+
+	if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) {
+		bhv_desc_t	*bdp;
+		xfs_inode_t	*ip;
+		int		newnode;
+
+
+		vp = LINVFS_GET_VP(inode);
+		if (inode->i_state & I_NEW) {
+inode_allocate:
+			XFS_STATS_INC(xfsstats.vn_alloc);
+			vn_initialize(inode);
+			error = xfs_iget_core(vp, mp, tp, ino,
+							lock_flags, ipp, bno);
+			if (error) {
+				make_bad_inode(inode);
+				if (inode->i_state & I_NEW)
+					unlock_new_inode(inode);
+				iput(inode);
+			}
+		} else {
+			/* These are true if the inode is in inactive or
+			 * reclaim. The linux inode is about to go away,
+			 * wait for that path to finish, and try again.
+			 */
+			if (vp->v_flag & (VINACT | VRECLM)) {
+				vn_wait(vp);
+				iput(inode);
+				goto retry;
+			}
+
+			bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
+			if (bdp == NULL)
+				goto inode_allocate;
+			ip = XFS_BHVTOI(bdp);
+			if (lock_flags != 0)
+				xfs_ilock(ip, lock_flags);
+			newnode = (ip->i_d.di_mode == 0);
+			if (newnode)
+				xfs_iocore_inode_reinit(ip);
+			vn_revalidate(vp, ATTR_COMM|ATTR_LAZY);
+			XFS_STATS_INC(xfsstats.xs_ig_found);
+			*ipp = ip;
+			error = 0;
+		}
+	} else
+		error = ENOMEM; /* If we got no inode we are out of memory */
+
+	return error;
+}
+
+
+/*
+ * A 'special' interface to xfs_iget, where the
+ * vnode is already allocated.
+ */
+int
+xfs_vn_iget(
+	vfs_t		*vfsp,
+	struct vnode	*vp,
+	xfs_ino_t	ino)
+{
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+	int error;
+
+	error = xfs_iget_core(vp, mp, NULL, ino, 0, &ip, 0);
+
+	return error;
+}
+
+
+/*
+ * Do the setup for the various locks within the incore inode.
+ */
+void
+xfs_inode_lock_init(
+	xfs_inode_t	*ip,
+	vnode_t		*vp)
+{
+	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+		     "xfsino", (long)vp->v_number);
+	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number);
+#ifdef NOTYET
+	mutex_init(&ip->i_range_lock.r_spinlock, MUTEX_SPIN, "xrange");
+#endif /* NOTYET */
+	init_waitqueue_head(&ip->i_ipin_wait);
+	atomic_set(&ip->i_pincount, 0);
+	init_sema(&ip->i_flock, 1, "xfsfino", vp->v_number);
+}
+
+/*
+ * Look for the inode corresponding to the given ino in the hash table.
+ * If it is there and its i_transp pointer matches tp, return it.
+ * Otherwise, return NULL.
+ */
+xfs_inode_t *
+xfs_inode_incore(xfs_mount_t	*mp,
+		 xfs_ino_t	ino,
+		 xfs_trans_t	*tp)
+{
+	xfs_ihash_t	*ih;
+	xfs_inode_t	*ip;
+
+	ih = XFS_IHASH(mp, ino);
+	read_lock(&ih->ih_lock);
+	for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
+		if (ip->i_ino == ino) {
+			/*
+			 * If we find it and tp matches, return it.
+			 * Otherwise break from the loop and return
+			 * NULL.
+			 */
+			if (ip->i_transp == tp) {
+				read_unlock(&ih->ih_lock);
+				return (ip);
+			}
+			break;
+		}
+	}
+	read_unlock(&ih->ih_lock);
+	return (NULL);
+}
+
+/*
+ * Decrement reference count of an inode structure and unlock it.
+ *
+ * ip -- the inode being released
+ * lock_flags -- this parameter indicates the inode's locks
+ *	 to be released.  See the comment on xfs_iunlock() for a list
+ *	 of valid values.
+ */
+void
+xfs_iput(xfs_inode_t	*ip,
+	 uint		lock_flags)
+{
+	vnode_t *vp = XFS_ITOV(ip);
+
+	vn_trace_entry(vp, "xfs_iput", (inst_t *)__return_address);
+
+	xfs_iunlock(ip, lock_flags);
+
+	VN_RELE(vp);
+}
+
+/*
+ * Special iput for brand-new inodes that are still locked
+ */
+void
+xfs_iput_new(xfs_inode_t	*ip,
+	     uint		lock_flags)
+{
+	vnode_t		*vp = XFS_ITOV(ip);
+	struct inode	*inode = LINVFS_GET_IP(vp);
+
+	vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address);
+
+	/* We shouldn't get here without this being true, but just in case */
+	if (inode->i_state & I_NEW) {
+		make_bad_inode(inode);
+		unlock_new_inode(inode);
+	}
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+	VN_RELE(vp);
+}
+
+
+/*
+ * This routine embodies the part of the reclaim code that pulls
+ * the inode from the inode hash table and the mount structure's
+ * inode list.
+ * This should only be called from xfs_reclaim().
+ */
+void
+xfs_ireclaim(xfs_inode_t *ip)
+{
+	vnode_t		*vp;
+
+	/*
+	 * Remove from old hash list and mount list.
+	 */
+	XFS_STATS_INC(xfsstats.xs_ig_reclaims);
+
+	xfs_iextract(ip);
+
+	/*
+	 * Here we do a spurious inode lock in order to coordinate with
+	 * xfs_sync().	This is because xfs_sync() references the inodes
+	 * in the mount list without taking references on the corresponding
+	 * vnodes.  We make that OK here by ensuring that we wait until
+	 * the inode is unlocked in xfs_sync() before we go ahead and
+	 * free it.  We get both the regular lock and the io lock because
+	 * the xfs_sync() code may need to drop the regular one but will
+	 * still hold the io lock.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+	/*
+	 * Release dquots (and their references) if any. An inode may escape
+	 * xfs_inactive and get here via vn_alloc->vn_reclaim path.
+	 */
+	if (ip->i_udquot || ip->i_gdquot) {
+		xfs_qm_dqdettach_inode(ip);
+	}
+
+	/*
+	 * Pull our behavior descriptor from the vnode chain.
+	 */
+	vp = XFS_ITOV_NULL(ip);
+	if (vp) {
+		vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
+	}
+
+	/*
+	 * Free all memory associated with the inode.
+	 */
+	xfs_idestroy(ip);
+}
+
+/*
+ * This routine removes an about-to-be-destroyed inode from
+ * all of the lists in which it is located, with the exception
+ * of the behavior chain.  It is used by xfs_ireclaim and
+ * by the cxfs relocation code, in which case we are removing
+ * the xfs_inode but leaving the vnode alone since it has
+ * been transformed into a client vnode.
+ */
+void
+xfs_iextract(
+	xfs_inode_t	*ip)
+{
+	xfs_ihash_t	*ih;
+	xfs_inode_t	*iq;
+	xfs_mount_t	*mp;
+	xfs_chash_t	*ch;
+	xfs_chashlist_t *chl, *chm;
+	SPLDECL(s);
+
+	ih = ip->i_hash;
+	write_lock(&ih->ih_lock);
+	if ((iq = ip->i_next)) {
+		iq->i_prevp = ip->i_prevp;
+	}
+	*ip->i_prevp = iq;
+	write_unlock(&ih->ih_lock);
+
+	/*
+	 * Remove from cluster hash list
+	 *   1) delete the chashlist if this is the last inode on the chashlist
+	 *   2) unchain from list of inodes
+	 *   3) if chashlist->chl_ip points to this inode, point it at the next one.
+	 */
+	mp = ip->i_mount;
+	ch = XFS_CHASH(mp, ip->i_blkno);
+	s = mutex_spinlock(&ch->ch_lock);
+
+	if (ip->i_cnext == ip) {
+		/* Last inode on chashlist */
+		ASSERT(ip->i_cnext == ip && ip->i_cprev == ip);
+		ASSERT(ip->i_chash != NULL);
+		chm = NULL;
+		for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
+			if (chl->chl_blkno == ip->i_blkno) {
+				if (chm == NULL) {
+					/* first item on the list */
+					ch->ch_list = chl->chl_next;
+				} else {
+					chm->chl_next = chl->chl_next;
+				}
+				kmem_zone_free(xfs_chashlist_zone, chl);
+				break;
+			} else {
+				ASSERT(chl->chl_ip != ip);
+				chm = chl;
+			}
+		}
+		ASSERT_ALWAYS(chl != NULL);
+	} else {
+		/* delete one inode from a non-empty list */
+		iq = ip->i_cnext;
+		iq->i_cprev = ip->i_cprev;
+		ip->i_cprev->i_cnext = iq;
+		if (ip->i_chash->chl_ip == ip) {
+			ip->i_chash->chl_ip = iq;
+		}
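+		/*
+		 * Poison the stale cluster pointers with the caller's
+		 * return address so a later stray dereference is easy
+		 * to trace back to this reclaim.
+		 */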
+		ip->i_chash = __return_address;
+		ip->i_cprev = __return_address;
+		ip->i_cnext = __return_address;
+	}
+	mutex_spinunlock(&ch->ch_lock, s);
+
+	/*
+	 * Remove from mount's inode list.
+	 */
+	XFS_MOUNT_ILOCK(mp);
+	ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
+	iq = ip->i_mnext;
+	iq->i_mprev = ip->i_mprev;
+	ip->i_mprev->i_mnext = iq;
+
+	/*
+	 * Fix up the head pointer if it points to the inode being deleted.
+	 */
+	if (mp->m_inodes == ip) {
+		if (ip == iq) {
+			mp->m_inodes = NULL;
+		} else {
+			mp->m_inodes = iq;
+		}
+	}
+
+	mp->m_ireclaims++;
+	XFS_MOUNT_IUNLOCK(mp);
+}
+
+/*
+ * This is a wrapper routine around the xfs_ilock() routine
+ * used to centralize some grungy code.	 It is used in places
+ * that wish to lock the inode solely for reading the extents.
+ * The reason these places can't just call xfs_ilock(SHARED)
+ * is that the inode lock also guards the bringing in of the
+ * extents from disk for a file in b-tree format.  If the inode
+ * is in b-tree format, then we need to lock the inode exclusively
+ * until the extents are read in.  Locking it exclusively all
+ * the time would limit our parallelism unnecessarily, though.
+ * What we do instead is check to see if the extents have been
+ * read in yet, and only lock the inode exclusively if they
+ * have not.
+ *
+ * The function returns a value which should be given to the
+ * corresponding xfs_iunlock_map_shared().  This value is
+ * the mode in which the lock was actually taken.
+ */
+uint
+xfs_ilock_map_shared(
+	xfs_inode_t	*ip)
+{
+	uint	lock_mode;
+
+	if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
+	    ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+		lock_mode = XFS_ILOCK_EXCL;
+	} else {
+		lock_mode = XFS_ILOCK_SHARED;
+	}
+
+	xfs_ilock(ip, lock_mode);
+
+	return lock_mode;
+}
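+
+/*
+ * Usage sketch:
+ *	lock_mode = xfs_ilock_map_shared(ip);
+ *	... read the extent list ...
+ *	xfs_iunlock_map_shared(ip, lock_mode);
+ */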
+
+/*
+ * This is simply the unlock routine to go with xfs_ilock_map_shared().
+ * All it does is call xfs_iunlock() with the given lock_mode.
+ */
+void
+xfs_iunlock_map_shared(
+	xfs_inode_t	*ip,
+	unsigned int	lock_mode)
+{
+	xfs_iunlock(ip, lock_mode);
+}
+
+/*
+ * The xfs inode contains 2 locks: a multi-reader lock called the
+ * i_iolock and a multi-reader lock called the i_lock.	This routine
+ * allows either or both of the locks to be obtained.
+ *
+ * The 2 locks should always be ordered so that the IO lock is
+ * obtained first in order to prevent deadlock.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ *	 to be locked.	It can be:
+ *		XFS_IOLOCK_SHARED,
+ *		XFS_IOLOCK_EXCL,
+ *		XFS_ILOCK_SHARED,
+ *		XFS_ILOCK_EXCL,
+ *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
+ *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
+ *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
+ *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ */
+void
+xfs_ilock(xfs_inode_t	*ip,
+	  uint		lock_flags)
+{
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		mrupdate(&ip->i_iolock);
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		mraccess(&ip->i_iolock);
+	}
+	if (lock_flags & XFS_ILOCK_EXCL) {
+		mrupdate(&ip->i_lock);
+	} else if (lock_flags & XFS_ILOCK_SHARED) {
+		mraccess(&ip->i_lock);
+	}
+#ifdef XFS_ILOCK_TRACE
+	xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address);
+#endif
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep.	It returns 1 if it gets
+ * the requested locks and 0 otherwise.	 If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ *	 to be locked.	See the comment for xfs_ilock() for a list
+ *	 of valid values.
+ *
+ */
+int
+xfs_ilock_nowait(xfs_inode_t	*ip,
+		 uint		lock_flags)
+{
+	int	iolocked;
+	int	ilocked;
+
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
+
+	iolocked = 0;
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		iolocked = mrtryupdate(&ip->i_iolock);
+		if (!iolocked) {
+			return 0;
+		}
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		iolocked = mrtryaccess(&ip->i_iolock);
+		if (!iolocked) {
+			return 0;
+		}
+	}
+	if (lock_flags & XFS_ILOCK_EXCL) {
+		ilocked = mrtryupdate(&ip->i_lock);
+		if (!ilocked) {
+			if (iolocked) {
+				mrunlock(&ip->i_iolock);
+			}
+			return 0;
+		}
+	} else if (lock_flags & XFS_ILOCK_SHARED) {
+		ilocked = mrtryaccess(&ip->i_lock);
+		if (!ilocked) {
+			if (iolocked) {
+				mrunlock(&ip->i_iolock);
+			}
+			return 0;
+		}
+	}
+#ifdef XFS_ILOCK_TRACE
+	xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address);
+#endif
+	return 1;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait().	The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks
+ *	 to be unlocked.  See the comment for xfs_ilock() for a list
+ *	 of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(xfs_inode_t *ip,
+	    uint	lock_flags)
+{
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY)) == 0);
+	ASSERT(lock_flags != 0);
+
+	if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
+		ASSERT(!(lock_flags & XFS_IOLOCK_SHARED) ||
+		       (ismrlocked(&ip->i_iolock, MR_ACCESS)));
+		ASSERT(!(lock_flags & XFS_IOLOCK_EXCL) ||
+		       (ismrlocked(&ip->i_iolock, MR_UPDATE)));
+		mrunlock(&ip->i_iolock);
+	}
+
+	if (lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) {
+		ASSERT(!(lock_flags & XFS_ILOCK_SHARED) ||
+		       (ismrlocked(&ip->i_lock, MR_ACCESS)));
+		ASSERT(!(lock_flags & XFS_ILOCK_EXCL) ||
+		       (ismrlocked(&ip->i_lock, MR_UPDATE)));
+		mrunlock(&ip->i_lock);
+
+		/*
+		 * Let the AIL know that this item has been unlocked in case
+		 * it is in the AIL and anyone is waiting on it.  Don't do
+		 * this if the caller has asked us not to.
+		 */
+		if (!(lock_flags & XFS_IUNLOCK_NONOTIFY) &&
+		     ip->i_itemp != NULL) {
+			xfs_trans_unlocked_item(ip->i_mount,
+						(xfs_log_item_t*)(ip->i_itemp));
+		}
+	}
+#ifdef XFS_ILOCK_TRACE
+	xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
+#endif
+}
+
+/*
+ * give up write locks.	 the i/o lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(xfs_inode_t	*ip,
+		 uint		lock_flags)
+{
+	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+	if (lock_flags & XFS_ILOCK_EXCL) {
+		ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+		mrdemote(&ip->i_lock);
+	}
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
+		mrdemote(&ip->i_iolock);
+	}
+}
+
+/*
+ * The following three routines simply manage the i_flock
+ * semaphore embedded in the inode.  This semaphore synchronizes
+ * processes attempting to flush the in-core inode back to disk.
+ */
+void
+xfs_iflock(xfs_inode_t *ip)
+{
+	psema(&(ip->i_flock), PINOD|PLTWAIT);
+}
+
+int
+xfs_iflock_nowait(xfs_inode_t *ip)
+{
+	return (cpsema(&(ip->i_flock)));
+}
+
+void
+xfs_ifunlock(xfs_inode_t *ip)
+{
+	ASSERT(valusema(&(ip->i_flock)) <= 0);
+	vsema(&(ip->i_flock));
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_imap.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_imap.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_imap.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_imap.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_IMAP_H__
+#define __XFS_IMAP_H__
+
+/*
+ * This is the structure passed to xfs_imap() to map
+ * an inode number to its on disk location.
+ */
+typedef struct xfs_imap {
+	xfs_daddr_t	im_blkno;	/* starting BB of inode chunk */
+	uint		im_len;		/* length in BBs of inode chunk */
+	xfs_agblock_t	im_agblkno;	/* logical block of inode chunk in ag */
+	ushort		im_ioffset;	/* inode offset in block in "inodes" */
+	ushort		im_boffset;	/* inode offset in block in bytes */
+} xfs_imap_t;
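+
+/*
+ * Typical use (see xfs_inotobp): zero im_blkno, call xfs_imap() with
+ * XFS_IMAP_LOOKUP, then read im_len BBs starting at im_blkno and find
+ * the on-disk inode at byte offset im_boffset within that buffer.
+ */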
+
+#ifdef __KERNEL__
+struct xfs_mount;
+struct xfs_trans;
+int	xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
+		 xfs_imap_t *, uint);
+#endif
+
+#endif	/* __XFS_IMAP_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_inode.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_inode.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_inode.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_inode.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,3626 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+kmem_zone_t *xfs_ifork_zone;
+kmem_zone_t *xfs_inode_zone;
+kmem_zone_t *xfs_chashlist_zone;
+
+/*
+ * Used in xfs_itruncate().  This is the maximum number of extents
+ * freed from a file in a single transaction.
+ */
+#define XFS_ITRUNC_MAX_EXTENTS	2
+
+STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
+STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
+STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
+STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+
+
+#ifdef DEBUG
+/*
+ * Make sure that the extents in the given memory buffer
+ * are valid.
+ */
+STATIC void
+xfs_validate_extents(
+	xfs_bmbt_rec_32_t	*ep,
+	int			nrecs,
+	xfs_exntfmt_t		fmt)
+{
+	xfs_bmbt_irec_t		irec;
+	int			i;
+	xfs_bmbt_rec_t		rec;
+
+	for (i = 0; i < nrecs; i++) {
+		bcopy(ep, &rec, sizeof(rec));
+		xfs_bmbt_get_all(&rec, &irec);
+		if (fmt == XFS_EXTFMT_NOSTATE)
+			ASSERT(irec.br_state == XFS_EXT_NORM);
+		ep++;
+	}
+}
+#else /* DEBUG */
+#define xfs_validate_extents(ep, nrecs, fmt)
+#endif /* DEBUG */
+
+/*
+ * Check that none of the inodes in the buffer have a next
+ * unlinked field of 0.
+ */
+#if defined(DEBUG)
+void
+xfs_inobp_check(
+	xfs_mount_t	*mp,
+	xfs_buf_t	*bp)
+{
+	int		i;
+	int		j;
+	xfs_dinode_t	*dip;
+
+	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+	for (i = 0; i < j; i++) {
+		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+					i * mp->m_sb.sb_inodesize);
+		if (INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT))  {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"Detected a bogus zero next_unlinked field in incore inode buffer 0x%p.	 About to pop an ASSERT.",
+				bp);
+			ASSERT(!INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT));
+		}
+	}
+}
+#endif
+
+/*
+ * called from bwrite on xfs inode buffers
+ */
+void
+xfs_inobp_bwcheck(xfs_buf_t *bp)
+{
+	xfs_mount_t	*mp;
+	int		i;
+	int		j;
+	xfs_dinode_t	*dip;
+
+	ASSERT(XFS_BUF_FSPRIVATE3(bp, void *) != NULL);
+
+	mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
+
+	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+	for (i = 0; i < j; i++)	 {
+		dip = (xfs_dinode_t *) xfs_buf_offset(bp,
+						i * mp->m_sb.sb_inodesize);
+		if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
+			cmn_err(CE_WARN,
+"Bad magic # 0x%x in XFS inode buffer 0x%Lx, starting blockno %Ld, offset 0x%x",
+				INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
+				(__uint64_t)(__psunsigned_t) bp,
+				(__int64_t) XFS_BUF_ADDR(bp),
+				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+			xfs_fs_cmn_err(CE_WARN, mp,
+				"corrupt, unmount and run xfs_repair");
+		}
+		if (INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT))  {
+			cmn_err(CE_WARN,
+"Bad next_unlinked field (0) in XFS inode buffer 0x%x, starting blockno %Ld, offset 0x%x",
+				(__uint64_t)(__psunsigned_t) bp,
+				(__int64_t) XFS_BUF_ADDR(bp),
+				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+			xfs_fs_cmn_err(CE_WARN, mp,
+				"corrupt, unmount and run xfs_repair");
+		}
+	}
+
+	return;
+}
+
+/*
+ * This routine is called to map an inode number within a file
+ * system to the buffer containing the on-disk version of the
+ * inode.  It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter, and in the dip parameter
+ * it returns a pointer to the on-disk inode within that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and
+ * dipp are undefined.
+ *
+ * Use xfs_imap() to determine the size and location of the
+ * buffer to read from disk.
+ */
+int
+xfs_inotobp(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	xfs_dinode_t	**dipp,
+	xfs_buf_t	**bpp,
+	int		*offset)
+{
+	int		di_ok;
+	xfs_imap_t	imap;
+	xfs_buf_t	*bp;
+	int		error;
+	xfs_dinode_t	*dip;
+
+	/*
+	 * Call the space management code to find the location of the
+	 * inode on disk.
+	 */
+	imap.im_blkno = 0;
+	error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
+	if (error != 0) {
+		cmn_err(CE_WARN,
+	"xfs_inotobp: xfs_imap()  returned an "
+	"error %d on %s.  Returning error.", error, mp->m_fsname);
+		return error;
+	}
+
+	/*
+	 * If the inode number maps to a block outside the bounds of the
+ * file system then return an error rather than calling read_buf
+ * and panicking when we get an error from the driver.
+	 */
+	if ((imap.im_blkno + imap.im_len) >
+	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+		cmn_err(CE_WARN,
+	"xfs_inotobp: inode number (%llu + %u) maps to a block outside the bounds "
+	"of the file system %s.  Returning EINVAL.",
+			(unsigned long long)imap.im_blkno, imap.im_len, mp->m_fsname);
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * Read in the buffer.	If tp is NULL, xfs_trans_read_buf() will
+	 * default to just a read_buf() call.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
+				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
+
+	if (error) {
+		cmn_err(CE_WARN,
+	"xfs_inotobp: xfs_trans_read_buf()  returned an "
+	"error %d on %s.  Returning error.", error, mp->m_fsname);
+		return error;
+	}
+	dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
+	di_ok =
+		INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
+		XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
+	if (XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
+			XFS_RANDOM_ITOBP_INOTOBP)) {
+		xfs_trans_brelse(tp, bp);
+		cmn_err(CE_WARN,
+	"xfs_inotobp: XFS_TEST_ERROR()	returned an "
+	"error on %s.  Returning EFSCORRUPTED.",  mp->m_fsname);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	xfs_inobp_check(mp, bp);
+
+	/*
+	 * Set *dipp to point to the on-disk inode in the buffer.
+	 */
+	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+	*bpp = bp;
+	*offset = imap.im_boffset;
+	return 0;
+}
+
+
+/*
+ * This routine is called to map an inode to the buffer containing
+ * the on-disk version of the inode.  It returns a pointer to the
+ * buffer containing the on-disk inode in the bpp parameter, and in
+ * the dip parameter it returns a pointer to the on-disk inode within
+ * that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and
+ * dipp are undefined.
+ *
+ * If the inode is new and has not yet been initialized, use xfs_imap()
+ * to determine the size and location of the buffer to read from disk.
+ * If the inode has already been mapped to its buffer and read in once,
+ * then use the mapping information stored in the inode rather than
+ * calling xfs_imap().	This allows us to avoid the overhead of looking
+ * at the inode btree for small block file systems (see xfs_dilocate()).
+ * We can tell whether the inode has been mapped in before by comparing
+ * its disk block address to 0.	 Only uninitialized inodes will have
+ * 0 for the disk block address.
+ */
+int
+xfs_itobp(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_dinode_t	**dipp,
+	xfs_buf_t	**bpp,
+	xfs_daddr_t	bno)
+{
+	xfs_buf_t	*bp;
+	int		error;
+	xfs_imap_t	imap;
+#ifdef __KERNEL__
+	int		i;
+	int		ni;
+#endif
+
+	if (ip->i_blkno == (xfs_daddr_t)0) {
+		/*
+		 * Call the space management code to find the location of the
+		 * inode on disk.
+		 */
+		imap.im_blkno = bno;
+		error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP);
+		if (error != 0) {
+			return error;
+		}
+
+		/*
+		 * If the inode number maps to a block outside the bounds
+		 * of the file system then return an error rather than
+		 * calling read_buf and panicking when we get an error
+		 * from the driver.
+		 */
+		if ((imap.im_blkno + imap.im_len) >
+		    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
+					"(imap.im_blkno (0x%llx) "
+					"+ imap.im_len (0x%llx)) > "
+					" XFS_FSB_TO_BB(mp, "
+					"mp->m_sb.sb_dblocks) (0x%llx)",
+					(unsigned long long) imap.im_blkno,
+					(unsigned long long) imap.im_len,
+					XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+#endif /* DEBUG */
+			return XFS_ERROR(EINVAL);
+		}
+
+		/*
+		 * Fill in the fields in the inode that will be used to
+		 * map the inode to its buffer from now on.
+		 */
+		ip->i_blkno = imap.im_blkno;
+		ip->i_len = imap.im_len;
+		ip->i_boffset = imap.im_boffset;
+	} else {
+		/*
+		 * We've already mapped the inode once, so just use the
+		 * mapping that we saved the first time.
+		 */
+		imap.im_blkno = ip->i_blkno;
+		imap.im_len = ip->i_len;
+		imap.im_boffset = ip->i_boffset;
+	}
+	ASSERT(bno == 0 || bno == imap.im_blkno);
+
+	/*
+	 * Read in the buffer.	If tp is NULL, xfs_trans_read_buf() will
+	 * default to just a read_buf() call.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
+				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
+
+	if (error) {
+#ifdef DEBUG
+		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
+				"xfs_trans_read_buf() returned error %d, "
+				"imap.im_blkno 0x%llx, imap.im_len 0x%llx",
+				error, (unsigned long long) imap.im_blkno,
+				(unsigned long long) imap.im_len);
+#endif /* DEBUG */
+		return error;
+	}
+#ifdef __KERNEL__
+	/*
+	 * Validate the magic number and version of every inode in the
+	 * buffer (on DEBUG kernels), or of just the first inode otherwise.
+	 */
+#ifdef DEBUG
+	ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
+#else
+	ni = 1;
+#endif
+	for (i = 0; i < ni; i++) {
+		int		di_ok;
+		xfs_dinode_t	*dip;
+
+		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+					(i << mp->m_sb.sb_inodelog));
+		di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
+			    XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
+		if (XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
+				 XFS_RANDOM_ITOBP_INOTOBP)) {
+#ifdef DEBUG
+			prdev("bad inode magic/vsn daddr 0x%llx #%d (magic=%x)",
+				mp->m_dev, (unsigned long long)imap.im_blkno, i,
+				INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
+#endif
+			xfs_trans_brelse(tp, bp);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+	}
+#endif	/* __KERNEL__ */
+
+	xfs_inobp_check(mp, bp);
+
+	/*
+	 * Mark the buffer as an inode buffer now that it looks good
+	 */
+	XFS_BUF_SET_VTYPE(bp, B_FS_INO);
+
+	/*
+	 * Set *dipp to point to the on-disk inode in the buffer.
+	 */
+	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+	*bpp = bp;
+	return 0;
+}
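+
+/*
+ * Note on buffer ownership: on success the buffer returned in *bpp by
+ * xfs_itobp() (and xfs_inotobp() above) is locked.  Callers that only
+ * read the inode release it with xfs_trans_brelse(), as xfs_iread()
+ * does below; callers that modify it log it in the transaction
+ * instead, as xfs_iunlink() does.
+ */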
+
+/*
+ * Move inode type and inode format specific information from the
+ * on-disk inode to the in-core inode.  For fifos, devs, and sockets
+ * this means setting if_rdev to the proper value.  For files,
+ * directories, and symlinks this means bringing in the in-line data
+ * or extent pointers.  For a file in B-tree format, only the root is
+ * immediately brought in-core.  The rest will be read into if_extents
+ * when it is first referenced (see xfs_iread_extents()).
+ */
+STATIC int
+xfs_iformat(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip)
+{
+	xfs_attr_shortform_t	*atp;
+	int			size;
+	int			error;
+	xfs_fsize_t		di_size;
+
+	ip->i_df.if_ext_max =
+		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	error = 0;
+
+	if (INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) +
+		INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) >
+	    INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu."
+			"  Unmount and run xfs_repair.",
+			(unsigned long long)ip->i_ino,
+			(int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT)
+			    + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)),
+			(unsigned long long)
+			INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT));
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt dinode %Lu, forkoff = 0x%x."
+			"  Unmount and run xfs_repair.",
+			(unsigned long long)ip->i_ino,
+			(int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT)));
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	switch (ip->i_d.di_mode & IFMT) {
+	case IFIFO:
+	case IFCHR:
+	case IFBLK:
+	case IFSOCK:
+		if (INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)
+			return XFS_ERROR(EFSCORRUPTED);
+		ip->i_d.di_size = 0;
+		ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
+		break;
+
+	case IFREG:
+	case IFLNK:
+	case IFDIR:
+		switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) {
+		case XFS_DINODE_FMT_LOCAL:
+			/*
+			 * no local regular files yet
+			 */
+			if ((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & IFMT) == IFREG) {
+				xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+					"corrupt inode "
+					"(local format for regular file) %Lu.  "
+					"Unmount and run xfs_repair.",
+					(unsigned long long) ip->i_ino);
+				return XFS_ERROR(EFSCORRUPTED);
+			}
+
+			di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT);
+			if (di_size >
+			    XFS_DFORK_DSIZE_ARCH(dip, ip->i_mount, ARCH_CONVERT)) {
+				xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+					"corrupt inode %Lu "
+					"(bad size %Ld for local inode).  "
+					"Unmount and run xfs_repair.",
+					(unsigned long long) ip->i_ino,
+					(long long) di_size);
+				return XFS_ERROR(EFSCORRUPTED);
+			}
+
+			size = (int)di_size;
+			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
+			break;
+		case XFS_DINODE_FMT_EXTENTS:
+			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+			break;
+		case XFS_DINODE_FMT_BTREE:
+			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+			break;
+		default:
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		break;
+
+	default:
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	if (error) {
+		return error;
+	}
+	if (!XFS_DFORK_Q_ARCH(dip, ARCH_CONVERT))
+		return 0;
+	ASSERT(ip->i_afp == NULL);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp->if_ext_max =
+		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) {
+	case XFS_DINODE_FMT_LOCAL:
+		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR_ARCH(dip, ARCH_CONVERT);
+		size = (int)INT_GET(atp->hdr.totsize, ARCH_CONVERT);
+		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+		break;
+	default:
+		error = XFS_ERROR(EFSCORRUPTED);
+		break;
+	}
+	if (error) {
+		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+		ip->i_afp = NULL;
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+	}
+	return error;
+}
+
+/*
+ * The file is in-lined in the on-disk inode.
+ * If it fits into if_inline_data, then copy
+ * it there, otherwise allocate a buffer for it
+ * and copy the data there.  Either way, set
+ * if_data to point at the data.
+ * If we allocate a buffer for the data, make
+ * sure that its size is a multiple of 4 and
+ * record the real size in i_real_bytes.
+ */
+STATIC int
+xfs_iformat_local(
+	xfs_inode_t	*ip,
+	xfs_dinode_t	*dip,
+	int		whichfork,
+	int		size)
+{
+	xfs_ifork_t	*ifp;
+	int		real_size;
+
+	/*
+	 * If the size is unreasonable, then something
+	 * is wrong and we just bail out rather than crash in
+	 * kmem_alloc() or bcopy() below.
+	 */
+	if (size > XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt inode %Lu "
+			"(bad size %d for local fork, size = %d).  "
+			"Unmount and run xfs_repair.",
+			(unsigned long long) ip->i_ino, size,
+			XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT));
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	real_size = 0;
+	if (size == 0)
+		ifp->if_u1.if_data = NULL;
+	else if (size <= sizeof(ifp->if_u2.if_inline_data))
+		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+	else {
+		real_size = roundup(size, 4);
+		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+	}
+	ifp->if_bytes = size;
+	ifp->if_real_bytes = real_size;
+	if (size)
+		bcopy(XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT), ifp->if_u1.if_data, size);
+	ifp->if_flags &= ~XFS_IFEXTENTS;
+	ifp->if_flags |= XFS_IFINLINE;
+	return 0;
+}
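+
+/*
+ * For example (illustrative): a symlink target short enough to fit in
+ * if_inline_data is copied there and if_real_bytes stays 0, while a
+ * longer target gets a kmem_alloc()ed buffer whose size is rounded up
+ * to a multiple of 4, with that rounded size recorded in
+ * if_real_bytes.
+ */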
+
+/*
+ * The file consists of a set of extents all
+ * of which fit into the on-disk inode.
+ * If there are few enough extents to fit into
+ * the if_inline_ext, then copy them there.
+ * Otherwise allocate a buffer for them and copy
+ * them into it.  Either way, set if_extents
+ * to point at the extents.
+ */
+STATIC int
+xfs_iformat_extents(
+	xfs_inode_t	*ip,
+	xfs_dinode_t	*dip,
+	int		whichfork)
+{
+	xfs_ifork_t	*ifp;
+	int		nex;
+	int		real_size;
+	int		size;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	nex = XFS_DFORK_NEXTENTS_ARCH(dip, whichfork, ARCH_CONVERT);
+	size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+
+	/*
+	 * If the number of extents is unreasonable, then something
+	 * is wrong and we just bail out rather than crash in
+	 * kmem_alloc() or bcopy() below.
+	 */
+	if (size < 0 || size > XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt inode %Lu ((a)extents = %d).  "
+			"Unmount and run xfs_repair.",
+			(unsigned long long) ip->i_ino, nex);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	real_size = 0;
+	if (nex == 0)
+		ifp->if_u1.if_extents = NULL;
+	else if (nex <= XFS_INLINE_EXTS)
+		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+	else {
+		ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
+		ASSERT(ifp->if_u1.if_extents != NULL);
+		real_size = size;
+	}
+	ifp->if_bytes = size;
+	ifp->if_real_bytes = real_size;
+	if (size) {
+		xfs_validate_extents(
+			(xfs_bmbt_rec_32_t *)XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT),
+			nex, XFS_EXTFMT_INODE(ip));
+		bcopy(XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT), ifp->if_u1.if_extents,
+		      size);
+		xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex,
+			whichfork);
+		if ((whichfork != XFS_DATA_FORK ||
+		     XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) &&
+		    xfs_check_nostate_extents(ifp->if_u1.if_extents, nex))
+			return XFS_ERROR(EFSCORRUPTED);
+	}
+	ifp->if_flags |= XFS_IFEXTENTS;
+	return 0;
+}
+
+/*
+ * The file has too many extents to fit into
+ * the inode, so they are in B-tree format.
+ * Allocate a buffer for the root of the B-tree
+ * and copy the root into it.  The i_extents
+ * field will remain NULL until all of the
+ * extents are read in (when they are needed).
+ */
+STATIC int
+xfs_iformat_btree(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip,
+	int			whichfork)
+{
+	xfs_bmdr_block_t	*dfp;
+	xfs_ifork_t		*ifp;
+	/* REFERENCED */
+	int			nrecs;
+	int			size;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT);
+	size = XFS_BMAP_BROOT_SPACE(dfp);
+	nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
+
+	/*
+	 * Blow out if: the fork has fewer extents than can fit in the
+	 * fork (in which case the fork shouldn't be in btree format),
+	 * the root btree block has more records than can fit into the
+	 * fork, or the number of extents is greater than the number of
+	 * blocks.
+	 */
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
+	    || XFS_BMDR_SPACE_CALC(nrecs) >
+			XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT)
+	    || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt inode %Lu (btree).  "
+			"Unmount and run xfs_repair.",
+			(unsigned long long) ip->i_ino);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	ifp->if_broot_bytes = size;
+	ifp->if_broot = kmem_alloc(size, KM_SLEEP);
+	ASSERT(ifp->if_broot != NULL);
+	/*
+	 * Copy and convert from the on-disk structure
+	 * to the in-memory structure.
+	 */
+	xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT),
+		ifp->if_broot, size);
+	ifp->if_flags &= ~XFS_IFEXTENTS;
+	ifp->if_flags |= XFS_IFBROOT;
+
+	return 0;
+}
+
+/*
+ * xfs_xlate_dinode_core - translate an xfs_dinode_core_t between
+ * on-disk and native format
+ *
+ * buf	= on-disk representation
+ * dip	= native representation
+ * dir	= direction - +ve -> disk to native
+ *		      -ve -> native to disk
+ * arch = on-disk architecture
+ */
+
+void
+xfs_xlate_dinode_core(xfs_caddr_t buf, xfs_dinode_core_t *dip,
+    int dir, xfs_arch_t arch)
+{
+    xfs_dinode_core_t	*buf_core;
+    xfs_dinode_core_t	*mem_core;
+
+    ASSERT(dir);
+
+    buf_core = (xfs_dinode_core_t *)buf;
+    mem_core = (xfs_dinode_core_t *)dip;
+
+    if (arch == ARCH_NOCONVERT) {
+	if (dir>0) {
+	    bcopy((xfs_caddr_t)buf_core, (xfs_caddr_t)mem_core, sizeof(xfs_dinode_core_t));
+	} else {
+	    bcopy((xfs_caddr_t)mem_core, (xfs_caddr_t)buf_core, sizeof(xfs_dinode_core_t));
+	}
+	return;
+    }
+
+    INT_XLATE(buf_core->di_magic,	mem_core->di_magic,	   dir, arch);
+    INT_XLATE(buf_core->di_mode,	mem_core->di_mode,	   dir, arch);
+    INT_XLATE(buf_core->di_version,	mem_core->di_version,	   dir, arch);
+    INT_XLATE(buf_core->di_format,	mem_core->di_format,	   dir, arch);
+    INT_XLATE(buf_core->di_onlink,	mem_core->di_onlink,	   dir, arch);
+    INT_XLATE(buf_core->di_uid,		mem_core->di_uid,	   dir, arch);
+    INT_XLATE(buf_core->di_gid,		mem_core->di_gid,	   dir, arch);
+    INT_XLATE(buf_core->di_nlink,	mem_core->di_nlink,	   dir, arch);
+    INT_XLATE(buf_core->di_projid,	mem_core->di_projid,	   dir, arch);
+
+    if (dir>0) {
+	bcopy(buf_core->di_pad, mem_core->di_pad, sizeof(buf_core->di_pad));
+    } else {
+	bcopy(mem_core->di_pad, buf_core->di_pad, sizeof(buf_core->di_pad));
+    }
+
+    INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec,  dir, arch);
+    INT_XLATE(buf_core->di_atime.t_nsec,mem_core->di_atime.t_nsec, dir, arch);
+
+    INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec,  dir, arch);
+    INT_XLATE(buf_core->di_mtime.t_nsec,mem_core->di_mtime.t_nsec, dir, arch);
+
+    INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec,  dir, arch);
+    INT_XLATE(buf_core->di_ctime.t_nsec,mem_core->di_ctime.t_nsec, dir, arch);
+
+    INT_XLATE(buf_core->di_size,	mem_core->di_size,	   dir, arch);
+    INT_XLATE(buf_core->di_nblocks,	mem_core->di_nblocks,	   dir, arch);
+    INT_XLATE(buf_core->di_extsize,	mem_core->di_extsize,	   dir, arch);
+
+    INT_XLATE(buf_core->di_nextents,	mem_core->di_nextents,	   dir, arch);
+    INT_XLATE(buf_core->di_anextents,	mem_core->di_anextents,	   dir, arch);
+    INT_XLATE(buf_core->di_forkoff,	mem_core->di_forkoff,	   dir, arch);
+    INT_XLATE(buf_core->di_aformat,	mem_core->di_aformat,	   dir, arch);
+    INT_XLATE(buf_core->di_dmevmask,	mem_core->di_dmevmask,	   dir, arch);
+    INT_XLATE(buf_core->di_dmstate,	mem_core->di_dmstate,	   dir, arch);
+    INT_XLATE(buf_core->di_flags,	mem_core->di_flags,	   dir, arch);
+    INT_XLATE(buf_core->di_gen,		mem_core->di_gen,	   dir, arch);
+
+}
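+
+/*
+ * Typical usage (illustrative): xfs_iread() below converts a just-read
+ * inode from on-disk to native format with
+ *
+ *	xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, &ip->i_d,
+ *			      1, ARCH_CONVERT);
+ *
+ * while a negative direction argument performs the reverse, native to
+ * on-disk, conversion when the inode is flushed back out.
+ */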
+
+/*
+ * Given a mount structure and an inode number, return a pointer
+ * to a newly allocated in-core inode corresponding to the given
+ * inode number.
+ *
+ * Initialize the inode's attributes and extent pointers if it
+ * already has them (it will not if the inode has no links).
+ */
+int
+xfs_iread(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	xfs_inode_t	**ipp,
+	xfs_daddr_t	bno)
+{
+	xfs_buf_t	*bp;
+	xfs_dinode_t	*dip;
+	xfs_inode_t	*ip;
+	int		error;
+
+	ASSERT(xfs_inode_zone != NULL);
+
+	ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
+	ip->i_ino = ino;
+	ip->i_mount = mp;
+
+	/*
+	 * Get pointers to the on-disk inode and the buffer containing it.
+	 * If the inode number refers to a block outside the file system
+	 * then xfs_itobp() will return an error, which we propagate.
+	 * i_blkno is still 0 from the zeroed allocation above, so
+	 * xfs_itobp() knows that this is a new incore inode.
+	 */
+	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno);
+
+	if (error != 0) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return error;
+	}
+
+	/*
+	 * Initialize inode's trace buffers.
+	 * Do this before xfs_iformat in case it adds entries.
+	 */
+#ifdef XFS_BMAP_TRACE
+	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_BMBT_TRACE
+	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_RW_TRACE
+	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_STRAT_TRACE
+	ip->i_strat_trace = ktrace_alloc(XFS_STRAT_KTRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_ILOCK_TRACE
+	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_DIR2_TRACE
+	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
+#endif
+
+	/*
+	 * If we got something that isn't an inode it means someone
+	 * (nfs or dmi) has a stale handle.
+	 */
+	if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		xfs_trans_brelse(tp, bp);
+#ifdef DEBUG
+		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
+				"dip->di_core.di_magic (0x%x) != "
+				"XFS_DINODE_MAGIC (0x%x)",
+				INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
+				XFS_DINODE_MAGIC);
+#endif /* DEBUG */
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * If the on-disk inode is already linked to a directory
+	 * entry, copy all of the inode into the in-core inode.
+	 * xfs_iformat() handles copying in the inode format
+	 * specific information.
+	 * Otherwise, just get the truly permanent information.
+	 */
+	if (!INT_ISZERO(dip->di_core.di_mode, ARCH_CONVERT)) {
+		xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
+		     &(ip->i_d), 1, ARCH_CONVERT);
+		error = xfs_iformat(ip, dip);
+		if (error)  {
+			kmem_zone_free(xfs_inode_zone, ip);
+			xfs_trans_brelse(tp, bp);
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
+					"xfs_iformat() returned error %d",
+					error);
+#endif /* DEBUG */
+			return error;
+		}
+	} else {
+		ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT);
+		ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT);
+		ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT);
+		/*
+		 * Make sure to pull in the mode here as well in
+		 * case the inode is released without being used.
+		 * This ensures that xfs_inactive() will see that
+		 * the inode is already free and not try to mess
+		 * with the uninitialized part of it.
+		 */
+		ip->i_d.di_mode = 0;
+		/*
+		 * Initialize the per-fork minima and maxima for a new
+		 * inode here.	xfs_iformat will do it for old inodes.
+		 */
+		ip->i_df.if_ext_max =
+			XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	}
+
+	/*
+	 * The inode format changed when we moved the link count and
+	 * made it 32 bits long.  If this is an old format inode,
+	 * convert it in memory to look like a new one.	 If it gets
+	 * flushed to disk we will convert back before flushing or
+	 * logging it.	We zero out the new projid field and the old link
+	 * count field.	 We'll handle clearing the pad field (the remains
+	 * of the old uuid field) when we actually convert the inode to
+	 * the new format. We don't change the version number so that we
+	 * can distinguish this from a real new format inode.
+	 */
+	if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+		ip->i_d.di_nlink = ip->i_d.di_onlink;
+		ip->i_d.di_onlink = 0;
+		ip->i_d.di_projid = 0;
+	}
+
+	ip->i_delayed_blks = 0;
+
+	/*
+	 * Mark the buffer containing the inode as something to keep
+	 * around for a while.	This helps to keep recently accessed
+	 * meta-data in-core longer.
+	 */
+	XFS_BUF_SET_REF(bp, XFS_INO_REF);
+
+	/*
+	 * Use xfs_trans_brelse() to release the buffer containing the
+	 * on-disk inode, because it was acquired with xfs_trans_read_buf()
+	 * in xfs_itobp() above.  If tp is NULL, this is just a normal
+	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
+	 * will only release the buffer if it is not dirty within the
+	 * transaction.	 It will be OK to release the buffer in this case,
+	 * because inodes on disk are never destroyed and we will be
+	 * locking the new in-core inode before putting it in the hash
+	 * table where other processes can find it.  Thus we don't have
+	 * to worry about the inode being changed just because we released
+	 * the buffer.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*ipp = ip;
+	return 0;
+}
+
+/*
+ * Read in extents from a btree-format inode.
+ * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
+ */
+int
+xfs_iread_extents(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	int		error;
+	xfs_ifork_t	*ifp;
+	size_t		size;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+		return XFS_ERROR(EFSCORRUPTED);
+	size = XFS_IFORK_NEXTENTS(ip, whichfork) * (uint)sizeof(xfs_bmbt_rec_t);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	/*
+	 * We know that the size is legal (it's checked in iformat_btree)
+	 */
+	ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
+	ASSERT(ifp->if_u1.if_extents != NULL);
+	ifp->if_lastex = NULLEXTNUM;
+	ifp->if_bytes = ifp->if_real_bytes = (int)size;
+	ifp->if_flags |= XFS_IFEXTENTS;
+	error = xfs_bmap_read_extents(tp, ip, whichfork);
+	if (error) {
+		kmem_free(ifp->if_u1.if_extents, size);
+		ifp->if_u1.if_extents = NULL;
+		ifp->if_bytes = ifp->if_real_bytes = 0;
+		ifp->if_flags &= ~XFS_IFEXTENTS;
+		return error;
+	}
+	xfs_validate_extents((xfs_bmbt_rec_32_t *)ifp->if_u1.if_extents,
+		XFS_IFORK_NEXTENTS(ip, whichfork), XFS_EXTFMT_INODE(ip));
+	return 0;
+}
+
+/*
+ * Allocate an inode on disk and return a copy of its in-core version.
+ * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
+ * appropriately within the inode.  The uid and gid for the inode are
+ * set according to the contents of the given cred structure.
+ *
+ * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
+ * has a free inode available, call xfs_iget()
+ * to obtain the in-core version of the allocated inode.  Finally,
+ * fill in the inode and log its initial contents.  In this case,
+ * ialloc_context would be set to NULL and call_again set to false.
+ *
+ * If xfs_dialloc() does not have an available inode,
+ * it will replenish its supply by doing an allocation. Since we can
+ * only do one allocation within a transaction without deadlocks, we
+ * must commit the current transaction before returning the inode itself.
+ * In this case, therefore, we will set call_again to true and return.
+ * The caller should then commit the current transaction, start a new
+ * transaction, and call xfs_ialloc() again to actually get the inode.
+ *
+ * To ensure that some other process does not grab the inode that
+ * was allocated during the first call to xfs_ialloc(), this routine
+ * also returns the [locked] bp pointing to the head of the freelist
+ * as ialloc_context.  The caller should hold this buffer across
+ * the commit and pass it back into this routine on the second call.
+ */
+int
+xfs_ialloc(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*pip,
+	mode_t		mode,
+	nlink_t		nlink,
+	dev_t		rdev,
+	cred_t		*cr,
+	xfs_prid_t	prid,
+	int		okalloc,
+	xfs_buf_t	**ialloc_context,
+	boolean_t	*call_again,
+	xfs_inode_t	**ipp)
+{
+	xfs_ino_t	ino;
+	xfs_inode_t	*ip;
+	vnode_t		*vp;
+	uint		flags;
+	int		error;
+
+	/*
+	 * Call the space management code to pick
+	 * the on-disk inode to be allocated.
+	 */
+	ASSERT(pip != NULL);
+	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
+			    ialloc_context, call_again, &ino);
+	if (error != 0) {
+		return error;
+	}
+	if (*call_again || ino == NULLFSINO) {
+		*ipp = NULL;
+		return 0;
+	}
+	ASSERT(*ialloc_context == NULL);
+
+	/*
+	 * Get the in-core inode with the lock held exclusively.
+	 * This is because we're setting fields here we need
+	 * to prevent others from looking at until we're done.
+	 */
+	error = xfs_trans_iget(tp->t_mountp, tp, ino, XFS_ILOCK_EXCL, &ip);
+	if (error != 0) {
+		return error;
+	}
+	ASSERT(ip != NULL);
+
+	vp = XFS_ITOV(ip);
+	vp->v_type = IFTOVT(mode);
+	ip->i_d.di_mode = (__uint16_t)mode;
+	ip->i_d.di_onlink = 0;
+	ip->i_d.di_nlink = nlink;
+	ASSERT(ip->i_d.di_nlink == nlink);
+	ip->i_d.di_uid = current->fsuid;
+	ip->i_d.di_gid = current->fsgid;
+	ip->i_d.di_projid = prid;
+	bzero(&(ip->i_d.di_pad[0]), sizeof(ip->i_d.di_pad));
+
+	/* now that we have a v_type we can set Linux inode ops (& unlock) */
+	linvfs_set_inode_ops(LINVFS_GET_IP(XFS_ITOV(ip)));
+
+	/*
+	 * If the superblock version is up to where we support new format
+	 * inodes and this is currently an old format inode, then change
+	 * the inode version number now.  This way we only do the conversion
+	 * here rather than here and in the flush/logging code.
+	 */
+	if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) &&
+	    ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+		ip->i_d.di_version = XFS_DINODE_VERSION_2;
+		/*
+		 * We've already zeroed the old link count, the projid field,
+		 * and the pad field.
+		 */
+	}
+
+	/*
+	 * Project ids won't be stored on disk if we are using a version 1
+	 * inode, so bump the inode to version 2 if a project id was given.
+	 */
+	if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
+		xfs_bump_ino_vers2(tp, ip);
+
+	if (XFS_INHERIT_GID(pip, vp->v_vfsp)) {
+		ip->i_d.di_gid = pip->i_d.di_gid;
+		if ((pip->i_d.di_mode & ISGID) && (mode & IFMT) == IFDIR) {
+			ip->i_d.di_mode |= ISGID;
+		}
+	}
+
+	/*
+	 * If the group ID of the new file does not match the effective group
+	 * ID or one of the supplementary group IDs, the ISGID bit is
+	 * cleared if the "irixsgid" mount option is set.
+	 */
+	if (ip->i_d.di_mode & ISGID) {
+		if (!in_group_p((gid_t)ip->i_d.di_gid)
+		    && (ip->i_mount->m_flags & XFS_MOUNT_IRIXSGID)) {
+			ip->i_d.di_mode &= ~ISGID;
+		}
+	}
+
+	ip->i_d.di_size = 0;
+	ip->i_d.di_nextents = 0;
+	ASSERT(ip->i_d.di_nblocks == 0);
+	xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
+	/*
+	 * di_gen will have been taken care of in xfs_iread.
+	 */
+	ip->i_d.di_extsize = 0;
+	ip->i_d.di_dmevmask = 0;
+	ip->i_d.di_dmstate = 0;
+	ip->i_d.di_flags = 0;
+	flags = XFS_ILOG_CORE;
+	switch (mode & IFMT) {
+	case IFIFO:
+	case IFCHR:
+	case IFBLK:
+	case IFSOCK:
+		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
+		ip->i_df.if_u2.if_rdev = IRIX_MKDEV(MAJOR(rdev), MINOR(rdev));
+		ip->i_df.if_flags = 0;
+		flags |= XFS_ILOG_DEV;
+		break;
+	case IFREG:
+	case IFDIR:
+	case IFLNK:
+		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+		ip->i_df.if_flags = XFS_IFEXTENTS;
+		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
+		ip->i_df.if_u1.if_extents = NULL;
+		break;
+	default:
+		ASSERT(0);
+	}
+	/*
+	 * Attribute fork settings for new inode.
+	 */
+	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	ip->i_d.di_anextents = 0;
+
+#if DEBUG
+	{
+		uint	badflags = VNOSWAP |
+			       VISSWAP |
+			       VREPLICABLE |
+			   /*  VNONREPLICABLE | XXX uncomment this */
+			       VDOCMP |
+			       VFRLOCKS;
+
+		/*
+		 * For shared mounts, VNOSWAP is set in xfs_iget
+		 */
+		if (tp->t_mountp->m_cxfstype != XFS_CXFS_NOT)
+			badflags &= ~VNOSWAP;
+
+		ASSERT(!(vp->v_flag & badflags));
+	}
+#endif /* DEBUG */
+
+	/*
+	 * Log the new values stuffed into the inode.
+	 */
+	xfs_trans_log_inode(tp, ip, flags);
+	*ipp = ip;
+	return 0;
+}
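+
+/*
+ * Rough sketch of the two-call protocol described above (illustrative
+ * only; reservation sizes and error handling omitted):
+ *
+ *	error = xfs_ialloc(tp, pip, mode, nlink, rdev, cr, prid,
+ *			   okalloc, &ialloc_context, &call_again, &ip);
+ *	if (!error && call_again) {
+ *		hold ialloc_context, commit tp, start and reserve a
+ *		new transaction, then call xfs_ialloc() again to get
+ *		the in-core inode.
+ *	}
+ */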
+
+/*
+ * Check to make sure that there are no blocks allocated to the
+ * file beyond the size of the file.  We don't check this for
+ * files with fixed size extents or real time extents, but we
+ * at least do it for regular files.
+ */
+#ifdef DEBUG
+void
+xfs_isize_check(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,
+	xfs_fsize_t	isize)
+{
+	xfs_fileoff_t	map_first;
+	int		nimaps;
+	xfs_bmbt_irec_t imaps[2];
+
+	if ((ip->i_d.di_mode & IFMT) != IFREG)
+		return;
+
+	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
+		return;
+
+	nimaps = 2;
+	map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+	/*
+	 * The filesystem could be shutting down, so bmapi may return
+	 * an error.
+	 */
+	if (xfs_bmapi(NULL, ip, map_first,
+			 (XFS_B_TO_FSB(mp,
+				       (xfs_ufsize_t)XFS_MAX_FILE_OFFSET) -
+			  map_first),
+			 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
+			 NULL))
+	    return;
+	ASSERT(nimaps == 1);
+	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
+}
+#endif	/* DEBUG */
+
+/*
+ * Calculate the last possible buffered byte in a file.	 This must
+ * include data that was buffered beyond the EOF by the write code.
+ * This also needs to deal with overflowing the xfs_fsize_t type
+ * which can happen for sizes near the limit.
+ *
+ * We also need to take into account any blocks beyond the EOF.	 It
+ * may be the case that they were buffered by a write which failed.
+ * In that case the pages will still be in memory, but the inode size
+ * will never have been updated.
+ */
+xfs_fsize_t
+xfs_file_last_byte(
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp;
+	xfs_fsize_t	last_byte;
+	xfs_fileoff_t	last_block;
+	xfs_fileoff_t	size_last_block;
+	int		error;
+
+	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS));
+
+	mp = ip->i_mount;
+	/*
+	 * Only check for blocks beyond the EOF if the extents have
+	 * been read in.  This eliminates the need for the inode lock,
+	 * and it also saves us from looking when it really isn't
+	 * necessary.
+	 */
+	if (ip->i_df.if_flags & XFS_IFEXTENTS) {
+		error = xfs_bmap_last_offset(NULL, ip, &last_block,
+			XFS_DATA_FORK);
+		if (error) {
+			last_block = 0;
+		}
+	} else {
+		last_block = 0;
+	}
+	size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size);
+	last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
+
+	last_byte = XFS_FSB_TO_B(mp, last_block);
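+	/*
+	 * xfs_fsize_t is signed, so a negative value here means the
+	 * conversion above (or the writeio-size pad below) overflowed;
+	 * clamp the result to the maximum file offset in that case.
+	 */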
+	if (last_byte < 0) {
+		return XFS_MAX_FILE_OFFSET;
+	}
+	last_byte += (1 << mp->m_writeio_log);
+	if (last_byte < 0) {
+		return XFS_MAX_FILE_OFFSET;
+	}
+	return last_byte;
+}
+
+#if defined(XFS_RW_TRACE)
+STATIC void
+xfs_itrunc_trace(
+	int		tag,
+	xfs_inode_t	*ip,
+	int		flag,
+	xfs_fsize_t	new_size,
+	xfs_off_t	toss_start,
+	xfs_off_t	toss_finish)
+{
+	if (ip->i_rwtrace == NULL) {
+		return;
+	}
+
+	ktrace_enter(ip->i_rwtrace,
+		     (void*)((long)tag),
+		     (void*)ip,
+		     (void*)((ip->i_d.di_size >> 32) & 0xffffffff),
+		     (void*)(ip->i_d.di_size & 0xffffffff),
+		     (void*)((long)flag),
+		     (void*)((new_size >> 32) & 0xffffffff),
+		     (void*)(new_size & 0xffffffff),
+		     (void*)((toss_start >> 32) & 0xffffffff),
+		     (void*)(toss_start & 0xffffffff),
+		     (void*)((toss_finish >> 32) & 0xffffffff),
+		     (void*)(toss_finish & 0xffffffff),
+		     (void*)((long)private.p_cpuid),
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0);
+}
+#else
+#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
+#endif
+
+/*
+ * Start the truncation of the file to new_size.  The new size
+ * must be smaller than the current size.  This routine will
+ * clear the buffer and page caches of file data in the removed
+ * range, and xfs_itruncate_finish() will remove the underlying
+ * disk blocks.
+ *
+ * The inode must have its I/O lock locked EXCLUSIVELY, and it
+ * must NOT have the inode lock held at all.  This is because we're
+ * calling into the buffer/page cache code and we can't hold the
+ * inode lock when we do so.
+ *
+ * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
+ * or XFS_ITRUNC_MAYBE.	 The XFS_ITRUNC_MAYBE value should be used
+ * in the case that the caller is locking things out of order and
+ * may not be able to call xfs_itruncate_finish() with the inode lock
+ * held without dropping the I/O lock.	If the caller must drop the
+ * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
+ * must be called again with all the same restrictions as the initial
+ * call.
+ */
+void
+xfs_itruncate_start(
+	xfs_inode_t	*ip,
+	uint		flags,
+	xfs_fsize_t	new_size)
+{
+	xfs_fsize_t	last_byte;
+	xfs_off_t	toss_start;
+	xfs_mount_t	*mp;
+	vnode_t		*vp;
+
+	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
+	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
+	ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
+	       (flags == XFS_ITRUNC_MAYBE));
+
+	mp = ip->i_mount;
+	vp = XFS_ITOV(ip);
+	/*
+	 * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of
+	 * pages and buffers overlapping the region being removed.  We
+	 * have to use the less efficient VOP_FLUSHINVAL_PAGES() in the
+	 * case that the caller may not be able to finish the truncate
+	 * without dropping the inode's I/O lock.  Make sure to catch any
+	 * pages brought in by buffers overlapping the EOF by searching
+	 * out beyond the isize by our block size.  We round new_size up
+	 * to a block boundary so that we don't toss things on the same
+	 * block as new_size but before it.
+	 *
+	 * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make
+	 * sure to call remapf() over the same region if the file is
+	 * mapped.  This frees up mapped file references to the pages in
+	 * the given range and for the VOP_FLUSHINVAL_PAGES() case it
+	 * ensures that we get the latest mapped changes flushed out.
+	 */
+	toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
+	toss_start = XFS_FSB_TO_B(mp, toss_start);
+	if (toss_start < 0) {
+		/*
+		 * The place to start tossing is beyond our maximum
+		 * file size, so there is no way that the data extended
+		 * out there.
+		 */
+		return;
+	}
+	last_byte = xfs_file_last_byte(ip);
+	xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
+			 last_byte);
+	if (last_byte > toss_start) {
+		if (flags & XFS_ITRUNC_DEFINITE) {
+			VOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
+		} else {
+			VOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
+		}
+	}
+
+#ifdef DEBUG
+	if (new_size == 0) {
+		ASSERT(VN_CACHED(vp) == 0);
+	}
+#endif
+}
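+
+/*
+ * Illustrative truncate-to-zero sequence following the rules above
+ * (reservation sizes and error handling omitted):
+ *
+ *	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ *	xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
+ *	... allocate a transaction with a permanent log reservation
+ *	    of at least XFS_ITRUNCATE_LOG_RES ...
+ *	xfs_ilock(ip, XFS_ILOCK_EXCL);
+ *	... join and hold the inode in the transaction ...
+ *	error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, sync);
+ *	... commit or cancel the transaction returned in tp ...
+ */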
+
+/*
+ * Shrink the file to the given new_size.  The new
+ * size must be smaller than the current size.
+ * This will free up the underlying blocks
+ * in the removed range after a call to xfs_itruncate_start()
+ * or xfs_atruncate_start().
+ *
+ * The transaction passed to this routine must have made
+ * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
+ * This routine may commit the given transaction and
+ * start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here.
+ * Some transaction will be returned to the caller to be
+ * committed.  The incoming transaction must already include
+ * the inode, and both inode locks must be held exclusively.
+ * The inode must also be "held" within the transaction.  On
+ * return the inode will be "held" within the returned transaction.
+ * This routine does NOT require any disk space to be reserved
+ * for it within the transaction.
+ *
+ * The fork parameter must be either XFS_ATTR_FORK or XFS_DATA_FORK,
+ * and it indicates the fork which is to be truncated.  For the
+ * attribute fork we only support truncation to size 0.
+ *
+ * We use the sync parameter to indicate whether or not the first
+ * transaction we perform might have to be synchronous.	 For the attr fork,
+ * it needs to be so if the unlink of the inode is not yet known to be
+ * permanent in the log.  This keeps us from freeing and reusing the
+ * blocks of the attribute fork before the unlink of the inode becomes
+ * permanent.
+ *
+ * For the data fork, we normally have to run synchronously if we're
+ * being called out of the inactive path or we're being called
+ * out of the create path where we're truncating an existing file.
+ * Either way, the truncate needs to be sync so blocks don't reappear
+ * in the file with altered data in case of a crash.  wsync filesystems
+ * can run the first case async because anything that shrinks the inode
+ * has to run sync so by the time we're called here from inactive, the
+ * inode size is permanently set to 0.
+ *
+ * Calls from the truncate path always need to be sync unless we're
+ * in a wsync filesystem and the file has already been unlinked.
+ *
+ * The caller is responsible for correctly setting the sync parameter.
+ * It gets too hard for us to guess here which path we're being called
+ * out of just based on inode state.
+ */
+int
+xfs_itruncate_finish(
+	xfs_trans_t	**tp,
+	xfs_inode_t	*ip,
+	xfs_fsize_t	new_size,
+	int		fork,
+	int		sync)
+{
+	xfs_fsblock_t	first_block;
+	xfs_fileoff_t	first_unmap_block;
+	xfs_fileoff_t	last_block;
+	xfs_filblks_t	unmap_len=0;
+	xfs_mount_t	*mp;
+	xfs_trans_t	*ntp;
+	int		done;
+	int		committed;
+	xfs_bmap_free_t free_list;
+	int		error;
+
+	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
+	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
+	ASSERT(*tp != NULL);
+	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+	ASSERT(ip->i_transp == *tp);
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
+
+	ntp = *tp;
+	mp = (ntp)->t_mountp;
+	ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
+
+	/*
+	 * We only support truncating the entire attribute fork.
+	 */
+	if (fork == XFS_ATTR_FORK) {
+		new_size = 0LL;
+	}
+	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
+	xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
+	/*
+	 * The first thing we do is set the size to new_size permanently
+	 * on disk.  This way we don't have to worry about anyone ever
+	 * being able to look at the data being freed even in the face
+	 * of a crash.	What we're getting around here is the case where
+	 * we free a block, it is allocated to another file, it is written
+	 * to, and then we crash.  If the new data gets written to the
+	 * file but the log buffers containing the free and reallocation
+	 * don't, then we'd end up with garbage in the blocks being freed.
+	 * As long as we make the new_size permanent before actually
+	 * freeing any blocks it doesn't matter if they get written to.
+	 *
+	 * The callers must signal into us whether or not the size
+	 * setting here must be synchronous.  There are a few cases
+	 * where it doesn't have to be synchronous.  Those cases
+	 * occur if the file is unlinked and we know the unlink is
+	 * permanent or if the blocks being truncated are guaranteed
+	 * to be beyond the inode eof (regardless of the link count)
+	 * and the eof value is permanent.  Both of these cases occur
+	 * only on wsync-mounted filesystems.  In those cases, we're
+	 * guaranteed that no user will ever see the data in the blocks
+	 * that are being truncated so the truncate can run async.
+	 * In the free beyond eof case, the file may wind up with
+	 * more blocks allocated to it than it needs if we crash
+	 * and that won't get fixed until the next time the file
+	 * is re-opened and closed but that's ok as that shouldn't
+	 * be too many blocks.
+	 *
+	 * However, we can't just make all wsync xactions run async
+	 * because there's one call out of the create path that needs
+	 * to run sync where it's truncating an existing file to size
+	 * 0 whose size is > 0.
+	 *
+	 * It's probably possible to come up with a test in this
+	 * routine that would correctly distinguish all the above
+	 * cases from the values of the function parameters and the
+	 * inode state but for sanity's sake, I've decided to let the
+	 * layers above just tell us.  It's simpler to correctly figure
+	 * out in the layer above exactly under what conditions we
+	 * can run async and I think it's easier for others to read and
+	 * follow the logic in case something has to be changed.
+	 * cscope is your friend -- rcc.
+	 *
+	 * The attribute fork is much simpler.
+	 *
+	 * For the attribute fork we allow the caller to tell us whether
+	 * the unlink of the inode that led to this call is yet permanent
+	 * in the on disk log.	If it is not and we will be freeing extents
+	 * in this inode then we make the first transaction synchronous
+	 * to make sure that the unlink is permanent by the time we free
+	 * the blocks.
+	 */
+	if (fork == XFS_DATA_FORK) {
+		if (ip->i_d.di_nextents > 0) {
+			ip->i_d.di_size = new_size;
+			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+		}
+	} else if (sync) {
+		ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
+		if (ip->i_d.di_anextents > 0)
+			xfs_trans_set_sync(ntp);
+	}
+	ASSERT(fork == XFS_DATA_FORK ||
+		(fork == XFS_ATTR_FORK &&
+			((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
+			 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
+
+	/*
+	 * Since it is possible for space to become allocated beyond
+	 * the end of the file (in a crash where the space is allocated
+	 * but the inode size is not yet updated), simply remove any
+	 * blocks which show up between the new EOF and the maximum
+	 * possible file size.	If the first block to be removed is
+	 * beyond the maximum file size (ie it is the same as last_block),
+	 * then there is nothing to do.
+	 */
+	last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAX_FILE_OFFSET);
+	ASSERT(first_unmap_block <= last_block);
+	done = 0;
+	if (last_block == first_unmap_block) {
+		done = 1;
+	} else {
+		unmap_len = last_block - first_unmap_block + 1;
+	}
+	while (!done) {
+		/*
+		 * Free up to XFS_ITRUNC_MAX_EXTENTS extents.  xfs_bunmapi()
+		 * will tell us whether it freed the entire range or
+		 * not.	 If this is a synchronous mount (wsync),
+		 * then we can tell bunmapi to keep all the
+		 * transactions asynchronous since the unlink
+		 * transaction that made this inode inactive has
+		 * already hit the disk.  There's no danger of
+		 * the freed blocks being reused, there being a
+		 * crash, and the reused blocks suddenly reappearing
+		 * in this file with garbage in them once recovery
+		 * runs.
+		 */
+		XFS_BMAP_INIT(&free_list, &first_block);
+		error = xfs_bunmapi(ntp, ip, first_unmap_block,
+				    unmap_len,
+				    XFS_BMAPI_AFLAG(fork) |
+				      (sync ? 0 : XFS_BMAPI_ASYNC),
+				    XFS_ITRUNC_MAX_EXTENTS,
+				    &first_block, &free_list, &done);
+		if (error) {
+			/*
+			 * If the bunmapi call encounters an error,
+			 * return to the caller where the transaction
+			 * can be properly aborted.  We just need to
+			 * make sure we're not holding any resources
+			 * that we were not when we came in.
+			 */
+			xfs_bmap_cancel(&free_list);
+			return error;
+		}
+
+		/*
+		 * Duplicate the transaction that has the permanent
+		 * reservation and commit the old transaction.
+		 */
+		error = xfs_bmap_finish(tp, &free_list, first_block,
+					&committed);
+		ntp = *tp;
+		if (error) {
+			/*
+			 * If the bmap finish call encounters an error,
+			 * return to the caller where the transaction
+			 * can be properly aborted.  We just need to
+			 * make sure we're not holding any resources
+			 * that we were not when we came in.
+			 *
+			 * Aborting from this point might lose some
+			 * blocks in the file system, but oh well.
+			 */
+			xfs_bmap_cancel(&free_list);
+			if (committed) {
+				/*
+				 * If the passed in transaction committed
+				 * in xfs_bmap_finish(), then we want to
+				 * add the inode to this one before returning.
+				 * This keeps things simple for the higher
+				 * level code, because it always knows that
+				 * the inode is locked and held in the
+				 * transaction that returns to it whether
+				 * errors occur or not.	 We don't mark the
+				 * inode dirty so that this transaction can
+				 * be easily aborted if possible.
+				 */
+				xfs_trans_ijoin(ntp, ip,
+					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+				xfs_trans_ihold(ntp, ip);
+			}
+			return error;
+		}
+
+		if (committed) {
+			/*
+			 * The first xact was committed,
+			 * so add the inode to the new one.
+			 * Mark it dirty so it will be logged
+			 * and moved forward in the log as
+			 * part of every commit.
+			 */
+			xfs_trans_ijoin(ntp, ip,
+					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+			xfs_trans_ihold(ntp, ip);
+			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+		}
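+		/*
+		 * Roll to a new transaction: duplicate the current one,
+		 * commit the original, and take a fresh reservation in
+		 * the duplicate.  The inode is re-joined and held below
+		 * so that it stays locked across the roll.
+		 */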
+		ntp = xfs_trans_dup(ntp);
+		(void) xfs_trans_commit(*tp, 0, NULL);
+		*tp = ntp;
+		error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+					  XFS_TRANS_PERM_LOG_RES,
+					  XFS_ITRUNCATE_LOG_COUNT);
+		/*
+		 * Add the inode being truncated to the next chained
+		 * transaction.
+		 */
+		xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+		xfs_trans_ihold(ntp, ip);
+		if (error)
+			return (error);
+	}
+	/*
+	 * Only update the size in the case of the data fork, but
+	 * always re-log the inode so that our permanent transaction
+	 * can keep on rolling it forward in the log.
+	 */
+	if (fork == XFS_DATA_FORK) {
+		xfs_isize_check(mp, ip, new_size);
+		ip->i_d.di_size = new_size;
+	}
+	xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+	ASSERT((new_size != 0) ||
+	       (fork == XFS_ATTR_FORK) ||
+	       (ip->i_delayed_blks == 0));
+	ASSERT((new_size != 0) ||
+	       (fork == XFS_ATTR_FORK) ||
+	       (ip->i_d.di_nextents == 0));
+	xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
+	return 0;
+}
+
+
+/*
+ * xfs_igrow_start
+ *
+ * Do the first part of growing a file: zero any data in the last
+ * block that is beyond the old EOF.  We need to do this before
+ * the inode is joined to the transaction to modify the i_size.
+ * That way we can drop the inode lock and call into the buffer
+ * cache to get the buffer mapping the EOF.
+ */
+int
+xfs_igrow_start(
+	xfs_inode_t	*ip,
+	xfs_fsize_t	new_size,
+	cred_t		*credp)
+{
+	xfs_fsize_t	isize;
+	int		error;
+
+	ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
+	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
+	ASSERT(new_size > ip->i_d.di_size);
+
+	error = 0;
+	isize = ip->i_d.di_size;
+	/*
+	 * Zero any pages that may have been created by
+	 * xfs_write_file() beyond the end of the file
+	 * and any blocks between the old and new file sizes.
+	 */
+	error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize,
+				new_size, NULL);
+	return error;
+}
+
+/*
+ * xfs_igrow_finish
+ *
+ * This routine is called to extend the size of a file.
+ * The inode must have both the iolock and the ilock locked
+ * for update and it must be a part of the current transaction.
+ * The xfs_igrow_start() function must have been called previously.
+ * If the change_flag is not zero, the inode change timestamp will
+ * be updated.
+ */
+void
+xfs_igrow_finish(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_fsize_t	new_size,
+	int		change_flag)
+{
+	ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
+	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
+	ASSERT(ip->i_transp == tp);
+	ASSERT(new_size > ip->i_d.di_size);
+
+	/*
+	 * Update the file size.  Update the inode change timestamp
+	 * if change_flag set.
+	 */
+	ip->i_d.di_size = new_size;
+	if (change_flag)
+		xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
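+
+/*
+ * Illustrative grow sequence (error handling omitted): with the
+ * iolock and ilock held for update, call xfs_igrow_start() to zero
+ * any data beyond the old EOF, join the inode to a transaction, and
+ * then call
+ *
+ *	xfs_igrow_finish(tp, ip, new_size, 1);
+ *
+ * to set the new size, stamp the change time, and log the inode core.
+ */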
+
+
+/*
+ * This is called when the inode's link count goes to 0.
+ * We place the on-disk inode on a list in the AGI.  It
+ * will be pulled from this list when the inode is freed.
+ */
+int
+xfs_iunlink(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp;
+	xfs_agi_t	*agi;
+	xfs_dinode_t	*dip;
+	xfs_buf_t	*agibp;
+	xfs_buf_t	*ibp;
+	xfs_agnumber_t	agno;
+	xfs_daddr_t	agdaddr;
+	xfs_agino_t	agino;
+	short		bucket_index;
+	int		offset;
+	int		error;
+	int		agi_ok;
+
+	ASSERT(ip->i_d.di_nlink == 0);
+	ASSERT(ip->i_d.di_mode != 0);
+	ASSERT(ip->i_transp == tp);
+
+	mp = tp->t_mountp;
+
+	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+
+	/*
+	 * Get the agi buffer first.  It ensures lock ordering
+	 * on the list.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
+				   1, 0, &agibp);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	agi = XFS_BUF_TO_AGI(agibp);
+	agi_ok =
+		INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
+		XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
+	if (XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
+			XFS_RANDOM_IUNLINK)) {
+		xfs_trans_brelse(tp, agibp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	/*
+	 * Get the index into the agi hash table for the
+	 * list this inode will go on.
+	 */
+	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	ASSERT(agino != 0);
+	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	ASSERT(!INT_ISZERO(agi->agi_unlinked[bucket_index], ARCH_CONVERT));
+	ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != agino);
+
+	if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO) {
+		/*
+		 * There is already another inode in the bucket we need
+		 * to add ourselves to.	 Add us at the front of the list.
+		 * Here we put the head pointer into our next pointer,
+		 * and then we fall through to point the head at us.
+		 */
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
+		if (error) {
+			return error;
+		}
+		ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO);
+		ASSERT(!INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT));
+		/* both on-disk, don't endian flip twice */
+		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
+		offset = ip->i_boffset +
+			offsetof(xfs_dinode_t, di_next_unlinked);
+		xfs_trans_inode_buf(tp, ibp);
+		xfs_trans_log_buf(tp, ibp, offset,
+				  (offset + sizeof(xfs_agino_t) - 1));
+		xfs_inobp_check(mp, ibp);
+	}
+
+	/*
+	 * Point the bucket head pointer at the inode being inserted.
+	 */
+	ASSERT(agino != 0);
+	INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, agino);
+	offset = offsetof(xfs_agi_t, agi_unlinked) +
+		(sizeof(xfs_agino_t) * bucket_index);
+	xfs_trans_log_buf(tp, agibp, offset,
+			  (offset + sizeof(xfs_agino_t) - 1));
+	return 0;
+}
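+
+/*
+ * The resulting on-disk structure is a set of singly linked lists,
+ * one per AGI hash bucket, threaded through di_next_unlinked:
+ *
+ *	agi_unlinked[agino % XFS_AGI_UNLINKED_BUCKETS]
+ *		-> inode -> inode -> ... -> NULLAGINO
+ *
+ * New entries are pushed on the front of a bucket's list here;
+ * xfs_iunlink_remove() below walks the list to splice an inode back
+ * out when it is freed.
+ */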
+
+/*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+STATIC int
+xfs_iunlink_remove(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	xfs_ino_t	next_ino;
+	xfs_mount_t	*mp;
+	xfs_agi_t	*agi;
+	xfs_dinode_t	*dip;
+	xfs_buf_t	*agibp;
+	xfs_buf_t	*ibp;
+	xfs_agnumber_t	agno;
+	xfs_daddr_t	agdaddr;
+	xfs_agino_t	agino;
+	xfs_agino_t	next_agino;
+	xfs_buf_t	*last_ibp;
+	xfs_dinode_t	*last_dip;
+	short		bucket_index;
+	int		offset, last_offset;
+	int		error;
+	int		agi_ok;
+
+	/*
+	 * First pull the on-disk inode from the AGI unlinked list.
+	 */
+	mp = tp->t_mountp;
+
+	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+
+	/*
+	 * Get the agi buffer first.  It ensures lock ordering
+	 * on the list.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
+				   1, 0, &agibp);
+	if (error != 0) {
+		cmn_err(CE_WARN,
+			"xfs_iunlink_remove: xfs_trans_read_buf()  returned an error %d on %s.	Returning error.",
+			error, mp->m_fsname);
+		return error;
+	}
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	agi = XFS_BUF_TO_AGI(agibp);
+	agi_ok =
+		INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
+		XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
+	if (XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
+			XFS_RANDOM_IUNLINK_REMOVE)) {
+		xfs_trans_brelse(tp, agibp);
+		cmn_err(CE_WARN,
+			"xfs_iunlink_remove: XFS_TEST_ERROR()  returned an error on %s.	 Returning EFSCORRUPTED.",
+			 mp->m_fsname);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	/*
+	 * Get the index into the agi hash table for the
+	 * list this inode is on.
+	 */
+	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	ASSERT(agino != 0);
+	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO);
+	ASSERT(!INT_ISZERO(agi->agi_unlinked[bucket_index], ARCH_CONVERT));
+
+	if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) == agino) {
+		/*
+		 * We're at the head of the list.  Get the inode's
+		 * on-disk buffer to see if there is anyone after us
+		 * on the list.	 Only modify our next pointer if it
+		 * is not already NULLAGINO.  This saves us the overhead
+		 * of dealing with the buffer when there is no need to
+		 * change it.
+		 */
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
+		if (error) {
+			cmn_err(CE_WARN,
+				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
+				error, mp->m_fsname);
+			return error;
+		}
+		next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
+		ASSERT(next_agino != 0);
+		if (next_agino != NULLAGINO) {
+			INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
+			offset = ip->i_boffset +
+				offsetof(xfs_dinode_t, di_next_unlinked);
+			xfs_trans_inode_buf(tp, ibp);
+			xfs_trans_log_buf(tp, ibp, offset,
+					  (offset + sizeof(xfs_agino_t) - 1));
+			xfs_inobp_check(mp, ibp);
+		} else {
+			xfs_trans_brelse(tp, ibp);
+		}
+		/*
+		 * Point the bucket head pointer at the next inode.
+		 */
+		ASSERT(next_agino != 0);
+		ASSERT(next_agino != agino);
+		INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, next_agino);
+		offset = offsetof(xfs_agi_t, agi_unlinked) +
+			(sizeof(xfs_agino_t) * bucket_index);
+		xfs_trans_log_buf(tp, agibp, offset,
+				  (offset + sizeof(xfs_agino_t) - 1));
+	} else {
+		/*
+		 * We need to search the list for the inode being freed.
+		 */
+		next_agino = INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT);
+		last_ibp = NULL;
+		while (next_agino != agino) {
+			/*
+			 * If the last inode wasn't the one pointing to
+			 * us, then release its buffer since we're not
+			 * going to do anything with it.
+			 */
+			if (last_ibp != NULL) {
+				xfs_trans_brelse(tp, last_ibp);
+			}
+			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
+			error = xfs_inotobp(mp, tp, next_ino, &last_dip,
+					    &last_ibp, &last_offset);
+			if (error) {
+				cmn_err(CE_WARN,
+			"xfs_iunlink_remove: xfs_inotobp()  returned an error %d on %s.	 Returning error.",
+					error, mp->m_fsname);
+				return error;
+			}
+			next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT);
+			ASSERT(next_agino != NULLAGINO);
+			ASSERT(next_agino != 0);
+		}
+		/*
+		 * Now last_ibp points to the buffer previous to us on
+		 * the unlinked list.  Pull us from the list.
+		 */
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
+		if (error) {
+			cmn_err(CE_WARN,
+				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
+				error, mp->m_fsname);
+			return error;
+		}
+		next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
+		ASSERT(next_agino != 0);
+		ASSERT(next_agino != agino);
+		if (next_agino != NULLAGINO) {
+			INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
+			offset = ip->i_boffset +
+				offsetof(xfs_dinode_t, di_next_unlinked);
+			xfs_trans_inode_buf(tp, ibp);
+			xfs_trans_log_buf(tp, ibp, offset,
+					  (offset + sizeof(xfs_agino_t) - 1));
+			xfs_inobp_check(mp, ibp);
+		} else {
+			xfs_trans_brelse(tp, ibp);
+		}
+		/*
+		 * Point the previous inode on the list to the next inode.
+		 */
+		INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino);
+		ASSERT(next_agino != 0);
+		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
+		xfs_trans_inode_buf(tp, last_ibp);
+		xfs_trans_log_buf(tp, last_ibp, offset,
+				  (offset + sizeof(xfs_agino_t) - 1));
+		xfs_inobp_check(mp, last_ibp);
+	}
+	return 0;
+}
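+
+/*
+ * Editorial sketch (not part of the patch), continuing the invented model
+ * above: removal mirrors the two branches of xfs_iunlink_remove().  Either
+ * the inode is the bucket head and the head is repointed at its successor,
+ * or the chain is walked to find the predecessor and the inode is spliced
+ * out.  ex_lookup() is an assumed helper mapping an agino back to its
+ * in-memory inode.
+ */
+#ifdef XFS_DOC_EXAMPLE		/* illustration only, never built */
+static struct ex_inode *ex_lookup(uint32_t agino);
+
+static void ex_unlink_remove(struct ex_inode *ip)
+{
+	int b = ip->agino % EX_BUCKETS;
+
+	if (ex_bucket[b] == ip->agino) {
+		ex_bucket[b] = ip->next_unlinked;	/* we were the head */
+	} else {
+		struct ex_inode *prev = ex_lookup(ex_bucket[b]);
+
+		while (prev->next_unlinked != ip->agino)
+			prev = ex_lookup(prev->next_unlinked);
+		prev->next_unlinked = ip->next_unlinked;	/* splice out */
+	}
+	ip->next_unlinked = EX_NULLAGINO;
+}
+#endif	/* XFS_DOC_EXAMPLE */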
+
+/*
+ * This is called to return an inode to the inode free list.
+ * The inode should already be truncated to 0 length and have
+ * no pages associated with it.	 This routine also assumes that
+ * the inode is already a part of the transaction.
+ *
+ * The on-disk copy of the inode will have been added to the list
+ * of unlinked inodes in the AGI. We need to remove the inode from
+ * that list atomically with respect to freeing it here.
+ */
+int
+xfs_ifree(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	int	error;
+
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+	ASSERT(ip->i_transp == tp);
+	ASSERT(ip->i_d.di_nlink == 0);
+	ASSERT(ip->i_d.di_nextents == 0);
+	ASSERT(ip->i_d.di_anextents == 0);
+	ASSERT((ip->i_d.di_size == 0) ||
+	       ((ip->i_d.di_mode & IFMT) != IFREG));
+	ASSERT(ip->i_d.di_nblocks == 0);
+
+	/*
+	 * Pull the on-disk inode from the AGI unlinked list.
+	 */
+	error = xfs_iunlink_remove(tp, ip);
+	if (error != 0) {
+		return error;
+	}
+
+	error = xfs_difree(tp, ip->i_ino);
+	if (error != 0) {
+		return error;
+	}
+	ip->i_d.di_mode = 0;		/* mark incore inode as free */
+	ip->i_d.di_flags = 0;
+	ip->i_d.di_dmevmask = 0;
+	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
+	ip->i_df.if_ext_max =
+		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+
+	/*
+	 * Bump the generation count so no one will be confused
+	 * by reincarnations of this inode.
+	 */
+	ip->i_d.di_gen++;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return 0;
+}
+
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff.  Move the records
+ * and pointers in if_broot to fit the new size.  When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller.	When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root.  If if_broot is currently NULL and records
+ * are being added, one will be allocated.  The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * rec_diff -- the change in the number of records, positive or negative,
+ *	 requested for the if_broot array.
+ */
+void
+xfs_iroot_realloc(
+	xfs_inode_t		*ip,
+	int			rec_diff,
+	int			whichfork)
+{
+	int			cur_max;
+	xfs_ifork_t		*ifp;
+	xfs_bmbt_block_t	*new_broot;
+	int			new_max;
+	size_t			new_size;
+	char			*np;
+	char			*op;
+
+	/*
+	 * Handle the degenerate case quietly.
+	 */
+	if (rec_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (rec_diff > 0) {
+		/*
+		 * If there wasn't any memory allocated before, just
+		 * allocate it now and get out.
+		 */
+		if (ifp->if_broot_bytes == 0) {
+			new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
+			ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
+								     KM_SLEEP);
+			ifp->if_broot_bytes = (int)new_size;
+			return;
+		}
+
+		/*
+		 * If there is already an existing if_broot, then we need
+		 * to realloc() it and shift the pointers to their new
+		 * location.  The records don't change location because
+		 * they are kept butted up against the btree block header.
+		 */
+		cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
+		new_max = cur_max + rec_diff;
+		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
+		ifp->if_broot = (xfs_bmbt_block_t *)
+		  kmem_realloc(ifp->if_broot,
+				new_size,
+				(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
+				KM_SLEEP);
+		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
+						      ifp->if_broot_bytes);
+		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
+						      (int)new_size);
+		ifp->if_broot_bytes = (int)new_size;
+		ASSERT(ifp->if_broot_bytes <=
+			XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
+		ovbcopy(op, np, cur_max * (uint)sizeof(xfs_dfsbno_t));
+		return;
+	}
+
+	/*
+	 * rec_diff is less than 0.  In this case, we are shrinking the
+	 * if_broot buffer.  It must already exist.  If we go to zero
+	 * records, just get rid of the root and clear the status bit.
+	 */
+	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+	cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
+	new_max = cur_max + rec_diff;
+	ASSERT(new_max >= 0);
+	if (new_max > 0)
+		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
+	else
+		new_size = 0;
+	if (new_size > 0) {
+		new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
+		/*
+		 * First copy over the btree block header.
+		 */
+		bcopy(ifp->if_broot, new_broot, sizeof(xfs_bmbt_block_t));
+	} else {
+		new_broot = NULL;
+		ifp->if_flags &= ~XFS_IFBROOT;
+	}
+
+	/*
+	 * Only copy the records and pointers if there are any.
+	 */
+	if (new_max > 0) {
+		/*
+		 * First copy the records.
+		 */
+		op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
+						     ifp->if_broot_bytes);
+		np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
+						     (int)new_size);
+		bcopy(op, np, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+		/*
+		 * Then copy the pointers.
+		 */
+		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
+						     ifp->if_broot_bytes);
+		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
+						     (int)new_size);
+		bcopy(op, np, new_max * (uint)sizeof(xfs_dfsbno_t));
+	}
+	kmem_free(ifp->if_broot, ifp->if_broot_bytes);
+	ifp->if_broot = new_broot;
+	ifp->if_broot_bytes = (int)new_size;
+	ASSERT(ifp->if_broot_bytes <=
+		XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
+	return;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_extents
+ * is increased or decreased.  The change in size is indicated by
+ * the number of extents that need to be added or deleted in the
+ * ext_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer.  Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_extents area is changing
+ * ext_diff -- the change in the number of extents, positive or negative,
+ *	 requested for the if_extents array.
+ */
+void
+xfs_iext_realloc(
+	xfs_inode_t	*ip,
+	int		ext_diff,
+	int		whichfork)
+{
+	int		byte_diff;
+	xfs_ifork_t	*ifp;
+	int		new_size;
+	uint		rnew_size;
+
+	if (ext_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	byte_diff = ext_diff * (uint)sizeof(xfs_bmbt_rec_t);
+	new_size = (int)ifp->if_bytes + byte_diff;
+	ASSERT(new_size >= 0);
+
+	if (new_size == 0) {
+		if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
+			ASSERT(ifp->if_real_bytes != 0);
+			kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
+		}
+		ifp->if_u1.if_extents = NULL;
+		rnew_size = 0;
+	} else if (new_size <= sizeof(ifp->if_u2.if_inline_ext)) {
+		/*
+		 * If the valid extents can fit in if_inline_ext,
+		 * copy them from the malloc'd vector and free it.
+		 */
+		if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
+			/*
+			 * For now, empty files are format EXTENTS,
+			 * so the if_extents pointer is null.
+			 */
+			if (ifp->if_u1.if_extents) {
+				bcopy(ifp->if_u1.if_extents,
+				      ifp->if_u2.if_inline_ext, new_size);
+				kmem_free(ifp->if_u1.if_extents,
+					  ifp->if_real_bytes);
+			}
+			ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+		}
+		rnew_size = 0;
+	} else {
+		rnew_size = new_size;
+		if ((rnew_size & (rnew_size - 1)) != 0)
+			rnew_size = xfs_iroundup(rnew_size);
+		/*
+		 * Stuck with malloc/realloc.
+		 */
+		if (ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext) {
+			ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
+				kmem_alloc(rnew_size, KM_SLEEP);
+			bcopy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
+			      sizeof(ifp->if_u2.if_inline_ext));
+		} else if (rnew_size != ifp->if_real_bytes) {
+			ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
+			  kmem_realloc(ifp->if_u1.if_extents,
+					rnew_size,
+					ifp->if_real_bytes,
+					KM_NOFS);
+		}
+	}
+	ifp->if_real_bytes = rnew_size;
+	ifp->if_bytes = new_size;
+}
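+
+/*
+ * Editorial sketch (not part of the patch): the pattern above is a small
+ * buffer optimization -- a fork that fits in the inode's own inline area
+ * never touches the allocator, and heap sizes are rounded up to a power
+ * of two so a slowly growing extent list does not realloc on every step.
+ * A stripped-down user-space model, all names invented (error handling
+ * omitted):
+ */
+#ifdef XFS_DOC_EXAMPLE		/* illustration only, never built */
+#include <stdlib.h>
+#include <string.h>
+
+struct ex_fork {
+	unsigned char	*data;		/* if_u1 stand-in */
+	unsigned char	inline_buf[32];	/* if_u2 inline area stand-in */
+	int		bytes;		/* valid bytes (if_bytes) */
+	int		real_bytes;	/* heap bytes, 0 while inline */
+};
+
+static int ex_pow2(int v)		/* xfs_iroundup() stand-in */
+{
+	int r = 1;
+
+	while (r < v)
+		r <<= 1;
+	return r;
+}
+
+static void ex_fork_resize(struct ex_fork *f, int new_size)
+{
+	if (new_size <= (int)sizeof(f->inline_buf)) {
+		if (f->real_bytes) {		/* shrink back to inline */
+			memcpy(f->inline_buf, f->data, new_size);
+			free(f->data);
+			f->real_bytes = 0;
+		}
+		f->data = f->inline_buf;
+	} else if (f->real_bytes == 0) {	/* first spill to the heap */
+		f->real_bytes = ex_pow2(new_size);
+		f->data = malloc(f->real_bytes);
+		memcpy(f->data, f->inline_buf, f->bytes);
+	} else if (ex_pow2(new_size) != f->real_bytes) {
+		f->real_bytes = ex_pow2(new_size);
+		f->data = realloc(f->data, f->real_bytes);
+	}
+	f->bytes = new_size;
+}
+#endif	/* XFS_DOC_EXAMPLE */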
+
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased.  The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer.  Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ *	 requested for the if_data array.
+ */
+void
+xfs_idata_realloc(
+	xfs_inode_t	*ip,
+	int		byte_diff,
+	int		whichfork)
+{
+	xfs_ifork_t	*ifp;
+	int		new_size;
+	int		real_size;
+
+	if (byte_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	new_size = (int)ifp->if_bytes + byte_diff;
+	ASSERT(new_size >= 0);
+
+	if (new_size == 0) {
+		if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+		}
+		ifp->if_u1.if_data = NULL;
+		real_size = 0;
+	} else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
+		/*
+		 * If the valid extents/data can fit in if_inline_ext/data,
+		 * copy them from the malloc'd vector and free it.
+		 */
+		if (ifp->if_u1.if_data == NULL) {
+			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			ASSERT(ifp->if_real_bytes != 0);
+			bcopy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
+			      new_size);
+			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+		}
+		real_size = 0;
+	} else {
+		/*
+		 * Stuck with malloc/realloc.
+		 * For inline data, the underlying buffer must be
+		 * a multiple of 4 bytes in size so that it can be
+		 * logged and stay on word boundaries.	We enforce
+		 * that here.
+		 */
+		real_size = roundup(new_size, 4);
+		if (ifp->if_u1.if_data == NULL) {
+			ASSERT(ifp->if_real_bytes == 0);
+			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			/*
+			 * Only do the realloc if the underlying size
+			 * is really changing.
+			 */
+			if (ifp->if_real_bytes != real_size) {
+				ifp->if_u1.if_data =
+					kmem_realloc(ifp->if_u1.if_data,
+							real_size,
+							ifp->if_real_bytes,
+							KM_SLEEP);
+			}
+		} else {
+			ASSERT(ifp->if_real_bytes == 0);
+			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+			bcopy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
+			      ifp->if_bytes);
+		}
+	}
+	ifp->if_real_bytes = real_size;
+	ifp->if_bytes = new_size;
+	ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+}
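+
+/*
+ * Editorial note: roundup(new_size, 4) above is the standard alignment
+ * idiom and, 4 being a power of two, is equivalent to
+ *
+ *	real_size = (new_size + 3) & ~3;
+ *
+ * so requests of 1..4 bytes allocate 4, 5..8 allocate 8, and so on --
+ * which is what keeps the logged inline data on word boundaries.
+ */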
+
+/*
+ * Map inode to disk block and offset.
+ *
+ * mp -- the mount point structure for the current file system
+ * tp -- the current transaction
+ * ino -- the inode number of the inode to be located
+ * imap -- this structure is filled in with the information necessary
+ *	 to retrieve the given inode from disk
+ * flags -- flags to pass to xfs_dilocate indicating whether or not
+ *	 lookups in the inode btree were OK or not
+ */
+int
+xfs_imap(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	xfs_imap_t	*imap,
+	uint		flags)
+{
+	xfs_fsblock_t	fsbno;
+	int		len;
+	int		off;
+	int		error;
+
+	fsbno = imap->im_blkno ?
+		XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
+	error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
+	if (error != 0) {
+		return error;
+	}
+	imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
+	imap->im_len = XFS_FSB_TO_BB(mp, len);
+	imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
+	imap->im_ioffset = (ushort)off;
+	imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
+	return 0;
+}
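+
+/*
+ * Editorial note: since sb_inodelog is log2 of the inode size, the final
+ * shift above turns an inode's index within its buffer into a byte
+ * offset.  For example, with 256-byte inodes (sb_inodelog == 8) the inode
+ * at index 2 starts at byte offset 2 << 8 == 512 within the buffer.
+ */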
+
+void
+xfs_idestroy_fork(
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	xfs_ifork_t	*ifp;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (ifp->if_broot != NULL) {
+		kmem_free(ifp->if_broot, ifp->if_broot_bytes);
+		ifp->if_broot = NULL;
+	}
+
+	/*
+	 * If the format is local, then we can't have an extents
+	 * array so just look for an inline data array.	 If we're
+	 * not local then we may or may not have an extents list,
+	 * so check and free it up if we do.
+	 */
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
+		    (ifp->if_u1.if_data != NULL)) {
+			ASSERT(ifp->if_real_bytes != 0);
+			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+			ifp->if_u1.if_data = NULL;
+			ifp->if_real_bytes = 0;
+		}
+	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
+		   (ifp->if_u1.if_extents != NULL) &&
+		   (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)) {
+		ASSERT(ifp->if_real_bytes != 0);
+		kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
+		ifp->if_u1.if_extents = NULL;
+		ifp->if_real_bytes = 0;
+	}
+	ASSERT(ifp->if_u1.if_extents == NULL ||
+	       ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+	ASSERT(ifp->if_real_bytes == 0);
+	if (whichfork == XFS_ATTR_FORK) {
+		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+		ip->i_afp = NULL;
+	}
+}
+
+/*
+ * This is called to free all the memory associated with an inode.
+ * It must free the inode itself and any buffers allocated for
+ * if_extents/if_data and if_broot.  It must also free the lock
+ * associated with the inode.
+ */
+void
+xfs_idestroy(
+	xfs_inode_t	*ip)
+{
+
+	switch (ip->i_d.di_mode & IFMT) {
+	case IFREG:
+	case IFDIR:
+	case IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
+	}
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+#ifdef NOTYET
+	if (ip->i_range_lock.r_sleep != NULL) {
+		freesema(ip->i_range_lock.r_sleep);
+		kmem_free(ip->i_range_lock.r_sleep, sizeof(sema_t));
+	}
+#endif /* NOTYET */
+	mrfree(&ip->i_lock);
+	mrfree(&ip->i_iolock);
+#ifdef NOTYET
+	mutex_destroy(&ip->i_range_lock.r_spinlock);
+#endif /* NOTYET */
+	freesema(&ip->i_flock);
+#ifdef XFS_BMAP_TRACE
+	ktrace_free(ip->i_xtrace);
+#endif
+#ifdef XFS_BMBT_TRACE
+	ktrace_free(ip->i_btrace);
+#endif
+#ifdef XFS_RW_TRACE
+	ktrace_free(ip->i_rwtrace);
+#endif
+#ifdef XFS_STRAT_TRACE
+	ktrace_free(ip->i_strat_trace);
+#endif
+#ifdef XFS_ILOCK_TRACE
+	ktrace_free(ip->i_lock_trace);
+#endif
+#ifdef XFS_DIR2_TRACE
+	ktrace_free(ip->i_dir_trace);
+#endif
+	if (ip->i_itemp) {
+		/* XXXdpd should be able to assert this but shutdown
+		 * is leaving the AIL behind. */
+		ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) ||
+		       XFS_FORCED_SHUTDOWN(ip->i_mount));
+		xfs_inode_item_destroy(ip);
+	}
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
+
+/*
+ * Increment the pin count of the given inode.
+ * The count is kept in the atomic i_pincount field.
+ */
+void
+xfs_ipin(
+	xfs_inode_t	*ip)
+{
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+
+	atomic_inc(&ip->i_pincount);
+}
+
+/*
+ * Decrement the pin count of the given inode, and wake up
+ * anyone waiting in xfs_iunpin_wait() if the count goes to 0.  The
+ * inode must have been previously pinned with a call to xfs_ipin().
+ */
+void
+xfs_iunpin(
+	xfs_inode_t	*ip)
+{
+	ASSERT(atomic_read(&ip->i_pincount) > 0);
+
+	if (atomic_dec_and_test(&ip->i_pincount)) {
+		wake_up(&ip->i_ipin_wait);
+	}
+}
+
+/*
+ * This is called to wait for the given inode to be unpinned.
+ * It will sleep until this happens.  The caller must have the
+ * inode locked in at least shared mode so that the inode cannot
+ * be subsequently pinned once someone is waiting for it to be
+ * unpinned.
+ */
+void
+xfs_iunpin_wait(
+	xfs_inode_t	*ip)
+{
+	xfs_inode_log_item_t	*iip;
+	xfs_lsn_t	lsn;
+
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
+
+	if (atomic_read(&ip->i_pincount) == 0) {
+		return;
+	}
+
+	iip = ip->i_itemp;
+	if (iip && iip->ili_last_lsn) {
+		lsn = iip->ili_last_lsn;
+	} else {
+		lsn = (xfs_lsn_t)0;
+	}
+
+	/*
+	 * Give the log a push so we don't wait here too long.
+	 */
+	xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
+
+	wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
+}
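+
+/*
+ * Editorial sketch (not part of the patch): the pin count is an atomic
+ * counter paired with a wait queue, and the unpin that drops the count to
+ * zero wakes all waiters.  The same shape in portable user-space C, with
+ * a condition variable standing in for the wait queue (all names
+ * invented):
+ */
+#ifdef XFS_DOC_EXAMPLE		/* illustration only, never built */
+#include <pthread.h>
+
+static int		ex_pincount;
+static pthread_mutex_t	ex_pinlock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t	ex_pincond = PTHREAD_COND_INITIALIZER;
+
+static void ex_unpin(void)		/* xfs_iunpin() shape */
+{
+	pthread_mutex_lock(&ex_pinlock);
+	if (--ex_pincount == 0)
+		pthread_cond_broadcast(&ex_pincond);
+	pthread_mutex_unlock(&ex_pinlock);
+}
+
+static void ex_unpin_wait(void)		/* xfs_iunpin_wait() shape */
+{
+	pthread_mutex_lock(&ex_pinlock);
+	while (ex_pincount != 0)	/* wait_event() equivalent */
+		pthread_cond_wait(&ex_pincond, &ex_pinlock);
+	pthread_mutex_unlock(&ex_pinlock);
+}
+#endif	/* XFS_DOC_EXAMPLE */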
+
+
+/*
+ * xfs_iextents_copy()
+ *
+ * This is called to copy the REAL extents (as opposed to the delayed
+ * allocation extents) from the inode into the given buffer.  It
+ * returns the number of bytes copied into the buffer.
+ *
+ * If there are no delayed allocation extents, then we can just
+ * bcopy() the extents into the buffer.	 Otherwise, we need to
+ * examine each extent in turn and skip those which are delayed.
+ */
+int
+xfs_iextents_copy(
+	xfs_inode_t		*ip,
+	xfs_bmbt_rec_32_t	*buffer,
+	int			whichfork)
+{
+	int			copied;
+	xfs_bmbt_rec_32_t	*dest_ep;
+	xfs_bmbt_rec_t		*ep;
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_iextents_copy";
+#endif
+	int			i;
+	xfs_ifork_t		*ifp;
+	int			nrecs;
+	xfs_fsblock_t		start_block;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
+	ASSERT(ifp->if_bytes > 0);
+
+	nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork);
+	ASSERT(nrecs > 0);
+	if (nrecs == XFS_IFORK_NEXTENTS(ip, whichfork)) {
+		/*
+		 * There are no delayed allocation extents,
+		 * so just copy everything.
+		 */
+		ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+		ASSERT(ifp->if_bytes ==
+		       (XFS_IFORK_NEXTENTS(ip, whichfork) *
+			(uint)sizeof(xfs_bmbt_rec_t)));
+		bcopy(ifp->if_u1.if_extents, buffer, ifp->if_bytes);
+		xfs_validate_extents(buffer, nrecs, XFS_EXTFMT_INODE(ip));
+		return ifp->if_bytes;
+	}
+
+	ASSERT(whichfork == XFS_DATA_FORK);
+	/*
+	 * There are some delayed allocation extents in the
+	 * inode, so copy the extents one at a time and skip
+	 * the delayed ones.  There must be at least one
+	 * non-delayed extent.
+	 */
+	ASSERT(nrecs > ip->i_d.di_nextents);
+	ep = ifp->if_u1.if_extents;
+	dest_ep = buffer;
+	copied = 0;
+	for (i = 0; i < nrecs; i++) {
+		start_block = xfs_bmbt_get_startblock(ep);
+		if (ISNULLSTARTBLOCK(start_block)) {
+			/*
+			 * It's a delayed allocation extent, so skip it.
+			 */
+			ep++;
+			continue;
+		}
+
+		*dest_ep = *(xfs_bmbt_rec_32_t *)ep;
+		dest_ep++;
+		ep++;
+		copied++;
+	}
+	ASSERT(copied != 0);
+	ASSERT(copied == ip->i_d.di_nextents);
+	ASSERT((copied * (uint)sizeof(xfs_bmbt_rec_t)) <= XFS_IFORK_DSIZE(ip));
+	xfs_validate_extents(buffer, copied, XFS_EXTFMT_INODE(ip));
+
+	return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+}
+
+/*
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_iflush_fork(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip,
+	xfs_inode_log_item_t	*iip,
+	int			whichfork,
+	xfs_buf_t		*bp)
+{
+	char			*cp;
+	xfs_ifork_t		*ifp;
+	xfs_mount_t		*mp;
+#ifdef XFS_TRANS_DEBUG
+	int			first;
+#endif
+	static const short	brootflag[2] =
+		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+	static const short	dataflag[2] =
+		{ XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+	static const short	extflag[2] =
+		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+	if (iip == NULL)
+		return 0;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	/*
+	 * This can happen if we gave up in iformat in an error path,
+	 * for the attribute fork.
+	 */
+	if (ifp == NULL) {
+		ASSERT(whichfork == XFS_ATTR_FORK);
+		return 0;
+	}
+	cp = XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT);
+	mp = ip->i_mount;
+	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+	case XFS_DINODE_FMT_LOCAL:
+		if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
+		    (ifp->if_bytes > 0)) {
+			ASSERT(ifp->if_u1.if_data != NULL);
+			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+			bcopy(ifp->if_u1.if_data, cp, ifp->if_bytes);
+		}
+		if (whichfork == XFS_DATA_FORK) {
+			if (XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip)) {
+				return XFS_ERROR(EFSCORRUPTED);
+			}
+		}
+		break;
+
+	case XFS_DINODE_FMT_EXTENTS:
+		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
+		       !(iip->ili_format.ilf_fields & extflag[whichfork]));
+		ASSERT((ifp->if_u1.if_extents != NULL) || (ifp->if_bytes == 0));
+		ASSERT((ifp->if_u1.if_extents == NULL) || (ifp->if_bytes > 0));
+		if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
+		    (ifp->if_bytes > 0)) {
+			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+			(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_32_t *)cp,
+				whichfork);
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
+		    (ifp->if_broot_bytes > 0)) {
+			ASSERT(ifp->if_broot != NULL);
+			ASSERT(ifp->if_broot_bytes <=
+			       (XFS_IFORK_SIZE(ip, whichfork) +
+				XFS_BROOT_SIZE_ADJ));
+			xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
+				(xfs_bmdr_block_t *)cp,
+				XFS_DFORK_SIZE_ARCH(dip, mp, whichfork, ARCH_CONVERT));
+		}
+		break;
+
+	case XFS_DINODE_FMT_DEV:
+		if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev);
+		}
+		break;
+
+	case XFS_DINODE_FMT_UUID:
+		if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			bcopy(&ip->i_df.if_u2.if_uuid, &dip->di_u.di_muuid,
+				sizeof(uuid_t));
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * xfs_iflush() will write a modified inode's changes out to the
+ * inode's on disk home.  The caller must have the inode lock held
+ * in at least shared mode and the inode flush semaphore must be
+ * held as well.  The inode lock will still be held upon return from
+ * the call and the caller is free to unlock it.
+ * The inode flush lock will be unlocked when the inode reaches the disk.
+ * The flags indicate how the inode's buffer should be written out.
+ */
+int
+xfs_iflush(
+	xfs_inode_t		*ip,
+	uint			flags)
+{
+	xfs_inode_log_item_t	*iip;
+	xfs_buf_t		*bp;
+	xfs_dinode_t		*dip;
+	xfs_mount_t		*mp;
+	int			error;
+	/* REFERENCED */
+	xfs_chash_t		*ch;
+	xfs_inode_t		*iq;
+	int			clcount;	/* count of inodes clustered */
+	int			bufwasdelwri;
+	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
+	SPLDECL(s);
+
+	XFS_STATS_INC(xfsstats.xs_iflush_count);
+
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
+	ASSERT(valusema(&ip->i_flock) <= 0);
+	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
+
+	iip = ip->i_itemp;
+	mp = ip->i_mount;
+
+	/*
+	 * If the inode isn't dirty, then just release the inode
+	 * flush lock and do nothing.
+	 */
+	if ((ip->i_update_core == 0) &&
+	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+		ASSERT((iip != NULL) ?
+			 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
+		xfs_ifunlock(ip);
+		return 0;
+	}
+
+	/*
+	 * We can't flush the inode until it is unpinned, so
+	 * wait for it.  We know no one new can pin it, because
+	 * we are holding the inode lock shared and you need
+	 * to hold it exclusively to pin the inode.
+	 */
+	xfs_iunpin_wait(ip);
+
+	/*
+	 * This may have been unpinned because the filesystem is shutting
+	 * down forcibly. If that's the case we must not write this inode
+	 * to disk, because the log record didn't make it to disk!
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		ip->i_update_core = 0;
+		if (iip)
+			iip->ili_format.ilf_fields = 0;
+		xfs_ifunlock(ip);
+		return XFS_ERROR(EIO);
+	}
+
+	/*
+	 * Get the buffer containing the on-disk inode.
+	 */
+	error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0);
+	if (error != 0) {
+		xfs_ifunlock(ip);
+		return error;
+	}
+
+	/*
+	 * Decide how buffer will be flushed out.  This is done before
+	 * the call to xfs_iflush_int because this field is zeroed by it.
+	 */
+	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
+		/*
+		 * Flush out the inode buffer according to the directions
+		 * of the caller.  In the cases where the caller has given
+		 * us a choice choose the non-delwri case.  This is because
+		 * the inode is in the AIL and we need to get it out soon.
+		 */
+		switch (flags) {
+		case XFS_IFLUSH_SYNC:
+		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
+			flags = 0;
+			break;
+		case XFS_IFLUSH_ASYNC:
+		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
+			flags = INT_ASYNC;
+			break;
+		case XFS_IFLUSH_DELWRI:
+			flags = INT_DELWRI;
+			break;
+		default:
+			ASSERT(0);
+			flags = 0;
+			break;
+		}
+	} else {
+		switch (flags) {
+		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
+		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
+		case XFS_IFLUSH_DELWRI:
+			flags = INT_DELWRI;
+			break;
+		case XFS_IFLUSH_ASYNC:
+			flags = INT_ASYNC;
+			break;
+		case XFS_IFLUSH_SYNC:
+			flags = 0;
+			break;
+		default:
+			ASSERT(0);
+			flags = 0;
+			break;
+		}
+	}
+
+	/*
+	 * First flush out the inode that xfs_iflush was called with.
+	 */
+	error = xfs_iflush_int(ip, bp);
+	if (error) {
+		goto corrupt_out;
+	}
+
+	/*
+	 * inode clustering:
+	 * see if other inodes can be gathered into this write
+	 */
+
+#ifdef DEBUG
+	ip->i_chash->chl_buf = bp;		/* inode clustering debug */
+#endif
+
+	ch = XFS_CHASH(mp, ip->i_blkno);
+	s = mutex_spinlock(&ch->ch_lock);
+
+	clcount = 0;
+	for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) {
+		/*
+		 * Do an un-protected check to see if the inode is dirty and
+		 * is a candidate for flushing.	 These checks will be repeated
+		 * later after the appropriate locks are acquired.
+		 */
+		iip = iq->i_itemp;
+		if ((iq->i_update_core == 0) &&
+		    ((iip == NULL) ||
+		     !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
+		      xfs_ipincount(iq) == 0) {
+			continue;
+		}
+
+		/*
+		 * Try to get locks.  If any are unavailable,
+		 * then this inode cannot be flushed and is skipped.
+		 */
+
+		/* get inode locks (just i_lock) */
+		if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
+			/* get inode flush lock */
+			if (xfs_iflock_nowait(iq)) {
+				/* check if pinned */
+				if (xfs_ipincount(iq) == 0) {
+				/*
+				 * Arriving here means that this
+				 * inode can be flushed.  First
+				 * re-check that it is dirty.
+				 */
+					iip = iq->i_itemp;
+					if ((iq->i_update_core != 0)||
+					    ((iip != NULL) &&
+					     (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+						clcount++;
+						error = xfs_iflush_int(iq, bp);
+						if (error) {
+							xfs_iunlock(iq,
+								    XFS_ILOCK_SHARED);
+							goto cluster_corrupt_out;
+						}
+					} else {
+						xfs_ifunlock(iq);
+					}
+				} else {
+					xfs_ifunlock(iq);
+				}
+			}
+			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+		}
+	}
+	mutex_spinunlock(&ch->ch_lock, s);
+
+	if (clcount) {
+		XFS_STATS_INC(xfsstats.xs_icluster_flushcnt);
+		XFS_STATS_ADD(xfsstats.xs_icluster_flushinode, clcount);
+	}
+
+	/*
+	 * If the buffer is pinned then push on the log so we won't
+	 * get stuck waiting in the write for too long.
+	 */
+	if (XFS_BUF_ISPINNED(bp)){
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+	}
+
+	if (flags & INT_DELWRI) {
+		xfs_bdwrite(mp, bp);
+	} else if (flags & INT_ASYNC) {
+		xfs_bawrite(mp, bp);
+	} else {
+		error = xfs_bwrite(mp, bp);
+	}
+	return error;
+
+corrupt_out:
+	xfs_buf_relse(bp);
+	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+	xfs_iflush_abort(ip);
+	/*
+	 * Unlocks the flush lock
+	 */
+	return XFS_ERROR(EFSCORRUPTED);
+
+cluster_corrupt_out:
+	/* Corruption detected in the clustering loop.	Invalidate the
+	 * inode buffer and shut down the filesystem.
+	 */
+	mutex_spinunlock(&ch->ch_lock, s);
+
+	/*
+	 * Clean up the buffer.	 If it was B_DELWRI, just release it --
+	 * brelse can handle it with no problems.  If not, shut down the
+	 * filesystem before releasing the buffer.
+	 */
+	if ((bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp))) {
+		xfs_buf_relse(bp);
+	}
+
+	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+
+	if (!bufwasdelwri) {
+		/*
+		 * Just like incore_relse: if we have b_iodone functions,
+		 * mark the buffer as an error and call them.  Otherwise
+		 * mark it as stale and brelse.
+		 */
+		if (XFS_BUF_IODONE_FUNC(bp)) {
+			XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+			XFS_BUF_UNDONE(bp);
+			XFS_BUF_STALE(bp);
+			XFS_BUF_SHUT(bp);
+			XFS_BUF_ERROR(bp,EIO);
+			xfs_biodone(bp);
+		} else {
+			XFS_BUF_STALE(bp);
+			xfs_buf_relse(bp);
+		}
+	}
+
+	xfs_iflush_abort(iq);
+	/*
+	 * Unlocks the flush lock
+	 */
+	return XFS_ERROR(EFSCORRUPTED);
+}
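+
+/*
+ * Editorial note on the clustering loop in xfs_iflush() above: it is
+ * purely opportunistic.  Every lock is taken with a _nowait variant, and
+ * any neighbouring inode whose locks are not immediately available is
+ * simply skipped, so gathering cluster-mates can never deadlock against
+ * or stall the flush of the inode we were actually asked to write.
+ */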
+
+
+STATIC int
+xfs_iflush_int(
+	xfs_inode_t		*ip,
+	xfs_buf_t		*bp)
+{
+	xfs_inode_log_item_t	*iip;
+	xfs_dinode_t		*dip;
+	xfs_mount_t		*mp;
+#ifdef XFS_TRANS_DEBUG
+	int			first;
+#endif
+	SPLDECL(s);
+
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
+	ASSERT(valusema(&ip->i_flock) <= 0);
+	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
+
+	iip = ip->i_itemp;
+	mp = ip->i_mount;
+
+
+	/*
+	 * If the inode isn't dirty, then just release the inode
+	 * flush lock and do nothing.
+	 */
+	if ((ip->i_update_core == 0) &&
+	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+		xfs_ifunlock(ip);
+		return 0;
+	}
+
+	/* set *dip = inode's place in the buffer */
+	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset);
+
+	/*
+	 * Clear i_update_core before copying out the data.
+	 * This is for coordination with our timestamp updates
+	 * that don't hold the inode lock. They will always
+	 * update the timestamps BEFORE setting i_update_core,
+	 * so if we clear i_update_core after they set it we
+	 * are guaranteed to see their updates to the timestamps.
+	 * I believe that this depends on strongly ordered memory
+	 * semantics, but we have that.	 We use the SYNCHRONIZE
+	 * macro to make sure that the compiler does not reorder
+	 * the i_update_core access below the data copy below.
+	 */
+	ip->i_update_core = 0;
+	SYNCHRONIZE();
+
+	if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC,
+			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
+		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+		    "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
+			ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip);
+		goto corrupt_out;
+	}
+	if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
+				mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
+		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+			"xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
+			ip->i_ino, ip, ip->i_d.di_magic);
+		goto corrupt_out;
+	}
+	if ((ip->i_d.di_mode & IFMT) == IFREG) {
+		if (XFS_TEST_ERROR(
+		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
+		    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
+			xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+				"xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
+				ip->i_ino, ip);
+			goto corrupt_out;
+		}
+	} else if ((ip->i_d.di_mode & IFMT) == IFDIR) {
+		if (XFS_TEST_ERROR(
+		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+		    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
+		    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
+			xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+				"xfs_iflush: Bad directory inode %Lu, ptr 0x%p",
+				ip->i_ino, ip);
+			goto corrupt_out;
+		}
+	}
+	if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
+				ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
+				XFS_RANDOM_IFLUSH_5)) {
+		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+			"xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p",
+			ip->i_ino,
+			ip->i_d.di_nextents + ip->i_d.di_anextents,
+			ip->i_d.di_nblocks,
+			ip);
+		goto corrupt_out;
+	}
+	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
+				mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
+		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+			"xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
+			ip->i_ino, ip->i_d.di_forkoff, ip);
+		goto corrupt_out;
+	}
+	/*
+	 * Copy the dirty parts of the inode into the on-disk
+	 * inode.  We always copy out the core of the inode,
+	 * because if the inode is dirty at all the core must
+	 * be.
+	 */
+	xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d),
+		-1, ARCH_CONVERT);
+
+	/*
+	 * If this is really an old format inode and the superblock version
+	 * has not been updated to support only new format inodes, then
+	 * convert back to the old inode format.  If the superblock version
+	 * has been updated, then make the conversion permanent.
+	 */
+	ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
+	       XFS_SB_VERSION_HASNLINK(&mp->m_sb));
+	if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+		if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+			/*
+			 * Convert it back.
+			 */
+			ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
+			INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink);
+		} else {
+			/*
+			 * The superblock version has already been bumped,
+			 * so just make the conversion to the new inode
+			 * format permanent.
+			 */
+			ip->i_d.di_version = XFS_DINODE_VERSION_2;
+			INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2);
+			ip->i_d.di_onlink = 0;
+			INT_ZERO(dip->di_core.di_onlink, ARCH_CONVERT);
+			bzero(&(ip->i_d.di_pad[0]), sizeof(ip->i_d.di_pad));
+			bzero(&(dip->di_core.di_pad[0]),
+			      sizeof(dip->di_core.di_pad));
+			ASSERT(ip->i_d.di_projid == 0);
+		}
+	}
+
+	if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
+		goto corrupt_out;
+	}
+
+	if (XFS_IFORK_Q(ip)) {
+		/*
+		 * The only error from xfs_iflush_fork is on the data fork.
+		 */
+		(void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
+	}
+	xfs_inobp_check(mp, bp);
+
+	/*
+	 * We've recorded everything logged in the inode, so we'd
+	 * like to clear the ilf_fields bits so we don't log and
+	 * flush things unnecessarily.	However, we can't stop
+	 * logging all this information until the data we've copied
+	 * into the disk buffer is written to disk.  If we did we might
+	 * overwrite the copy of the inode in the log with all the
+	 * data after re-logging only part of it, and in the face of
+	 * a crash we wouldn't have all the data we need to recover.
+	 *
+	 * What we do is move the bits to the ili_last_fields field.
+	 * When logging the inode, these bits are moved back to the
+	 * ilf_fields field.  In the xfs_iflush_done() routine we
+	 * clear ili_last_fields, since we know that the information
+	 * those bits represent is permanently on disk.	 As long as
+	 * the flush completes before the inode is logged again, then
+	 * both ilf_fields and ili_last_fields will be cleared.
+	 *
+	 * We can play with the ilf_fields bits here, because the inode
+	 * lock must be held exclusively in order to set bits there
+	 * and the flush lock protects the ili_last_fields bits.
+	 * Set ili_logged so the flush done
+	 * routine can tell whether or not to look in the AIL.
+	 * Also, store the current LSN of the inode so that we can tell
+	 * whether the item has moved in the AIL from xfs_iflush_done().
+	 * In order to read the lsn we need the AIL lock, because
+	 * it is a 64 bit value that cannot be read atomically.
+	 */
+	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
+		iip->ili_last_fields = iip->ili_format.ilf_fields;
+		iip->ili_format.ilf_fields = 0;
+		iip->ili_logged = 1;
+
+		ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
+		AIL_LOCK(mp,s);
+		iip->ili_flush_lsn = iip->ili_item.li_lsn;
+		AIL_UNLOCK(mp, s);
+
+		/*
+		 * Attach the function xfs_iflush_done to the inode's
+		 * buffer.  This will remove the inode from the AIL
+		 * and unlock the inode's flush lock when the inode is
+		 * completely written to disk.
+		 */
+		xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
+				      xfs_iflush_done, (xfs_log_item_t *)iip);
+
+		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+		ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
+	} else {
+		/*
+		 * We're flushing an inode which is not in the AIL and has
+		 * not been logged but has i_update_core set.  For this
+		 * case we can use a B_DELWRI flush and immediately drop
+		 * the inode flush lock because we can avoid the whole
+		 * AIL state thing.  It's OK to drop the flush lock now,
+		 * because we've already locked the buffer and to do anything
+		 * you really need both.
+		 */
+		if (iip != NULL) {
+			ASSERT(iip->ili_logged == 0);
+			ASSERT(iip->ili_last_fields == 0);
+			ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
+		}
+		xfs_ifunlock(ip);
+	}
+
+	return 0;
+
+corrupt_out:
+	return XFS_ERROR(EFSCORRUPTED);
+}
+
+/*
+ * Flush all inactive inodes in mp.  Return true if no user references
+ * were found, false otherwise.
+ */
+int
+xfs_iflush_all(
+	xfs_mount_t	*mp,
+	int		flag)
+{
+	int		busy;
+	int		done;
+	int		purged;
+	xfs_inode_t	*ip;
+	vmap_t		vmap;
+	vnode_t		*vp;
+
+	busy = done = 0;
+	while (!done) {
+		purged = 0;
+		XFS_MOUNT_ILOCK(mp);
+		ip = mp->m_inodes;
+		if (ip == NULL) {
+			break;
+		}
+		do {
+			/* Make sure we skip markers inserted by sync */
+			if (ip->i_mount == NULL) {
+				ip = ip->i_mnext;
+				continue;
+			}
+
+			/*
+			 * It's up to our caller to purge the root
+			 * and quota vnodes later.
+			 */
+			vp = XFS_ITOV_NULL(ip);
+
+			if (!vp) {
+				XFS_MOUNT_IUNLOCK(mp);
+				xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
+				purged = 1;
+				break;
+			}
+
+			if (vn_count(vp) != 0) {
+				if (vn_count(vp) == 1 &&
+				    (ip == mp->m_rootip ||
+				     (mp->m_quotainfo &&
+				      (ip->i_ino == mp->m_sb.sb_uquotino ||
+				       ip->i_ino == mp->m_sb.sb_gquotino)))) {
+
+					ip = ip->i_mnext;
+					continue;
+				}
+				if (!(flag & XFS_FLUSH_ALL)) {
+					ASSERT(0);
+					busy = 1;
+					done = 1;
+					break;
+				}
+				/*
+				 * Ignore busy inodes but continue flushing
+				 * others.
+				 */
+				ip = ip->i_mnext;
+				continue;
+			}
+			/*
+			 * Sample vp mapping while holding mp locked on MP
+			 * systems, so we don't purge a reclaimed or
+			 * nonexistent vnode.  We break from the loop
+			 * since we know that we modify
+			 * it by pulling ourselves from it in xfs_reclaim()
+			 * called via vn_purge() below.	 Set ip to the next
+			 * entry in the list anyway so we'll know below
+			 * whether we reached the end or not.
+			 */
+			VMAP(vp, ip, vmap);
+			vp->v_flag |= VPURGE;		/* OK for vn_purge */
+			XFS_MOUNT_IUNLOCK(mp);
+
+			vn_purge(vp, &vmap);
+
+			purged = 1;
+			break;
+		} while (ip != mp->m_inodes);
+		/*
+		 * We need to distinguish between when we exit the loop
+		 * after a purge and when we simply hit the end of the
+		 * list.  We can't use the (ip == mp->m_inodes) test,
+		 * because when we purge an inode at the start of the list
+		 * the next inode on the list becomes mp->m_inodes.  That
+		 * would cause such a test to bail out early.  The purged
+		 * variable tells us how we got out of the loop.
+		 */
+		if (!purged) {
+			done = 1;
+		}
+	}
+	XFS_MOUNT_IUNLOCK(mp);
+	return !busy;
+}
+
+
+/*
+ * xfs_iaccess: check accessibility of inode for mode.
+ */
+int
+xfs_iaccess(
+	xfs_inode_t	*ip,
+	mode_t		mode,
+	cred_t		*cr)
+{
+	int		error;
+	mode_t		orgmode = mode;
+	struct inode	*inode = LINVFS_GET_IP(XFS_ITOV(ip));
+
+	/*
+	 * Verify that the MAC policy allows the requested access.
+	 */
+	if ((error = _MAC_XFS_IACCESS(ip, mode, cr)))
+		return XFS_ERROR(error);
+
+	if (mode & IWRITE) {
+		umode_t		imode = inode->i_mode;
+
+		if (IS_RDONLY(inode) &&
+		    (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode)))
+			return XFS_ERROR(EROFS);
+	}
+
+	/*
+	 * If there's an Access Control List it's used instead of
+	 * the mode bits.
+	 */
+	if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1)
+		return error ? XFS_ERROR(error) : 0;
+
+	if (current->fsuid != ip->i_d.di_uid) {
+		mode >>= 3;
+		if (!in_group_p((gid_t)ip->i_d.di_gid))
+			mode >>= 3;
+	}
+
+	/*
+	 * If the DACs are ok we don't need any capability check.
+	 */
+	if ((ip->i_d.di_mode & mode) == mode)
+		return 0;
+	/*
+	 * Read/write DACs are always overridable.
+	 * Executable DACs are overridable if at least one exec bit is set.
+	 */
+	if ((orgmode & (IREAD|IWRITE)) || (inode->i_mode & S_IXUGO))
+		if (capable_cred(cr, CAP_DAC_OVERRIDE))
+			return 0;
+
+	if ((orgmode == IREAD) ||
+	    (((ip->i_d.di_mode & IFMT) == IFDIR) &&
+	     (!(orgmode & ~(IWRITE|IEXEC))))) {
+		if (capable_cred(cr, CAP_DAC_READ_SEARCH))
+			return 0;
+#ifdef	NOISE
+		cmn_err(CE_NOTE, "Ick: mode=%o, orgmode=%o", mode, orgmode);
+#endif	/* NOISE */
+		return XFS_ERROR(EACCES);
+	}
+	return XFS_ERROR(EACCES);
+}
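+
+/*
+ * Editorial note: the two "mode >>= 3" shifts in xfs_iaccess() walk the
+ * requested rwx bits from the owner triple to the group triple to the
+ * "other" triple of the 9-bit permission mask.  For example, a request
+ * for IREAD (0400) made by a non-owning group member is shifted to 0040
+ * before the (ip->i_d.di_mode & mode) == mode test.
+ */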
+
+/*
+ * Return whether or not it is OK to swap to the given file in the
+ * given range.	 Return 0 for OK and otherwise return the error.
+ *
+ * It is only OK to swap to a file if it has no holes, and all
+ * extents have been initialized.
+ *
+ * We use the vnode behavior chain prevent and allow primitives
+ * to ensure that the vnode chain stays coherent while we do this.
+ * This allows us to walk the chain down to the bottom where XFS
+ * lives without worrying about it changing out from under us.
+ */
+int
+xfs_swappable(
+	bhv_desc_t	*bdp)
+{
+	xfs_inode_t	*ip;
+
+	ip = XFS_BHVTOI(bdp);
+	/*
+	 * Verify that the file does not have any
+	 * holes or unwritten extents.
+	 */
+	return xfs_bmap_check_swappable(ip);
+}
+
+/*
+ * xfs_iroundup: round up argument to next power of two
+ */
+uint
+xfs_iroundup(
+	uint	v)
+{
+	int i;
+	uint m;
+
+	if ((v & (v - 1)) == 0)
+		return v;
+	ASSERT((v & 0x80000000) == 0);
+	if ((v & (v + 1)) == 0)
+		return v + 1;
+	for (i = 0, m = 1; i < 31; i++, m <<= 1) {
+		if (v & m)
+			continue;
+		v |= m;
+		if ((v & (v + 1)) == 0)
+			return v + 1;
+	}
+	ASSERT(0);
+	return 0;
+}
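+
+/*
+ * Editorial sketch (not part of the patch): the loop above ORs in low
+ * bits until v + 1 is a power of two.  The classic branch-free equivalent
+ * smears the top set bit downward and adds one, under the same
+ * precondition that bit 31 is clear:
+ */
+#ifdef XFS_DOC_EXAMPLE		/* illustration only, never built */
+static uint ex_roundup_pow2(uint v)
+{
+	v--;			/* so exact powers of two map to themselves */
+	v |= v >> 1;
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	return v + 1;
+}
+#endif	/* XFS_DOC_EXAMPLE */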
+
+/*
+ * Change the requested timestamp in the given inode.
+ * We don't lock across timestamp updates, and we don't log them but
+ * we do record the fact that there is dirty information in core.
+ *
+ * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
+ *		with XFS_ICHGTIME_ACC to be sure that the access time
+ *		update will take.  Calling first with XFS_ICHGTIME_ACC
+ *		and then XFS_ICHGTIME_MOD may fail to modify the access
+ *		timestamp if the filesystem is mounted noatime.
+ */
+void
+xfs_ichgtime(xfs_inode_t *ip,
+	     int flags)
+{
+	timespec_t	tv;
+	vnode_t		*vp = XFS_ITOV(ip);
+	struct inode	*inode = LINVFS_GET_IP(vp);
+
+	/*
+	 * We're not supposed to change timestamps in readonly-mounted
+	 * filesystems.	 Throw it away if anyone asks us.
+	 */
+	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
+		return;
+
+	/*
+	 * Don't update access timestamps on reads if mounted "noatime".
+	 * Throw it away if anyone asks us.
+	 */
+	if (ip->i_mount->m_flags & XFS_MOUNT_NOATIME &&
+	    ((flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG))
+			== XFS_ICHGTIME_ACC))
+		return;
+
+	nanotime(&tv);
+	if (flags & XFS_ICHGTIME_MOD) {
+		inode->i_mtime = ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
+		ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
+	}
+	if (flags & XFS_ICHGTIME_ACC) {
+		inode->i_atime = ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
+		ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
+	}
+	if (flags & XFS_ICHGTIME_CHG) {
+		inode->i_ctime = ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
+		ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
+	}
+
+	/*
+	 * We update the i_update_core field _after_ changing
+	 * the timestamps in order to coordinate properly with
+	 * xfs_iflush() so that we don't lose timestamp updates.
+	 * This keeps us from having to hold the inode lock
+	 * while doing this.  We use the SYNCHRONIZE macro to
+	 * ensure that the compiler does not reorder the update
+	 * of i_update_core above the timestamp updates above.
+	 */
+	SYNCHRONIZE();
+	ip->i_update_core = 1;
+}
+
+/*
+ * xfs_ibusy_check -- Checks whether inode reference count allows unmount
+ *
+ * The value returned is one if the reference count would prevent an unmount.
+ */
+int
+xfs_ibusy_check(
+	xfs_inode_t	*ip,
+	int		refs)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+
+	if ((refs == 1) && (ip == mp->m_rootip))
+		return (0);
+	if ((refs == 1) && (ip == mp->m_rbmip))
+		return (0);
+	if ((refs == 1) && (ip == mp->m_rsumip))
+		return (0);
+	if (mp->m_quotainfo && ip->i_ino == mp->m_sb.sb_uquotino)
+		return (0);
+	if (mp->m_quotainfo && ip->i_ino == mp->m_sb.sb_gquotino)
+		return (0);
+	return (1);
+}
+
+#ifdef XFS_ILOCK_TRACE
+void
+xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
+{
+	ktrace_enter(ip->i_lock_trace,
+		     (void *)ip,
+		      (void *)(__psint_t)lock,		/* 1 = LOCK, 3=UNLOCK, etc */
+		     (void *)(__psint_t)lockflags,	/* XFS_ILOCK_EXCL etc */
+		     (void *)ra,			/* caller of ilock */
+		     (void *)(__psint_t)cpuid(),
+		     (void *)(__psint_t)current_pid(),
+		     0,0,0,0,0,0,0,0,0,0);
+
+}
+#endif /* XFS_ILOCK_TRACE */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_inode.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_inode.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_inode.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_inode.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_INODE_H__
+#define __XFS_INODE_H__
+
+/*
+ * File incore extent information, present for each of data & attr forks.
+ */
+#define XFS_INLINE_EXTS 2
+#define XFS_INLINE_DATA 32
+typedef struct xfs_ifork {
+	int			if_bytes;	/* bytes in if_u1 */
+	int			if_real_bytes;	/* bytes allocated in if_u1 */
+	xfs_bmbt_block_t	*if_broot;	/* file's incore btree root */
+	short			if_broot_bytes; /* bytes allocated for root */
+	unsigned char		if_flags;	/* per-fork flags */
+	unsigned char		if_ext_max;	/* max # of extent records */
+	xfs_extnum_t		if_lastex;	/* last if_extents used */
+	union {
+		xfs_bmbt_rec_t	*if_extents;	/* linear map file exts */
+		char		*if_data;	/* inline file data */
+	} if_u1;
+	union {
+		xfs_bmbt_rec_t	if_inline_ext[XFS_INLINE_EXTS];
+						/* very small file extents */
+		char		if_inline_data[XFS_INLINE_DATA];
+						/* very small file data */
+		xfs_dev_t	if_rdev;	/* dev number if special */
+		uuid_t		if_uuid;	/* mount point value */
+	} if_u2;
+} xfs_ifork_t;
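+
+/*
+ * Editorial note: if_u2 is a small buffer optimization -- a fork holding
+ * at most XFS_INLINE_EXTS extent records or XFS_INLINE_DATA bytes of
+ * inline data lives directly in the inode, and if_u1 then points into
+ * if_u2 rather than at a separate allocation (xfs_iext_realloc() and
+ * xfs_idata_realloc() maintain this).  Which case applies is a pointer
+ * comparison, e.g.:
+ *
+ *	int on_heap = (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext);
+ */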
+
+/*
+ * Flags for xfs_ichgtime().
+ */
+#define XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
+#define XFS_ICHGTIME_ACC	0x2	/* data fork access timestamp */
+#define XFS_ICHGTIME_CHG	0x4	/* inode field change timestamp */
+
+/*
+ * Per-fork incore inode flags.
+ */
+#define XFS_IFINLINE	0x0001	/* Inline data is read in */
+#define XFS_IFEXTENTS	0x0002	/* All extent pointers are read in */
+#define XFS_IFBROOT	0x0004	/* i_broot points to the bmap b-tree root */
+
+/*
+ * Flags for xfs_imap() and xfs_dilocate().
+ */
+#define XFS_IMAP_LOOKUP		0x1
+
+/*
+ * Maximum number of extent pointers in if_u1.if_extents.
+ */
+#define XFS_MAX_INCORE_EXTENTS	32768
+
+
+#ifdef __KERNEL__
+struct bhv_desc;
+struct cred;
+struct ktrace;
+struct vnode;
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_bmbt_irec;
+struct xfs_bmbt_block;
+struct xfs_inode;
+struct xfs_inode_log_item;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot;
+struct pm;
+
+
+/*
+ * This structure is used to communicate which extents of a file
+ * were holes when a write started from xfs_write_file() to
+ * xfs_strat_read().  This is necessary so that we can know which
+ * blocks need to be zeroed when they are read in by xfs_strat_read()
+ * if they weren't allocated when the buffer given to xfs_strat_read()
+ * was mapped.
+ *
+ * We keep a list of these attached to the inode.  The list is
+ * protected by the inode lock and the fact that the io lock is
+ * held exclusively by writers.
+ */
+typedef struct xfs_gap {
+	struct xfs_gap	*xg_next;
+	xfs_fileoff_t	xg_offset_fsb;
+	xfs_extlen_t	xg_count_fsb;
+} xfs_gap_t;
+
+/*
+ * This structure is used to hold common pieces of the buffer
+ * and file for xfs_dio_write and xfs_dio_read.
+ */
+typedef struct xfs_dio {
+	struct xfs_buf	*xd_bp;
+	bhv_desc_t	*xd_bdp;
+	struct xfs_inode *xd_ip;
+	struct xfs_iocore *xd_io;
+	struct cred	*xd_cr;
+	struct pm	*xd_pmp;
+	int		xd_blkalgn;
+	int		xd_ioflag;
+	xfs_off_t	xd_start;
+	size_t		xd_length;
+} xfs_dio_t;
+
+typedef struct dm_attrs_s {
+	__uint32_t	da_dmevmask;	/* DMIG event mask */
+	__uint16_t	da_dmstate;	/* DMIG state info */
+	__uint16_t	da_pad;		/* DMIG extra padding */
+} dm_attrs_t;
+
+typedef struct xfs_iocore {
+	void			*io_obj;	/* pointer to container
+						 * inode or dcxvn structure */
+	struct xfs_mount	*io_mount;	/* fs mount struct ptr */
+#ifdef DEBUG
+	mrlock_t		*io_lock;	/* inode IO lock */
+	mrlock_t		*io_iolock;	/* inode IO lock */
+#endif
+
+	/* I/O state */
+	xfs_fsize_t		io_new_size;	/* sz when write completes */
+
+	/* Miscellaneous state. */
+	unsigned int		io_flags;	/* IO related flags */
+
+	/* DMAPI state */
+	dm_attrs_t		io_dmattrs;
+
+} xfs_iocore_t;
+
+#define	       io_dmevmask     io_dmattrs.da_dmevmask
+#define	       io_dmstate      io_dmattrs.da_dmstate
+
+#define XFS_IO_INODE(io)	((xfs_inode_t *) ((io)->io_obj))
+#define XFS_IO_DCXVN(io)	((dcxvn_t *) ((io)->io_obj))
+
+/*
+ * Flags in the flags field
+ */
+
+#define XFS_IOCORE_ISXFS	0x01
+#define XFS_IOCORE_ISCXFS	0x02
+#define XFS_IOCORE_RT		0x04
+
+#define IO_IS_XFS(io)	((io)->io_flags & XFS_IOCORE_ISXFS)
+
+/*
+ * xfs_iocore prototypes
+ */
+
+extern void xfs_iocore_inode_init(struct xfs_inode *);
+extern void xfs_iocore_inode_reinit(struct xfs_inode *);
+
+
+/*
+ * This is the type used in the xfs inode hash table.
+ * An array of these is allocated for each mounted
+ * file system to hash the inodes for that file system.
+ */
+typedef struct xfs_ihash {
+	struct xfs_inode	*ih_next;	
+	rwlock_t		ih_lock;
+	uint			ih_version;
+} xfs_ihash_t;
+
+/*
+ * Inode hashing and hash bucket locking.
+ */
+#define XFS_BUCKETS(mp) (37*(mp)->m_sb.sb_agcount-1)
+#define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)ino) % (mp)->m_ihsize))
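+
+/*
+ * Editorial note: the table is sized in proportion to the number of
+ * allocation groups -- with 4 AGs, XFS_BUCKETS() yields 37*4-1 == 147
+ * buckets -- and XFS_IHASH() picks a bucket with a simple ino % m_ihsize
+ * (m_ihsize presumably being initialized from XFS_BUCKETS()), so runs of
+ * consecutive inode numbers spread across different buckets.
+ */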
+
+/*
+ * This is the xfs inode cluster hash.	This hash is used by xfs_iflush to
+ * find inodes that share a cluster and can be flushed to disk at the same
+ * time.
+ */
+
+typedef struct xfs_chashlist {
+	struct xfs_chashlist	*chl_next;
+	struct xfs_inode	*chl_ip;
+	xfs_daddr_t		chl_blkno;	/* starting block number of
+						 * the cluster */
+#ifdef DEBUG
+	struct xfs_buf		*chl_buf;	/* debug: the inode buffer */
+#endif
+} xfs_chashlist_t;
+
+typedef struct xfs_chash {
+	xfs_chashlist_t		*ch_list;
+	lock_t			ch_lock;
+} xfs_chash_t;
+
+
+/*
+ * This is the xfs in-core inode structure.
+ * Most of the on-disk inode is embedded in the i_d field.
+ *
+ * The extent pointers/inline file space, however, are managed
+ * separately.  The memory for this information is pointed to by
+ * the if_u1 union, depending on the type of the data.
+ * It linearizes the array of extents for fast in-core access;
+ * this is used until the file's number of extents surpasses
+ * XFS_MAX_INCORE_EXTENTS, at which point all extent pointers
+ * are accessed through the buffer cache.
+ *
+ * Other state kept in the in-core inode is used for identification,
+ * locking, transactional updating, etc of the inode.
+ *
+ * Generally, we do not want to acquire the i_iolock while already
+ * holding the i_lock; the lock ordering is i_iolock first, then i_lock.
+ *
+ * xfs_iptr_t contains all the inode fields up to and including the
+ * i_mnext and i_mprev fields; it is used as a marker in the inode
+ * chain off the mount structure by xfs_sync calls.
+ */
+
+typedef struct {
+	struct xfs_ihash	*ip_hash;	/* pointer to hash header */
+	struct xfs_inode	*ip_next;	/* inode hash link forw */
+	struct xfs_inode	*ip_mnext;	/* next inode in mount list */
+	struct xfs_inode	*ip_mprev;	/* ptr to prev inode */
+	struct xfs_inode	**ip_prevp;	/* ptr to prev i_next */
+	struct xfs_mount	*ip_mount;	/* fs mount struct ptr */
+} xfs_iptr_t;
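+
+/*
+ * Note that this works only because xfs_iptr_t mirrors the initial
+ * members of xfs_inode_t exactly, so a marker xfs_iptr_t linked into
+ * the mount list can be treated as an inode by the chain-walking
+ * code until the marker is removed.
+ */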
+
+typedef struct xfs_inode {
+	/* Inode linking and identification information. */
+	struct xfs_ihash	*i_hash;	/* pointer to hash header */
+	struct xfs_inode	*i_next;	/* inode hash link forw */
+	struct xfs_inode	*i_mnext;	/* next inode in mount list */
+	struct xfs_inode	*i_mprev;	/* ptr to prev inode */
+	struct xfs_inode	**i_prevp;	/* ptr to prev i_next */
+	struct xfs_mount	*i_mount;	/* fs mount struct ptr */
+	struct bhv_desc		i_bhv_desc;	/* inode behavior descriptor*/
+	struct xfs_dquot	*i_udquot;	/* user dquot */
+	struct xfs_dquot	*i_gdquot;	/* group dquot */
+
+	/* Inode location stuff */
+	xfs_ino_t		i_ino;		/* inode number (agno/agino)*/
+	xfs_daddr_t		i_blkno;	/* blkno of inode buffer */
+	ushort			i_len;		/* len of inode buffer */
+	ushort			i_boffset;	/* off of inode in buffer */
+
+	/* Extent information. */
+	xfs_ifork_t		*i_afp;		/* attribute fork pointer */
+	xfs_ifork_t		i_df;		/* data fork */
+
+	/* Transaction and locking information. */
+	struct xfs_trans	*i_transp;	/* ptr to owning transaction*/
+	struct xfs_inode_log_item *i_itemp;	/* logging information */
+	mrlock_t		i_lock;		/* inode lock */
+	mrlock_t		i_iolock;	/* inode IO lock */
+	sema_t			i_flock;	/* inode flush lock */
+	atomic_t		i_pincount;	/* inode pin count */
+	wait_queue_head_t	i_ipin_wait;	/* inode pinning wait queue */
+	struct xfs_inode	**i_refcache;	/* ptr to entry in ref cache */
+	struct xfs_inode	*i_release;	/* inode to unref */
+
+	/* I/O state */
+	xfs_iocore_t		i_iocore;	/* I/O core */
+
+	/* Miscellaneous state. */
+	unsigned short		i_flags;	/* see defined flags below */
+	unsigned char		i_update_core;	/* timestamps/size is dirty */
+	unsigned char		i_update_size;	/* di_size field is dirty */
+	unsigned int		i_gen;		/* generation count */
+	unsigned int		i_delayed_blks; /* count of delay alloc blks */
+
+	xfs_dinode_core_t	i_d;		/* most of ondisk inode */
+	xfs_chashlist_t		*i_chash;	/* cluster hash list header */
+	struct xfs_inode	*i_cnext;	/* cluster hash link forward */
+	struct xfs_inode	*i_cprev;	/* cluster hash link backward */
+
+#ifdef DEBUG
+	/* Trace buffers per inode. */
+	struct ktrace		*i_xtrace;	/* inode extent list trace */
+	struct ktrace		*i_btrace;	/* inode bmap btree trace */
+	struct ktrace		*i_rwtrace;	/* inode read/write trace */
+	struct ktrace		*i_strat_trace; /* inode strat_write trace */
+	struct ktrace		*i_lock_trace;	/* inode lock/unlock trace */
+	struct ktrace		*i_dir_trace;	/* inode directory trace */
+#endif /* DEBUG */
+} xfs_inode_t;
+
+#endif	/* __KERNEL__ */
+
+
+/*
+ * Fork handling.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_PTR)
+xfs_ifork_t *xfs_ifork_ptr(xfs_inode_t *ip, int w);
+#define XFS_IFORK_PTR(ip,w)		xfs_ifork_ptr(ip,w)
+#else
+#define XFS_IFORK_PTR(ip,w)   ((w) == XFS_DATA_FORK ? &(ip)->i_df : (ip)->i_afp)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_Q)
+int xfs_ifork_q(xfs_inode_t *ip);
+#define XFS_IFORK_Q(ip)			xfs_ifork_q(ip)
+#else
+#define XFS_IFORK_Q(ip)			XFS_CFORK_Q(&(ip)->i_d)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_DSIZE)
+int xfs_ifork_dsize(xfs_inode_t *ip);
+#define XFS_IFORK_DSIZE(ip)		xfs_ifork_dsize(ip)
+#else
+#define XFS_IFORK_DSIZE(ip)		XFS_CFORK_DSIZE(&ip->i_d, ip->i_mount)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_ASIZE)
+int xfs_ifork_asize(xfs_inode_t *ip);
+#define XFS_IFORK_ASIZE(ip)		xfs_ifork_asize(ip)
+#else
+#define XFS_IFORK_ASIZE(ip)		XFS_CFORK_ASIZE(&ip->i_d, ip->i_mount)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_SIZE)
+int xfs_ifork_size(xfs_inode_t *ip, int w);
+#define XFS_IFORK_SIZE(ip,w)		xfs_ifork_size(ip,w)
+#else
+#define XFS_IFORK_SIZE(ip,w)		XFS_CFORK_SIZE(&ip->i_d, ip->i_mount, w)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_FORMAT)
+int xfs_ifork_format(xfs_inode_t *ip, int w);
+#define XFS_IFORK_FORMAT(ip,w)		xfs_ifork_format(ip,w)
+#else
+#define XFS_IFORK_FORMAT(ip,w)		XFS_CFORK_FORMAT(&ip->i_d, w)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_FMT_SET)
+void xfs_ifork_fmt_set(xfs_inode_t *ip, int w, int n);
+#define XFS_IFORK_FMT_SET(ip,w,n)	xfs_ifork_fmt_set(ip,w,n)
+#else
+#define XFS_IFORK_FMT_SET(ip,w,n)	XFS_CFORK_FMT_SET(&ip->i_d, w, n)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_NEXTENTS)
+int xfs_ifork_nextents(xfs_inode_t *ip, int w);
+#define XFS_IFORK_NEXTENTS(ip,w)	xfs_ifork_nextents(ip,w)
+#else
+#define XFS_IFORK_NEXTENTS(ip,w)	XFS_CFORK_NEXTENTS(&ip->i_d, w)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_NEXT_SET)
+void xfs_ifork_next_set(xfs_inode_t *ip, int w, int n);
+#define XFS_IFORK_NEXT_SET(ip,w,n)	xfs_ifork_next_set(ip,w,n)
+#else
+#define XFS_IFORK_NEXT_SET(ip,w,n)	XFS_CFORK_NEXT_SET(&ip->i_d, w, n)
+#endif
+
+
+#ifdef __KERNEL__
+
+/*
+ * In-core inode flags.
+ */
+#define XFS_IGRIO	0x0001	/* inode used for guaranteed rate i/o */
+#define XFS_IUIOSZ	0x0002	/* inode i/o sizes have been explicitly set */
+#define XFS_IQUIESCE	0x0004	/* we have started quiescing for this inode */
+#define XFS_IRECLAIM	0x0008	/* we have started reclaiming this inode    */
+
+/*
+ * Flags for inode locking.
+ */
+#define XFS_IOLOCK_EXCL		0x001
+#define XFS_IOLOCK_SHARED	0x002
+#define XFS_ILOCK_EXCL		0x004
+#define XFS_ILOCK_SHARED	0x008
+#define XFS_IUNLOCK_NONOTIFY	0x010
+#define XFS_EXTENT_TOKEN_RD	0x040
+#define XFS_SIZE_TOKEN_RD	0x080
+#define XFS_EXTSIZE_RD		(XFS_EXTENT_TOKEN_RD|XFS_SIZE_TOKEN_RD)
+#define XFS_WILLLEND		0x100	/* Always acquire tokens for lending */
+#define XFS_EXTENT_TOKEN_WR	(XFS_EXTENT_TOKEN_RD | XFS_WILLLEND)
+#define XFS_SIZE_TOKEN_WR	(XFS_SIZE_TOKEN_RD | XFS_WILLLEND)
+#define XFS_EXTSIZE_WR		(XFS_EXTSIZE_RD | XFS_WILLLEND)
+
+
+#define XFS_LOCK_MASK	\
+	(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL | \
+	 XFS_ILOCK_SHARED | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD | \
+	 XFS_WILLLEND)
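+
+/*
+ * Illustrative use (not a definition from this file): callers take
+ * both locks up front with a combined flags argument, e.g.
+ *
+ *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ *	...
+ *	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ *
+ * honoring the i_iolock-before-i_lock ordering described above.
+ */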
+
+/*
+ * Flags for xfs_iflush()
+ */
+#define XFS_IFLUSH_DELWRI_ELSE_SYNC	1
+#define XFS_IFLUSH_DELWRI_ELSE_ASYNC	2
+#define XFS_IFLUSH_SYNC			3
+#define XFS_IFLUSH_ASYNC		4
+#define XFS_IFLUSH_DELWRI		5
+
+/*
+ * Flags for xfs_iflush_all.
+ */
+#define XFS_FLUSH_ALL		0x1
+
+/*
+ * Flags for xfs_itruncate_start().
+ */
+#define XFS_ITRUNC_DEFINITE	0x1
+#define XFS_ITRUNC_MAYBE	0x2
+
+/*
+ * max file offset is 2^(31+PAGE_SHIFT) - 1 (due to linux page cache)
+ *
+ * NOTE: XFS itself can handle 2^63 - 1 (largest positive value of xfs_fsize_t)
+ * but Linux can't go above 2^(31+PAGE_SHIFT)-1: the Linux VM uses a 32 bit
+ * signed variable to index cache data, so 2^31 * PAGE_SIZE is as big as
+ * you can go.
+ */
+#define XFS_MAX_FILE_OFFSET	((long long)((1ULL<<(31+PAGE_SHIFT))-1ULL))
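+
+/*
+ * For example, with 4K pages (PAGE_SHIFT == 12) this evaluates to
+ * 2^43 - 1, i.e. files just under 8 TiB.
+ */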
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ITOV)
+struct vnode *xfs_itov(xfs_inode_t *ip);
+#define XFS_ITOV(ip)		xfs_itov(ip)
+#else
+#define XFS_ITOV(ip)		BHV_TO_VNODE(XFS_ITOBHV(ip))
+#endif
+#define XFS_ITOV_NULL(ip)	BHV_TO_VNODE_NULL(XFS_ITOBHV(ip))
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ITOBHV)
+struct bhv_desc *xfs_itobhv(xfs_inode_t *ip);
+#define XFS_ITOBHV(ip)		xfs_itobhv(ip)
+#else
+#define XFS_ITOBHV(ip)		((struct bhv_desc *)(&((ip)->i_bhv_desc)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BHVTOI)
+xfs_inode_t *xfs_bhvtoi(struct bhv_desc *bhvp);
+#define XFS_BHVTOI(bhvp)	xfs_bhvtoi(bhvp)
+#else
+#define XFS_BHVTOI(bhvp)	\
+	((xfs_inode_t *)((char *)(bhvp) - \
+			 (char *)&(((xfs_inode_t *)0)->i_bhv_desc)))
+#endif
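+
+/*
+ * XFS_BHVTOI() is the usual "container of" idiom: subtracting the
+ * byte offset of i_bhv_desc within xfs_inode_t from the behavior
+ * descriptor pointer recovers the enclosing inode.
+ */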
+
+#define BHV_IS_XFS(bdp)		(BHV_OPS(bdp) == &xfs_vnodeops)
+
+/*
+ * Pick the inode cluster hash bucket
+ * (m_chash is the same size as m_ihash)
+ */
+#define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize))
+
+/*
+ * For multiple group support: if the ISGID bit is set in the parent
+ * directory, the group of a new file is set to that of the parent,
+ * and a new subdirectory inherits the ISGID bit from its parent.
+ */
+#define XFS_INHERIT_GID(pip, vfsp)	((pip) != NULL && \
+	(((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & ISGID)))
+
+/*
+ * xfs_iget.c prototypes.
+ */
+void		xfs_ihash_init(struct xfs_mount *);
+void		xfs_ihash_free(struct xfs_mount *);
+void		xfs_chash_init(struct xfs_mount *);
+void		xfs_chash_free(struct xfs_mount *);
+xfs_inode_t	*xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
+				  struct xfs_trans *);
+void		xfs_inode_lock_init(xfs_inode_t *, struct vnode *);
+int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
+			 uint, xfs_inode_t **, xfs_daddr_t);
+int		xfs_vn_iget(vfs_t *, struct vnode *, xfs_ino_t);
+void		xfs_iput(xfs_inode_t *, uint);
+void		xfs_iput_new(xfs_inode_t *, uint);
+void		xfs_ilock(xfs_inode_t *, uint);
+int		xfs_ilock_nowait(xfs_inode_t *, uint);
+void		xfs_iunlock(xfs_inode_t *, uint);
+void		xfs_ilock_demote(xfs_inode_t *, uint);
+void		xfs_iflock(xfs_inode_t *);
+int		xfs_iflock_nowait(xfs_inode_t *);
+uint		xfs_ilock_map_shared(xfs_inode_t *);
+void		xfs_iunlock_map_shared(xfs_inode_t *, uint);
+void		xfs_ifunlock(xfs_inode_t *);
+void		xfs_ireclaim(xfs_inode_t *);
+int		xfs_finish_reclaim(xfs_inode_t *, int, int);
+
+/*
+ * xfs_inode.c prototypes.
+ */
+int		xfs_inotobp(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
+			    xfs_dinode_t **, struct xfs_buf **, int *);
+int		xfs_itobp(struct xfs_mount *, struct xfs_trans *,
+			  xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **,
+			  xfs_daddr_t);
+int		xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
+			  xfs_inode_t **, xfs_daddr_t);
+int		xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
+int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, nlink_t,
+			   dev_t, struct cred *, xfs_prid_t, int,
+			   struct xfs_buf **, boolean_t *, xfs_inode_t **);
+void		xfs_xlate_dinode_core(xfs_caddr_t, struct xfs_dinode_core *, int,
+			   xfs_arch_t);
+int		xfs_ifree(struct xfs_trans *, xfs_inode_t *);
+int		xfs_atruncate_start(xfs_inode_t *);
+void		xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t);
+int		xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
+				     xfs_fsize_t, int, int);
+int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
+int		xfs_igrow_start(xfs_inode_t *, xfs_fsize_t, struct cred *);
+void		xfs_igrow_finish(struct xfs_trans *, xfs_inode_t *,
+				 xfs_fsize_t, int);
+
+void		xfs_idestroy_fork(xfs_inode_t *, int);
+void		xfs_idestroy(xfs_inode_t *);
+void		xfs_idata_realloc(xfs_inode_t *, int, int);
+void		xfs_iextract(xfs_inode_t *);
+void		xfs_iext_realloc(xfs_inode_t *, int, int);
+void		xfs_iroot_realloc(xfs_inode_t *, int, int);
+void		xfs_ipin(xfs_inode_t *);
+void		xfs_iunpin(xfs_inode_t *);
+int		xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_32_t *, int);
+int		xfs_iflush(xfs_inode_t *, uint);
+int		xfs_iflush_all(struct xfs_mount *, int);
+int		xfs_ibusy_check(xfs_inode_t *, int);
+int		xfs_iaccess(xfs_inode_t *, mode_t, cred_t *);
+uint		xfs_iroundup(uint);
+void		xfs_ichgtime(xfs_inode_t *, int);
+xfs_fsize_t	xfs_file_last_byte(xfs_inode_t *);
+void		xfs_lock_inodes(xfs_inode_t **, int, int, uint);
+
+#define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))
+
+
+
+#ifdef DEBUG
+void		xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t);
+#else	/* DEBUG */
+#define xfs_isize_check(mp, ip, isize)
+#endif	/* DEBUG */
+
+#if defined(DEBUG)
+void		xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+#else
+#define xfs_inobp_check(mp, bp)
+#endif /* DEBUG */
+
+extern struct kmem_zone	*xfs_chashlist_zone;
+extern struct kmem_zone	*xfs_ifork_zone;
+extern struct kmem_zone	*xfs_inode_zone;
+extern struct kmem_zone	*xfs_ili_zone;
+extern struct vnodeops	xfs_vnodeops;
+
+#ifdef XFS_ILOCK_TRACE
+#define XFS_ILOCK_KTRACE_SIZE	32
+void	xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags,
+			inst_t *ra);
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_INODE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_inode_item.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_inode_item.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_inode_item.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_inode_item.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1024 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains the implementation of the xfs_inode_log_item.
+ * It contains the item operations used to manipulate the inode log
+ * items as well as utility routines used by the inode specific
+ * transaction routines.
+ */
+#include <xfs.h>
+
+
+kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We need one iovec for the inode log format structure, one for the
+ * inode core, and possibly one for the inode data/extents/b-tree root
+ * and one for the inode attribute data/extents/b-tree root.
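+ *
+ * For example (illustrative): an extents-format data fork with no
+ * attribute fork logs three iovecs (format structure, inode core,
+ * extents array); adding a local-format attribute fork with inline
+ * data would make it four.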
+ */
+STATIC uint
+xfs_inode_item_size(
+	xfs_inode_log_item_t	*iip)
+{
+	uint		nvecs;
+	xfs_inode_t	*ip;
+
+	ip = iip->ili_inode;
+	nvecs = 2;
+
+	/*
+	 * Only log the data/extents/b-tree root if there is something
+	 * left to log.
+	 */
+	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
+		    (ip->i_d.di_nextents > 0) &&
+		    (ip->i_df.if_bytes > 0)) {
+			ASSERT(ip->i_df.if_u1.if_extents != NULL);
+			nvecs++;
+		} else {
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		ASSERT(ip->i_df.if_ext_max ==
+		       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
+		    (ip->i_df.if_broot_bytes > 0)) {
+			ASSERT(ip->i_df.if_broot != NULL);
+			nvecs++;
+		} else {
+			ASSERT(!(iip->ili_format.ilf_fields &
+				 XFS_ILOG_DBROOT));
+#ifdef XFS_TRANS_DEBUG
+			if (iip->ili_root_size > 0) {
+				ASSERT(iip->ili_root_size ==
+				       ip->i_df.if_broot_bytes);
+				ASSERT(bcmp(iip->ili_orig_root,
+					    ip->i_df.if_broot,
+					    iip->ili_root_size) == 0);
+			} else {
+				ASSERT(ip->i_df.if_broot_bytes == 0);
+			}
+#endif
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
+		}
+		break;
+
+	case XFS_DINODE_FMT_LOCAL:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
+		    (ip->i_df.if_bytes > 0)) {
+			ASSERT(ip->i_df.if_u1.if_data != NULL);
+			ASSERT(ip->i_d.di_size > 0);
+			nvecs++;
+		} else {
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
+		}
+		break;
+
+	case XFS_DINODE_FMT_DEV:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEXT | XFS_ILOG_UUID);
+		break;
+
+	case XFS_DINODE_FMT_UUID:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEXT | XFS_ILOG_DEV);
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	/*
+	 * If there are no attributes associated with this file,
+	 * then there cannot be anything more to log.
+	 * Clear all attribute-related log flags.
+	 */
+	if (!XFS_IFORK_Q(ip)) {
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+		return nvecs;
+	}
+
+	/*
+	 * Log any necessary attribute data.
+	 */
+	switch (ip->i_d.di_aformat) {
+	case XFS_DINODE_FMT_EXTENTS:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
+		    (ip->i_d.di_anextents > 0) &&
+		    (ip->i_afp->if_bytes > 0)) {
+			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+			nvecs++;
+		} else {
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
+		    (ip->i_afp->if_broot_bytes > 0)) {
+			ASSERT(ip->i_afp->if_broot != NULL);
+			nvecs++;
+		} else {
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
+		}
+		break;
+
+	case XFS_DINODE_FMT_LOCAL:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
+		    (ip->i_afp->if_bytes > 0)) {
+			ASSERT(ip->i_afp->if_u1.if_data != NULL);
+			nvecs++;
+		} else {
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	return nvecs;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode log item.  It fills the first item with an inode
+ * log format structure, the second with the on-disk inode structure,
+ * and a possible third and/or fourth with the inode data/extents/b-tree
+ * root and the inode attribute data/extents/b-tree root.
+ */
+STATIC void
+xfs_inode_item_format(
+	xfs_inode_log_item_t	*iip,
+	xfs_log_iovec_t		*log_vector)
+{
+	uint			nvecs;
+	xfs_log_iovec_t		*vecp;
+	xfs_inode_t		*ip;
+	size_t			data_bytes;
+	xfs_bmbt_rec_32_t	*ext_buffer;
+	int			nrecs;
+	xfs_mount_t		*mp;
+
+	ip = iip->ili_inode;
+	vecp = log_vector;
+
+	vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
+	vecp->i_len  = sizeof(xfs_inode_log_format_t);
+	vecp++;
+	nvecs	     = 1;
+
+	/*
+	 * Clear i_update_core if the timestamps (or any other
+	 * non-transactional modification) need flushing/logging
+	 * and we're about to log them with the rest of the core.
+	 *
+	 * This is the same logic as in xfs_iflush(), but this code
+	 * cannot run at the same time as xfs_iflush() because we are
+	 * in commit processing here and so hold the inode lock in
+	 * exclusive mode.  (It wouldn't actually matter for the
+	 * timestamps if both routines logged them concurrently.)
+	 *
+	 * We clear i_update_core before copying out the data.
+	 * This is for coordination with our timestamp updates
+	 * that don't hold the inode lock. They will always
+	 * update the timestamps BEFORE setting i_update_core,
+	 * so if we clear i_update_core after they set it we
+	 * are guaranteed to see their updates to the timestamps
+	 * here.  Likewise, if they set it after we clear it
+	 * here, we'll see it either on the next commit of this
+	 * inode or the next time the inode gets flushed via
+	 * xfs_iflush().  This depends on strongly ordered memory
+	 * semantics, but we have that.  We use the SYNCHRONIZE
+	 * macro to make sure that the compiler does not reorder
+	 * the i_update_core store below the data copy that follows.
+	 */
+	if (ip->i_update_core)	{
+		ip->i_update_core = 0;
+		SYNCHRONIZE();
+	}
+
+	/*
+	 * We don't have to worry about re-ordering here because
+	 * the update_size field is protected by the inode lock
+	 * and we have that held in exclusive mode.
+	 */
+	if (ip->i_update_size)
+		ip->i_update_size = 0;
+
+	vecp->i_addr = (xfs_caddr_t)&ip->i_d;
+	vecp->i_len  = sizeof(xfs_dinode_core_t);
+	vecp++;
+	nvecs++;
+	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
+
+	/*
+	 * If this is really an old format inode, then we need to
+	 * log it as such.  This means that we have to copy the link
+	 * count from the new field to the old.	 We don't have to worry
+	 * about the new fields, because nothing trusts them as long as
+	 * the old inode version number is there.  If the superblock already
+	 * has a new version number, then we don't bother converting back.
+	 */
+	mp = ip->i_mount;
+	ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
+	       XFS_SB_VERSION_HASNLINK(&mp->m_sb));
+	if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+		if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+			/*
+			 * Convert it back.
+			 */
+			ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
+			ip->i_d.di_onlink = ip->i_d.di_nlink;
+		} else {
+			/*
+			 * The superblock version has already been bumped,
+			 * so just make the conversion to the new inode
+			 * format permanent.
+			 */
+			ip->i_d.di_version = XFS_DINODE_VERSION_2;
+			ip->i_d.di_onlink = 0;
+			bzero(&(ip->i_d.di_pad[0]), sizeof(ip->i_d.di_pad));
+		}
+	}
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
+			ASSERT(ip->i_df.if_bytes > 0);
+			ASSERT(ip->i_df.if_u1.if_extents != NULL);
+			ASSERT(ip->i_d.di_nextents > 0);
+			ASSERT(iip->ili_extents_buf == NULL);
+			nrecs = ip->i_df.if_bytes /
+				(uint)sizeof(xfs_bmbt_rec_t);
+			ASSERT(nrecs > 0);
+			if (nrecs == ip->i_d.di_nextents) {
+				/*
+				 * There are no delayed allocation
+				 * extents, so just point to the
+				 * real extents array.
+				 */
+				vecp->i_addr =
+					(char *)(ip->i_df.if_u1.if_extents);
+				vecp->i_len = ip->i_df.if_bytes;
+			} else {
+				/*
+				 * There are delayed allocation extents
+				 * in the inode.  Use xfs_iextents_copy()
+				 * to copy only the real extents into
+				 * a separate buffer.  We'll free the
+				 * buffer in the unlock routine.
+				 */
+				ext_buffer = kmem_alloc(ip->i_df.if_bytes,
+					KM_SLEEP);
+				iip->ili_extents_buf = ext_buffer;
+				vecp->i_addr = (xfs_caddr_t)ext_buffer;
+				vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
+					XFS_DATA_FORK);
+			}
+			ASSERT(vecp->i_len <= ip->i_df.if_bytes);
+			iip->ili_format.ilf_dsize = vecp->i_len;
+			vecp++;
+			nvecs++;
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
+			ASSERT(ip->i_df.if_broot_bytes > 0);
+			ASSERT(ip->i_df.if_broot != NULL);
+			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
+			vecp->i_len = ip->i_df.if_broot_bytes;
+			vecp++;
+			nvecs++;
+			iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
+		}
+		break;
+
+	case XFS_DINODE_FMT_LOCAL:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) {
+			ASSERT(ip->i_df.if_bytes > 0);
+			ASSERT(ip->i_df.if_u1.if_data != NULL);
+			ASSERT(ip->i_d.di_size > 0);
+
+			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data;
+			/*
+			 * Round if_bytes up to a word boundary.
+			 * The underlying memory is guaranteed to
+			 * be there by xfs_idata_realloc().
+			 */
+			data_bytes = roundup(ip->i_df.if_bytes, 4);
+			ASSERT((ip->i_df.if_real_bytes == 0) ||
+			       (ip->i_df.if_real_bytes == data_bytes));
+			vecp->i_len = (int)data_bytes;
+			vecp++;
+			nvecs++;
+			iip->ili_format.ilf_dsize = (unsigned)data_bytes;
+		}
+		break;
+
+	case XFS_DINODE_FMT_DEV:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
+			  XFS_ILOG_DDATA | XFS_ILOG_UUID)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+			iip->ili_format.ilf_u.ilfu_rdev =
+				ip->i_df.if_u2.if_rdev;
+		}
+		break;
+
+	case XFS_DINODE_FMT_UUID:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
+			  XFS_ILOG_DDATA | XFS_ILOG_DEV)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+			iip->ili_format.ilf_u.ilfu_uuid =
+				ip->i_df.if_u2.if_uuid;
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	/*
+	 * If there are no attributes associated with the file,
+	 * then we're done.
+	 * Assert that no attribute-related log flags are set.
+	 */
+	if (!XFS_IFORK_Q(ip)) {
+		ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+		iip->ili_format.ilf_size = nvecs;
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
+		return;
+	}
+
+	switch (ip->i_d.di_aformat) {
+	case XFS_DINODE_FMT_EXTENTS:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
+			ASSERT(ip->i_afp->if_bytes > 0);
+			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+			ASSERT(ip->i_d.di_anextents > 0);
+#ifdef DEBUG
+			nrecs = ip->i_afp->if_bytes /
+				(uint)sizeof(xfs_bmbt_rec_t);
+#endif
+			ASSERT(nrecs > 0);
+			ASSERT(nrecs == ip->i_d.di_anextents);
+			/*
+			 * There are no delayed allocation extents
+			 * for attributes, so just point at the array.
+			 */
+			vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents);
+			vecp->i_len = ip->i_afp->if_bytes;
+			iip->ili_format.ilf_asize = vecp->i_len;
+			vecp++;
+			nvecs++;
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_ADATA | XFS_ILOG_AEXT)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
+			ASSERT(ip->i_afp->if_broot_bytes > 0);
+			ASSERT(ip->i_afp->if_broot != NULL);
+			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
+			vecp->i_len = ip->i_afp->if_broot_bytes;
+			vecp++;
+			nvecs++;
+			iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
+		}
+		break;
+
+	case XFS_DINODE_FMT_LOCAL:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) {
+			ASSERT(ip->i_afp->if_bytes > 0);
+			ASSERT(ip->i_afp->if_u1.if_data != NULL);
+
+			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data;
+			/*
+			 * Round if_bytes up to a word boundary.
+			 * The underlying memory is guaranteed to
+			 * be there by xfs_idata_realloc().
+			 */
+			data_bytes = roundup(ip->i_afp->if_bytes, 4);
+			ASSERT((ip->i_afp->if_real_bytes == 0) ||
+			       (ip->i_afp->if_real_bytes == data_bytes));
+			vecp->i_len = (int)data_bytes;
+			vecp++;
+			nvecs++;
+			iip->ili_format.ilf_asize = (unsigned)data_bytes;
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+	iip->ili_format.ilf_size = nvecs;
+}
+
+
+/*
+ * This is called to pin the inode associated with the inode log
+ * item in memory so it cannot be written out.	Do this by calling
+ * xfs_ipin() to bump the pin count in the inode while holding the
+ * inode pin lock.
+ */
+STATIC void
+xfs_inode_item_pin(
+	xfs_inode_log_item_t	*iip)
+{
+	ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE));
+	xfs_ipin(iip->ili_inode);
+}
+
+
+/*
+ * This is called to unpin the inode associated with the inode log
+ * item which was previously pinned with a call to xfs_inode_item_pin().
+ * Just call xfs_iunpin() on the inode to do this.
+ */
+STATIC void
+xfs_inode_item_unpin(
+	xfs_inode_log_item_t	*iip)
+{
+	xfs_iunpin(iip->ili_inode);
+}
+
+/* ARGSUSED */
+STATIC void
+xfs_inode_item_unpin_remove(
+	xfs_inode_log_item_t	*iip,
+	xfs_trans_t		*tp)
+{
+	xfs_iunpin(iip->ili_inode);
+}
+
+/*
+ * This is called to attempt to lock the inode associated with this
+ * inode log item, in preparation for the push routine which does the actual
+ * iflush.  Don't sleep on the inode lock or the flush lock.
+ *
+ * If the flush lock is already held, indicating that the inode has
+ * been or is in the process of being flushed, then (ideally) we'd like to
+ * see if the inode's buffer is still incore, and if so give it a nudge.
+ * We delay doing so until the pushbuf routine, though, to avoid holding
+ * the AIL lock across a call into the black hole that is the buffer cache.
+ * Also, we don't want to sleep in any device strategy routines, which can
+ * happen if we do the subsequent bawrite in here.
+ */
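+/*
+ * Summarizing the cases below: we return XFS_ITEM_PINNED if the inode
+ * is pinned in the log, XFS_ITEM_LOCKED if the inode lock cannot be
+ * taken without sleeping, XFS_ITEM_PUSHBUF or XFS_ITEM_FLUSHING if
+ * the flush lock is already held, and XFS_ITEM_SUCCESS otherwise.
+ */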
+STATIC uint
+xfs_inode_item_trylock(
+	xfs_inode_log_item_t	*iip)
+{
+	register xfs_inode_t	*ip;
+
+	ip = iip->ili_inode;
+
+	if (xfs_ipincount(ip) > 0) {
+		return XFS_ITEM_PINNED;
+	}
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+		return XFS_ITEM_LOCKED;
+	}
+
+	if (!xfs_iflock_nowait(ip)) {
+		/*
+		 * If someone else isn't already trying to push the inode
+		 * buffer, we get to do it.
+		 */
+		if (iip->ili_pushbuf_flag == 0) {
+			iip->ili_pushbuf_flag = 1;
+#ifdef DEBUG
+			iip->ili_push_owner = get_thread_id();
+#endif
+			/*
+			 * Inode is left locked in shared mode.
+			 * Pushbuf routine gets to unlock it.
+			 */
+			return XFS_ITEM_PUSHBUF;
+		} else {
+			/*
+			 * We hold the AIL_LOCK, so we must specify the
+			 * NONOTIFY flag so that we won't double trip.
+			 */
+			xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
+			return XFS_ITEM_FLUSHING;
+		}
+		/* NOTREACHED */
+	}
+#ifdef DEBUG
+	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		ASSERT(iip->ili_format.ilf_fields != 0);
+		ASSERT(iip->ili_logged == 0);
+		ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL);
+	}
+#endif
+	return XFS_ITEM_SUCCESS;
+}
+
+/*
+ * Unlock the inode associated with the inode log item.
+ * Clear the fields of the inode and inode log item that
+ * are specific to the current transaction.  If the
+ * hold flag is set, do not unlock the inode.
+ */
+STATIC void
+xfs_inode_item_unlock(
+	xfs_inode_log_item_t	*iip)
+{
+	uint		hold;
+	uint		iolocked;
+	uint		lock_flags;
+	xfs_inode_t	*ip;
+
+	ASSERT(iip != NULL);
+	ASSERT(iip->ili_inode->i_itemp != NULL);
+	ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE));
+	ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
+		  XFS_ILI_IOLOCKED_EXCL)) ||
+	       ismrlocked(&(iip->ili_inode->i_iolock), MR_UPDATE));
+	ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
+		  XFS_ILI_IOLOCKED_SHARED)) ||
+	       ismrlocked(&(iip->ili_inode->i_iolock), MR_ACCESS));
+	/*
+	 * Clear the transaction pointer in the inode.
+	 */
+	ip = iip->ili_inode;
+	ip->i_transp = NULL;
+
+	/*
+	 * If the inode needed a separate buffer with which to log
+	 * its extents, then free it now.
+	 */
+	/* FIXME */
+	if (iip->ili_extents_buf != NULL) {
+		ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
+		ASSERT(ip->i_d.di_nextents > 0);
+		ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
+		ASSERT(ip->i_df.if_bytes > 0);
+		kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes);
+		iip->ili_extents_buf = NULL;
+	}
+
+	/*
+	 * Figure out if we should unlock the inode or not.
+	 */
+	hold = iip->ili_flags & XFS_ILI_HOLD;
+
+	/*
+	 * Before clearing out the flags, remember whether we
+	 * are holding the inode's IO lock.
+	 */
+	iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;
+
+	/*
+	 * Clear out the fields of the inode log item particular
+	 * to the current transaction.
+	 */
+	iip->ili_ilock_recur = 0;
+	iip->ili_iolock_recur = 0;
+	iip->ili_flags = 0;
+
+	/*
+	 * Unlock the inode if XFS_ILI_HOLD was not set.
+	 */
+	if (!hold) {
+		lock_flags = XFS_ILOCK_EXCL;
+		if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
+			lock_flags |= XFS_IOLOCK_EXCL;
+		} else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
+			lock_flags |= XFS_IOLOCK_SHARED;
+		}
+		xfs_iput(iip->ili_inode, lock_flags);
+	}
+}
+
+/*
+ * This is called to find out where the oldest active copy of the
+ * inode log item in the on disk log resides now that the last log
+ * write of it completed at the given lsn.  Since we always re-log
+ * all dirty data in an inode, the latest copy in the on disk log
+ * is the only one that matters.  Therefore, simply return the
+ * given lsn.
+ */
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_inode_item_committed(
+	xfs_inode_log_item_t	*iip,
+	xfs_lsn_t		lsn)
+{
+	return (lsn);
+}
+
+/*
+ * The transaction with the inode locked has aborted.  The inode
+ * must not be dirty within the transaction (unless we're forcibly
+ * shutting down).  We simply unlock just as if the transaction
+ * had been cancelled.
+ */
+STATIC void
+xfs_inode_item_abort(
+	xfs_inode_log_item_t	*iip)
+{
+	xfs_inode_item_unlock(iip);
+	return;
+}
+
+
+/*
+ * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
+ * failed to get the inode flush lock but did get the inode locked SHARED.
+ * Here we're trying to see if the inode buffer is incore, and if so whether it's
+ * marked delayed write. If that's the case, we'll initiate a bawrite on that
+ * buffer to expedite the process.
+ *
+ * We aren't holding the AIL_LOCK (or the flush lock) when this gets called,
+ * so it is inherently race-y.
+ */
+STATIC void
+xfs_inode_item_pushbuf(
+	xfs_inode_log_item_t	*iip)
+{
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp;
+	xfs_buf_t	*bp;
+	uint		dopush;
+
+	ip = iip->ili_inode;
+
+	ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
+
+	/*
+	 * The ili_pushbuf_flag keeps others from
+	 * trying to duplicate our effort.
+	 */
+	ASSERT(iip->ili_pushbuf_flag != 0);
+	ASSERT(iip->ili_push_owner == get_thread_id());
+
+	/*
+	 * If flushlock isn't locked anymore, chances are that the
+	 * inode flush completed and the inode was taken off the AIL.
+	 * So, just get out.
+	 */
+	if ((valusema(&(ip->i_flock)) > 0)  ||
+	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
+		iip->ili_pushbuf_flag = 0;
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return;
+	}
+
+	mp = ip->i_mount;
+	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
+		    iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);
+
+	if (bp != NULL) {
+		if (XFS_BUF_ISDELAYWRITE(bp)) {
+			/*
+			 * We may be racing with iflush since we don't hold
+			 * the AIL_LOCK or the flush lock.  At this point,
+			 * however, we have the buffer and we know that it's
+			 * dirty.  It's still possible that iflush beat us
+			 * and this item has already been taken off the AIL.
+			 * If not, we can flush it asynchronously.
+			 */
+			dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
+				  (valusema(&(ip->i_flock)) <= 0));
+			iip->ili_pushbuf_flag = 0;
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			xfs_buftrace("INODE ITEM PUSH", bp);
+			if (XFS_BUF_ISPINNED(bp)) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE);
+			}
+			if (dopush) {
+				xfs_bawrite(mp, bp);
+			} else {
+				xfs_buf_relse(bp);
+			}
+		} else {
+			iip->ili_pushbuf_flag = 0;
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			xfs_buf_relse(bp);
+		}
+		return;
+	}
+	/*
+	 * We have to be careful about resetting the pushbuf flag too early
+	 * (above).  Even though in theory we could do it as soon as we have
+	 * the buflock, we don't want others to be doing work needlessly.
+	 * They'll come to this function thinking that pushing the buffer is
+	 * their responsibility only to find that the buffer is still locked
+	 * by another thread doing the same thing.
+	 */
+	iip->ili_pushbuf_flag = 0;
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return;
+}
+
+
+/*
+ * This is called to asynchronously write the inode associated with this
+ * inode log item out to disk. The inode will already have been locked by
+ * a successful call to xfs_inode_item_trylock().
+ */
+STATIC void
+xfs_inode_item_push(
+	xfs_inode_log_item_t	*iip)
+{
+	xfs_inode_t	*ip;
+
+	ip = iip->ili_inode;
+
+	ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
+	ASSERT(valusema(&(ip->i_flock)) <= 0);
+	/*
+	 * Since we were able to lock the inode's flush lock and
+	 * we found it on the AIL, the inode must be dirty.  This
+	 * is because the inode is removed from the AIL while still
+	 * holding the flush lock in xfs_iflush_done().	 Thus, if
+	 * we found it in the AIL and were able to obtain the flush
+	 * lock without sleeping, then there must not have been
+	 * anyone in the process of flushing the inode.
+	 */
+	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
+	       iip->ili_format.ilf_fields != 0);
+
+	/*
+	 * Write out the inode.  The completion routine (xfs_iflush_done) will
+	 * pull it from the AIL, mark it clean, and unlock the flush lock.
+	 */
+	(void) xfs_iflush(ip, XFS_IFLUSH_DELWRI);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	return;
+}
+
+/*
+ * XXX rcc - this one really has to do something.  Probably needs
+ * to stamp in a new field in the incore inode.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_inode_item_committing(
+	xfs_inode_log_item_t	*iip,
+	xfs_lsn_t		lsn)
+{
+	iip->ili_last_lsn = lsn;
+	return;
+}
+
+/*
+ * This is the ops vector shared by all inode log items.
+ */
+struct xfs_item_ops xfs_inode_item_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_inode_item_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_inode_item_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
+					xfs_inode_item_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_inode_item_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_inode_item_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_inode_item_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_inode_item_abort,
+	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_inode_item_committing
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ */
+void
+xfs_inode_item_init(
+	xfs_inode_t	*ip,
+	xfs_mount_t	*mp)
+{
+	xfs_inode_log_item_t	*iip;
+
+	ASSERT(ip->i_itemp == NULL);
+	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
+
+	iip->ili_item.li_type = XFS_LI_INODE;
+	iip->ili_item.li_ops = &xfs_inode_item_ops;
+	iip->ili_item.li_mountp = mp;
+	iip->ili_inode = ip;
+
+	/*
+	 * The item was allocated from zeroed memory, so there is no
+	 * need to explicitly initialize these fields:
+	 *	iip->ili_extents_buf = NULL;
+	 *	iip->ili_pushbuf_flag = 0;
+	 */
+
+	iip->ili_format.ilf_type = XFS_LI_INODE;
+	iip->ili_format.ilf_ino = ip->i_ino;
+	iip->ili_format.ilf_blkno = ip->i_blkno;
+	iip->ili_format.ilf_len = ip->i_len;
+	iip->ili_format.ilf_boffset = ip->i_boffset;
+}
+
+/*
+ * Free the inode log item and any memory hanging off of it.
+ */
+void
+xfs_inode_item_destroy(
+	xfs_inode_t	*ip)
+{
+#ifdef XFS_TRANS_DEBUG
+	if (ip->i_itemp->ili_root_size != 0) {
+		kmem_free(ip->i_itemp->ili_orig_root,
+			  ip->i_itemp->ili_root_size);
+	}
+#endif
+	kmem_zone_free(xfs_ili_zone, ip->i_itemp);
+}
+
+
+/*
+ * This is the inode flushing I/O completion routine.  It is called
+ * from interrupt level when the buffer containing the inode is
+ * flushed to disk.  It is responsible for removing the inode item
+ * from the AIL if it has not been re-logged, and unlocking the inode's
+ * flush lock.
+ */
+/*ARGSUSED*/
+void
+xfs_iflush_done(
+	xfs_buf_t		*bp,
+	xfs_inode_log_item_t	*iip)
+{
+	xfs_inode_t	*ip;
+	SPLDECL(s);
+
+	ip = iip->ili_inode;
+
+	/*
+	 * We only want to pull the item from the AIL if it is
+	 * actually there and its location in the log has not
+	 * changed since we started the flush.	Thus, we only bother
+	 * if the ili_logged flag is set and the inode's lsn has not
+	 * changed.  First we check the lsn outside
+	 * the lock since it's cheaper, and then we recheck while
+	 * holding the lock before removing the inode from the AIL.
+	 */
+	if (iip->ili_logged &&
+	    (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
+		AIL_LOCK(ip->i_mount, s);
+		if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
+			/*
+			 * xfs_trans_delete_ail() drops the AIL lock.
+			 */
+			xfs_trans_delete_ail(ip->i_mount,
+					     (xfs_log_item_t*)iip, s);
+		} else {
+			AIL_UNLOCK(ip->i_mount, s);
+		}
+	}
+
+	iip->ili_logged = 0;
+
+	/*
+	 * Clear the ili_last_fields bits now that we know that the
+	 * data corresponding to them is safely on disk.
+	 */
+	iip->ili_last_fields = 0;
+
+	/*
+	 * Release the inode's flush lock since we're done with it.
+	 */
+	xfs_ifunlock(ip);
+
+	return;
+}
+
+/*
+ * This is the inode flushing abort routine.  It is called from
+ * xfs_iflush() when the filesystem is shutting down to clean up the
+ * inode state.  It is responsible for removing the inode item from
+ * the AIL if it has not been re-logged, and for unlocking the inode's
+ * flush lock.
+ */
+void
+xfs_iflush_abort(
+	xfs_inode_t		*ip)
+{
+	xfs_inode_log_item_t	*iip;
+	xfs_mount_t		*mp;
+	SPLDECL(s);
+
+	iip = ip->i_itemp;
+	mp = ip->i_mount;
+	if (iip) {
+		if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
+			AIL_LOCK(mp, s);
+			if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
+				/*
+				 * xfs_trans_delete_ail() drops the AIL lock.
+				 */
+				xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip,
+					s);
+			} else
+				AIL_UNLOCK(mp, s);
+		}
+		iip->ili_logged = 0;
+		/*
+		 * Clear the ili_last_fields bits now that we know that the
+		 * data corresponding to them is safely on disk.
+		 */
+		iip->ili_last_fields = 0;
+		/*
+		 * Clear the inode logging fields so no more flushes are
+		 * attempted.
+		 */
+		iip->ili_format.ilf_fields = 0;
+	}
+	/*
+	 * Release the inode's flush lock since we're done with it.
+	 */
+	xfs_ifunlock(ip);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_inode_item.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_inode_item.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_inode_item.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_inode_item.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_INODE_ITEM_H__
+#define __XFS_INODE_ITEM_H__
+
+/*
+ * This is the structure used to lay out an inode log item in the
+ * log.	 The size of the inline data/extents/b-tree root to be logged
+ * (if any) is indicated in the ilf_dsize field.  Changes to this structure
+ * must be added on to the end.
+ *
+ * Convention for naming inode log item versions :  The current version
+ * is always named XFS_LI_INODE.  When an inode log item gets superseded,
+ * add the latest version of IRIX that will generate logs with that item
+ * to the version name.
+ *
+ * -Version 1 of this structure (XFS_LI_5_3_INODE) included up to the first
+ *	union (ilf_u) field.  This was released with IRIX 5.3-XFS.
+ * -Version 2 of this structure (XFS_LI_6_1_INODE) is currently the entire
+ *	structure.  This was released with IRIX 6.0.1-XFS and IRIX 6.1.
+ * -Version 3 of this structure (XFS_LI_INODE) is the same as version 2
+ *	so a new structure definition wasn't necessary.	 However, we had
+ *	to add a new type because the inode cluster size changed from 4K
+ *	to 8K and the version number had to be revved to keep older kernels
+ *	from trying to recover logs with the 8K buffers in them.  The logging
+ *	code can handle recovery on different-sized clusters now so hopefully
+ *	this'll be the last time we need to change the inode log item just
+ *	for a change in the inode cluster size.	 This new version was
+ *	released with IRIX 6.2.
+ */
+typedef struct xfs_inode_log_format {
+	unsigned short		ilf_type;	/* inode log item type */
+	unsigned short		ilf_size;	/* size of this item */
+	uint			ilf_fields;	/* flags for fields logged */
+	ushort			ilf_asize;	/* size of attr d/ext/root */
+	ushort			ilf_dsize;	/* size of data/ext/root */
+	xfs_ino_t		ilf_ino;	/* inode number */
+	union {
+		xfs_dev_t	ilfu_rdev;	/* rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* mount point value */
+	} ilf_u;
+	__int64_t		ilf_blkno;	/* blkno of inode buffer */
+	int			ilf_len;	/* len of inode buffer */
+	int			ilf_boffset;	/* off of inode in buffer */
+} xfs_inode_log_format_t;
+
+/* Initial version shipped with IRIX 5.3-XFS */
+typedef struct xfs_inode_log_format_v1 {
+	unsigned short		ilf_type;	/* inode log item type */
+	unsigned short		ilf_size;	/* size of this item */
+	uint			ilf_fields;	/* flags for fields logged */
+	uint			ilf_dsize;	/* size of data/ext/root */
+	xfs_ino_t		ilf_ino;	/* inode number */
+	union {
+		xfs_dev_t	ilfu_rdev;	/* rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* mount point value */
+	} ilf_u;
+} xfs_inode_log_format_t_v1;
+
+/*
+ * Flags for xfs_trans_log_inode flags field.
+ */
+#define XFS_ILOG_CORE	0x001	/* log standard inode fields */
+#define XFS_ILOG_DDATA	0x002	/* log i_df.if_data */
+#define XFS_ILOG_DEXT	0x004	/* log i_df.if_extents */
+#define XFS_ILOG_DBROOT 0x008	/* log i_df.i_broot */
+#define XFS_ILOG_DEV	0x010	/* log the dev field */
+#define XFS_ILOG_UUID	0x020	/* log the uuid field */
+#define XFS_ILOG_ADATA	0x040	/* log i_af.if_data */
+#define XFS_ILOG_AEXT	0x080	/* log i_af.if_extents */
+#define XFS_ILOG_ABROOT 0x100	/* log i_af.i_broot */
+
+#define XFS_ILOG_NONCORE	(XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+				 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+				 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
+				 XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
+
+#define XFS_ILOG_DFORK		(XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+				 XFS_ILOG_DBROOT)
+
+#define XFS_ILOG_AFORK		(XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+				 XFS_ILOG_ABROOT)
+
+#define XFS_ILOG_ALL		(XFS_ILOG_CORE | XFS_ILOG_DDATA | \
+				 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
+				 XFS_ILOG_DEV | XFS_ILOG_UUID | \
+				 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+				 XFS_ILOG_ABROOT)
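+
+/*
+ * Illustrative use (not defined in this file): a transaction that
+ * modifies both the inode core and its inline data would mark the
+ * item dirty with something like
+ *
+ *	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ *
+ * and these ilf_fields bits are later turned into log iovecs by
+ * xfs_inode_item_size() and xfs_inode_item_format().
+ */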
+
+#define XFS_ILI_HOLD		0x1
+#define XFS_ILI_IOLOCKED_EXCL	0x2
+#define XFS_ILI_IOLOCKED_SHARED 0x4
+
+#define XFS_ILI_IOLOCKED_ANY   (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
+
+
+#ifdef __KERNEL__
+
+struct xfs_buf;
+struct xfs_bmbt_rec_32;
+struct xfs_inode;
+struct xfs_mount;
+
+
+typedef struct xfs_inode_log_item {
+	xfs_log_item_t		ili_item;	   /* common portion */
+	struct xfs_inode	*ili_inode;	   /* inode ptr */
+	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
+	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
+	unsigned short		ili_ilock_recur;   /* lock recursion count */
+	unsigned short		ili_iolock_recur;  /* lock recursion count */
+	unsigned short		ili_flags;	   /* misc flags */
+	unsigned short		ili_logged;	   /* flushed logged data */
+	unsigned int		ili_last_fields;   /* fields when flushed */
+	struct xfs_bmbt_rec_32	*ili_extents_buf;  /* array of logged exts */
+	unsigned int		ili_pushbuf_flag;  /* one bit used in push_ail */
+
+#ifdef DEBUG
+	uint64_t		ili_push_owner;	   /* one who sets pushbuf_flag
+						      above gets to push the buf */
+#endif
+#ifdef XFS_TRANS_DEBUG
+	int			ili_root_size;
+	char			*ili_orig_root;
+#endif
+	xfs_inode_log_format_t	ili_format;	   /* logged structure */
+} xfs_inode_log_item_t;
+
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FDATA)
+int xfs_ilog_fdata(int w);
+#define XFS_ILOG_FDATA(w)	xfs_ilog_fdata(w)
+#else
+#define XFS_ILOG_FDATA(w)	\
+	((w) == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA)
+#endif
+
+#endif	/* __KERNEL__ */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FBROOT)
+int xfs_ilog_fbroot(int w);
+#define XFS_ILOG_FBROOT(w)	xfs_ilog_fbroot(w)
+#else
+#define XFS_ILOG_FBROOT(w)	\
+	((w) == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FEXT)
+int xfs_ilog_fext(int w);
+#define XFS_ILOG_FEXT(w)	xfs_ilog_fext(w)
+#else
+#define XFS_ILOG_FEXT(w)	\
+	((w) == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT)
+#endif
+
+#ifdef __KERNEL__
+
+void	xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
+void	xfs_inode_item_destroy(struct xfs_inode *);
+void	xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
+void	xfs_iflush_abort(struct xfs_inode *);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_INODE_ITEM_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_inum.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_inum.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_inum.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_inum.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_INUM_H__
+#define __XFS_INUM_H__
+
+/*
+ * Inode number format:
+ * low inopblog bits - offset in block
+ * next agblklog bits - block number in ag
+ * next agno_log bits - ag number
+ * high agno_log-agblklog-inopblog bits - 0
+ */
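+
+/*
+ * Worked example (hypothetical geometry): with sb_inopblog == 4
+ * (16 inodes per block) and sb_agblklog == 16, an inode number has
+ * 20 ag-relative bits, so ino 0x2345A7 decodes as:
+ *	agno	= 0x2345A7 >> 20	   = 2
+ *	agino	= 0x2345A7 & 0xFFFFF	   = 0x345A7
+ *	agbno	= (0x2345A7 >> 4) & 0xFFFF = 0x345A
+ *	offset	= 0x2345A7 & 0xF	   = 7
+ * This is exactly what the XFS_INO_TO_* macros below compute.
+ */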
+
+typedef __uint32_t	xfs_agino_t;	/* within allocation grp inode number */
+
+/*
+ * Useful inode bits for this kernel.
+ * Used in some places where having 64 bits in a 32-bit kernel
+ * costs too much.
+ */
+#if XFS_BIG_FILESYSTEMS
+typedef xfs_ino_t	xfs_intino_t;
+#else
+typedef __uint32_t	xfs_intino_t;
+#endif
+
+#define NULLFSINO	((xfs_ino_t)-1)
+#define NULLAGINO	((xfs_agino_t)-1)
+
+struct xfs_mount;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_MASK)
+__uint32_t xfs_ino_mask(int k);
+#define XFS_INO_MASK(k)			xfs_ino_mask(k)
+#else
+#define XFS_INO_MASK(k) ((__uint32_t)((1ULL << (k)) - 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_OFFSET_BITS)
+int xfs_ino_offset_bits(struct xfs_mount *mp);
+#define XFS_INO_OFFSET_BITS(mp)		xfs_ino_offset_bits(mp)
+#else
+#define XFS_INO_OFFSET_BITS(mp) ((mp)->m_sb.sb_inopblog)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGBNO_BITS)
+int xfs_ino_agbno_bits(struct xfs_mount *mp);
+#define XFS_INO_AGBNO_BITS(mp)		xfs_ino_agbno_bits(mp)
+#else
+#define XFS_INO_AGBNO_BITS(mp)	((mp)->m_sb.sb_agblklog)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGINO_BITS)
+int xfs_ino_agino_bits(struct xfs_mount *mp);
+#define XFS_INO_AGINO_BITS(mp)		xfs_ino_agino_bits(mp)
+#else
+#define XFS_INO_AGINO_BITS(mp)		((mp)->m_agino_log)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGNO_BITS)
+int xfs_ino_agno_bits(struct xfs_mount *mp);
+#define XFS_INO_AGNO_BITS(mp)		xfs_ino_agno_bits(mp)
+#else
+#define XFS_INO_AGNO_BITS(mp)	((mp)->m_agno_log)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_BITS)
+int xfs_ino_bits(struct xfs_mount *mp);
+#define XFS_INO_BITS(mp)		xfs_ino_bits(mp)
+#else
+#define XFS_INO_BITS(mp)	(XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGNO)
+xfs_agnumber_t xfs_ino_to_agno(struct xfs_mount *mp, xfs_ino_t i);
+#define XFS_INO_TO_AGNO(mp,i)		xfs_ino_to_agno(mp,i)
+#else
+#define XFS_INO_TO_AGNO(mp,i)	\
+	((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGINO)
+xfs_agino_t xfs_ino_to_agino(struct xfs_mount *mp, xfs_ino_t i);
+#define XFS_INO_TO_AGINO(mp,i)		xfs_ino_to_agino(mp,i)
+#else
+#define XFS_INO_TO_AGINO(mp,i)	\
+	((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGBNO)
+xfs_agblock_t xfs_ino_to_agbno(struct xfs_mount *mp, xfs_ino_t i);
+#define XFS_INO_TO_AGBNO(mp,i)		xfs_ino_to_agbno(mp,i)
+#else
+#define XFS_INO_TO_AGBNO(mp,i)	\
+	(((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
+	 XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_OFFSET)
+int xfs_ino_to_offset(struct xfs_mount *mp, xfs_ino_t i);
+#define XFS_INO_TO_OFFSET(mp,i)		xfs_ino_to_offset(mp,i)
+#else
+#define XFS_INO_TO_OFFSET(mp,i) \
+	((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_FSB)
+xfs_fsblock_t xfs_ino_to_fsb(struct xfs_mount *mp, xfs_ino_t i);
+#define XFS_INO_TO_FSB(mp,i)		xfs_ino_to_fsb(mp,i)
+#else
+#define XFS_INO_TO_FSB(mp,i)	\
+	XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_INO)
+xfs_ino_t
+xfs_agino_to_ino(struct xfs_mount *mp, xfs_agnumber_t a, xfs_agino_t i);
+#define XFS_AGINO_TO_INO(mp,a,i)	xfs_agino_to_ino(mp,a,i)
+#else
+#define XFS_AGINO_TO_INO(mp,a,i)	\
+	(((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_AGBNO)
+xfs_agblock_t xfs_agino_to_agbno(struct xfs_mount *mp, xfs_agino_t i);
+#define XFS_AGINO_TO_AGBNO(mp,i)	xfs_agino_to_agbno(mp,i)
+#else
+#define XFS_AGINO_TO_AGBNO(mp,i)	((i) >> XFS_INO_OFFSET_BITS(mp))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_OFFSET)
+int xfs_agino_to_offset(struct xfs_mount *mp, xfs_agino_t i);
+#define XFS_AGINO_TO_OFFSET(mp,i)	xfs_agino_to_offset(mp,i)
+#else
+#define XFS_AGINO_TO_OFFSET(mp,i)	\
+	((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_OFFBNO_TO_AGINO)
+xfs_agino_t xfs_offbno_to_agino(struct xfs_mount *mp, xfs_agblock_t b, int o);
+#define XFS_OFFBNO_TO_AGINO(mp,b,o)	xfs_offbno_to_agino(mp,b,o)
+#else
+#define XFS_OFFBNO_TO_AGINO(mp,b,o)	\
+	((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+#endif
+
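+/*
+ * Example (illustrative only, not part of the original code): the macros
+ * above split an absolute inode number into its fields.  From low bits to
+ * high: the offset within a block (XFS_INO_OFFSET_BITS, sb_inopblog bits),
+ * the block within the AG (XFS_INO_AGBNO_BITS, sb_agblklog bits), and the
+ * AG number (XFS_INO_AGNO_BITS).  Hence the round trip:
+ *
+ *	xfs_ino_t ino = XFS_AGINO_TO_INO(mp, agno, agino);
+ *	ASSERT(XFS_INO_TO_AGNO(mp, ino) == agno);
+ *	ASSERT(XFS_INO_TO_AGINO(mp, ino) == agino);
+ *	ASSERT(XFS_OFFBNO_TO_AGINO(mp, XFS_AGINO_TO_AGBNO(mp, agino),
+ *				   XFS_AGINO_TO_OFFSET(mp, agino)) == agino);
+ */
+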
+#if XFS_BIG_FILESYSTEMS
+#define XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 56) - 1ULL))
+#define XFS_INO64_OFFSET	((xfs_ino_t)(1ULL << 32))
+#else
+#define XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 32) - 1ULL))
+#endif
+#define XFS_MAXINUMBER_32	((xfs_ino_t)((1ULL << 32) - 1ULL))
+
+#endif	/* __XFS_INUM_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_iocore.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_iocore.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_iocore.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_iocore.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+static xfs_fsize_t
+xfs_size_fn(
+	xfs_inode_t	*ip)
+{
+	return (ip->i_d.di_size);
+}
+
+xfs_ioops_t	xfs_iocore_xfs = {
+	.xfs_bmapi_func		= (xfs_bmapi_t) xfs_bmapi,
+	.xfs_bmap_eof_func	= (xfs_bmap_eof_t) xfs_bmap_eof,
+	.xfs_ilock		= (xfs_lock_t) xfs_ilock,
+	.xfs_ilock_demote	= (xfs_lock_demote_t) xfs_ilock_demote,
+	.xfs_ilock_nowait	= (xfs_lock_nowait_t) xfs_ilock_nowait,
+	.xfs_unlock		= (xfs_unlk_t) xfs_iunlock,
+	.xfs_size_func		= (xfs_size_t) xfs_size_fn,
+	.xfs_lastbyte		= (xfs_lastbyte_t) xfs_file_last_byte,
+};
+
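+/*
+ * Reset the iocore flags (XFS, plus realtime if applicable) and the DMAPI
+ * event mask and state from the inode's on-disk fields.
+ */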
+void
+xfs_iocore_inode_reinit(
+	xfs_inode_t	*ip)
+{
+	xfs_iocore_t	*io = &ip->i_iocore;
+
+	io->io_flags = XFS_IOCORE_ISXFS;
+	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+		io->io_flags |= XFS_IOCORE_RT;
+	}
+
+	io->io_dmevmask = ip->i_d.di_dmevmask;
+	io->io_dmstate = ip->i_d.di_dmstate;
+}
+
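+/*
+ * Initialize the iocore structure embedded in an in-core inode: record the
+ * mount and a back pointer to the inode, then seed the remaining fields
+ * via xfs_iocore_inode_reinit().
+ */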
+void
+xfs_iocore_inode_init(
+	xfs_inode_t	*ip)
+{
+	xfs_iocore_t	*io = &ip->i_iocore;
+	xfs_mount_t	*mp = ip->i_mount;
+
+	io->io_mount = mp;
+#ifdef DEBUG
+	io->io_lock = &ip->i_lock;
+	io->io_iolock = &ip->i_iolock;
+#endif
+
+	io->io_obj = (void *)ip;
+
+	xfs_iocore_inode_reinit(ip);
+}
+
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_itable.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_itable.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_itable.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_itable.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,770 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+/*
+ * Return stat information for one inode.
+ * Return 0 if ok, else errno.
+ */
+int					/* error status */
+xfs_bulkstat_one(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	ino,		/* inode number to get data for */
+	void		*buffer,	/* buffer to place output in */
+	xfs_daddr_t	bno,		/* starting bno of inode cluster */
+	void		*dibuff,	/* on-disk inode buffer */
+	int		*stat)		/* BULKSTAT_RV_... */
+{
+	xfs_bstat_t	*buf;		/* return buffer */
+	int		error;		/* error value */
+	xfs_dinode_t	*dip;		/* dinode inode pointer */
+	xfs_dinode_core_t *dic;		/* dinode core info pointer */
+	xfs_inode_t	*ip = NULL;	/* incore inode pointer */
+	xfs_arch_t	arch;		/* in-core or on-disk byte order */
+	__uint16_t	di_flags;	/* temporary copy of di_flags */
+
+	buf = (xfs_bstat_t *)buffer;
+	dip = (xfs_dinode_t *)dibuff;
+
+	if (! buf || ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
+	    (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
+	     (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))) {
+		*stat = BULKSTAT_RV_NOTHING;
+		return XFS_ERROR(EINVAL);
+	}
+
+	if (dip == NULL) {
+		/* We're not being passed a pointer to a dinode.  This happens
+		 * if BULKSTAT_FG_IGET is selected.  Do the iget.
+		 */
+		error = xfs_iget(mp, tp, ino, XFS_ILOCK_SHARED, &ip, bno);
+		if (error) {
+			*stat = BULKSTAT_RV_NOTHING;
+			return error;
+		}
+		ASSERT(ip != NULL);
+		ASSERT(ip->i_blkno != (xfs_daddr_t)0);
+		if (ip->i_d.di_mode == 0) {
+			xfs_iput_new(ip, XFS_ILOCK_SHARED);
+			*stat = BULKSTAT_RV_NOTHING;
+			return XFS_ERROR(ENOENT);
+		}
+		dic = &ip->i_d;
+		arch = ARCH_NOCONVERT;		/* in-core! */
+		ASSERT(dic != NULL);
+
+		/* xfs_iget returns the following without needing
+		 * further change.
+		 */
+		buf->bs_nlink = dic->di_nlink;
+		buf->bs_projid = dic->di_projid;
+
+	} else {
+		dic = &dip->di_core;
+		ASSERT(dic != NULL);
+
+		/* buffer dinode_core is in on-disk arch */
+		arch = ARCH_CONVERT;
+
+		/*
+		 * The inode format changed when we moved the link count and
+		 * made it 32 bits long.  If this is an old format inode,
+		 * convert it in memory to look like a new one.	 If it gets
+		 * flushed to disk we will convert back before flushing or
+		 * logging it.	We zero out the new projid field and the old link
+		 * count field.	 We'll handle clearing the pad field (the remains
+		 * of the old uuid field) when we actually convert the inode to
+		 * the new format. We don't change the version number so that we
+		 * can distinguish this from a real new format inode.
+		 */
+		if (INT_GET(dic->di_version, arch) == XFS_DINODE_VERSION_1) {
+			buf->bs_nlink = INT_GET(dic->di_onlink, arch);
+			buf->bs_projid = 0;
+		} else {
+			buf->bs_nlink = INT_GET(dic->di_nlink, arch);
+			buf->bs_projid = INT_GET(dic->di_projid, arch);
+		}
+
+	}
+
+	buf->bs_ino = ino;
+	buf->bs_mode = INT_GET(dic->di_mode, arch);
+	buf->bs_uid = INT_GET(dic->di_uid, arch);
+	buf->bs_gid = INT_GET(dic->di_gid, arch);
+	buf->bs_size = INT_GET(dic->di_size, arch);
+	buf->bs_atime.tv_sec = INT_GET(dic->di_atime.t_sec, arch);
+	buf->bs_atime.tv_nsec = INT_GET(dic->di_atime.t_nsec, arch);
+	buf->bs_mtime.tv_sec = INT_GET(dic->di_mtime.t_sec, arch);
+	buf->bs_mtime.tv_nsec = INT_GET(dic->di_mtime.t_nsec, arch);
+	buf->bs_ctime.tv_sec = INT_GET(dic->di_ctime.t_sec, arch);
+	buf->bs_ctime.tv_nsec = INT_GET(dic->di_ctime.t_nsec, arch);
+	/*
+	 * convert di_flags to bs_xflags.
+	 */
+	di_flags = INT_GET(dic->di_flags, arch);
+
+	buf->bs_xflags =
+		((di_flags & XFS_DIFLAG_REALTIME) ?
+			XFS_XFLAG_REALTIME : 0) |
+		((di_flags & XFS_DIFLAG_PREALLOC) ?
+			XFS_XFLAG_PREALLOC : 0) |
+		(XFS_CFORK_Q_ARCH(dic, arch) ?
+			XFS_XFLAG_HASATTR : 0);
+
+	buf->bs_extsize = INT_GET(dic->di_extsize, arch) << mp->m_sb.sb_blocklog;
+	buf->bs_extents = INT_GET(dic->di_nextents, arch);
+	buf->bs_gen = INT_GET(dic->di_gen, arch);
+	bzero(buf->bs_pad, sizeof(buf->bs_pad));
+	buf->bs_dmevmask = INT_GET(dic->di_dmevmask, arch);
+	buf->bs_dmstate = INT_GET(dic->di_dmstate, arch);
+	buf->bs_aextents = INT_GET(dic->di_anextents, arch);
+
+	switch (INT_GET(dic->di_format, arch)) {
+	case XFS_DINODE_FMT_DEV:
+		if ( ip ) {
+			buf->bs_rdev = ip->i_df.if_u2.if_rdev;
+		} else {
+			buf->bs_rdev = INT_GET(dip->di_u.di_dev, arch);
+		}
+
+		buf->bs_blksize = BLKDEV_IOSIZE;
+		buf->bs_blocks = 0;
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_UUID:
+		buf->bs_rdev = 0;
+		buf->bs_blksize = mp->m_sb.sb_blocksize;
+		buf->bs_blocks = 0;
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		buf->bs_rdev = 0;
+		buf->bs_blksize = mp->m_sb.sb_blocksize;
+		if ( ip ) {
+			buf->bs_blocks = INT_GET(dic->di_nblocks, arch) + ip->i_delayed_blks;
+		} else {
+			buf->bs_blocks = INT_GET(dic->di_nblocks, arch);
+		}
+		break;
+	}
+
+	if (ip) {
+		xfs_iput(ip, XFS_ILOCK_SHARED);
+	}
+
+	*stat = BULKSTAT_RV_DIDONE;
+	return 0;
+}
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ */
+int					/* error status */
+xfs_bulkstat(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_ino_t		*lastinop, /* last inode returned */
+	int			*ubcountp, /* size of buffer/count returned */
+	bulkstat_one_pf		formatter, /* func that'd fill a single buf */
+	size_t			statstruct_size, /* sizeof struct filling */
+	xfs_caddr_t		ubuffer, /* buffer with inode stats */
+	int			flags,	/* defined in xfs_itable.h */
+	int			*done)	/* set to 1 when no more stats to get */
+{
+	xfs_agblock_t		agbno = 0; /* allocation group block number */
+	xfs_buf_t		*agbp;	/* agi header buffer */
+	xfs_agi_t		*agi;	/* agi header data */
+	xfs_agino_t		agino;	/* inode # in allocation group */
+	xfs_agnumber_t		agno;	/* allocation group number */
+	xfs_daddr_t		bno;	/* inode cluster start daddr */
+	int			chunkidx; /* current index into inode chunk */
+	int			clustidx; /* current index into inode cluster */
+	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
+	int			end_of_ag; /* set if we've seen the ag end */
+	int			error;	/* error code */
+	int			fmterror;/* bulkstat formatter result */
+	__int32_t		gcnt;	/* current btree rec's count */
+	xfs_inofree_t		gfree;	/* current btree rec's free mask */
+	xfs_agino_t		gino;	/* current btree rec's start inode */
+	int			i;	/* loop index */
+	int			icount; /* count of inodes good in irbuf */
+	xfs_ino_t		ino;	/* inode number (filesystem) */
+	xfs_inobt_rec_t		*irbp;	/* current irec buffer pointer */
+	xfs_inobt_rec_t		*irbuf; /* start of irec buffer */
+	xfs_inobt_rec_t		*irbufend; /* end of good irec buffer entries */
+	xfs_ino_t		lastino = 0; /* last inode number returned */
+	int			nbcluster; /* # of blocks in a cluster */
+	int			nicluster; /* # of inodes in a cluster */
+	int			nimask; /* mask for inode clusters */
+	int			nirbuf; /* size of irbuf */
+	int			rval;	/* return value error code */
+	int			tmp;	/* result value from btree calls */
+	int			ubcount; /* size of user's buffer */
+	int			ubleft; /* entries left in user's buffer */
+	xfs_caddr_t		ubufp;	/* current pointer into user's buffer */
+	xfs_buf_t		*bp;	/* ptr to on-disk inode cluster buf */
+	xfs_dinode_t		*dip;	/* ptr into bp for specific inode */
+	xfs_inode_t		*ip;	/* ptr to in-core inode struct */
+
+	/*
+	 * Get the last inode value, see if there's nothing to do.
+	 */
+	ino = (xfs_ino_t)*lastinop;
+	dip = NULL;
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agino = XFS_INO_TO_AGINO(mp, ino);
+	if (agno >= mp->m_sb.sb_agcount ||
+	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+		*done = 1;
+		*ubcountp = 0;
+		return 0;
+	}
+	ubcount = ubleft = *ubcountp;
+	*ubcountp = 0;
+	*done = 0;
+	fmterror = 0;
+	ubufp = ubuffer;
+	nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ?
+		mp->m_sb.sb_inopblock :
+		(XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
+	nimask = ~(nicluster - 1);
+	nbcluster = nicluster >> mp->m_sb.sb_inopblog;
+	/*
+	 * Lock down the user's buffer.  If a buffer was not sent, as in the
+	 * case where the disk quota code calls here, we skip this.
+	 */
+#if defined(HAVE_USERACC)
+	if (ubuffer &&
+	    (error = useracc(ubuffer, ubcount * statstruct_size,
+			(B_READ|B_PHYS), NULL))) {
+		return error;
+	}
+#endif
+	/*
+	 * Allocate a page-sized buffer for inode btree records.
+	 * We could try allocating something smaller, but for normal
+	 * calls we'll always (potentially) need the whole page.
+	 */
+	irbuf = kmem_alloc(NBPC, KM_SLEEP);
+	nirbuf = NBPC / sizeof(*irbuf);
+	/*
+	 * Loop over the allocation groups, starting from the last
+	 * inode returned; 0 means start of the allocation group.
+	 */
+	rval = 0;
+	while (ubleft > 0 && agno < mp->m_sb.sb_agcount) {
+		bp = NULL;
+		down_read(&mp->m_peraglock);
+		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+		up_read(&mp->m_peraglock);
+		if (error) {
+			/*
+			 * Skip this allocation group and go to the next one.
+			 */
+			agno++;
+			agino = 0;
+			continue;
+		}
+		agi = XFS_BUF_TO_AGI(agbp);
+		/*
+		 * Allocate and initialize a btree cursor for ialloc btree.
+		 */
+		cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
+			(xfs_inode_t *)0, 0);
+		irbp = irbuf;
+		irbufend = irbuf + nirbuf;
+		end_of_ag = 0;
+		/*
+		 * If we're returning in the middle of an allocation group,
+		 * we need to get the remainder of the chunk we're in.
+		 */
+		if (agino > 0) {
+			/*
+			 * Lookup the inode chunk that this inode lives in.
+			 */
+			error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp);
+			if (!error &&	/* no I/O error */
+			    tmp &&	/* lookup succeeded */
+					/* got the record, should always work */
+			    !(error = xfs_inobt_get_rec(cur, &gino, &gcnt,
+				    &gfree, &i, ARCH_NOCONVERT)) &&
+			    i == 1 &&
+					/* this is the right chunk */
+			    agino < gino + XFS_INODES_PER_CHUNK &&
+					/* lastino was not last in chunk */
+			    (chunkidx = agino - gino + 1) <
+				    XFS_INODES_PER_CHUNK &&
+					/* there are some left allocated */
+			    XFS_INOBT_MASKN(chunkidx,
+				    XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
+				/*
+				 * Grab the chunk record.  Mark all the
+				 * uninteresting inodes (because they're
+				 * before our start point) free.
+				 */
+				for (i = 0; i < chunkidx; i++) {
+					if (XFS_INOBT_MASK(i) & ~gfree)
+						gcnt++;
+				}
+				gfree |= XFS_INOBT_MASKN(0, chunkidx);
+				INT_SET(irbp->ir_startino, ARCH_CONVERT, gino);
+				INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt);
+				INT_SET(irbp->ir_free, ARCH_CONVERT, gfree);
+				irbp++;
+				agino = gino + XFS_INODES_PER_CHUNK;
+				icount = XFS_INODES_PER_CHUNK - gcnt;
+			} else {
+				/*
+				 * If any of those tests failed, bump the
+				 * inode number (just in case).
+				 */
+				agino++;
+				icount = 0;
+			}
+			/*
+			 * In any case, increment to the next record.
+			 */
+			if (!error)
+				error = xfs_inobt_increment(cur, 0, &tmp);
+		} else {
+			/*
+			 * Start of ag.	 Lookup the first inode chunk.
+			 */
+			error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp);
+			icount = 0;
+		}
+		/*
+		 * Loop through inode btree records in this ag,
+		 * until we run out of inodes or space in the buffer.
+		 */
+		while (irbp < irbufend && icount < ubcount) {
+			/*
+			 * Loop as long as we're unable to read the
+			 * inode btree.
+			 */
+			while (error) {
+				agino += XFS_INODES_PER_CHUNK;
+				if (XFS_AGINO_TO_AGBNO(mp, agino) >=
+						INT_GET(agi->agi_length, ARCH_CONVERT))
+					break;
+				error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
+							    &tmp);
+			}
+			/*
+			 * If we ran off the end of the ag, either with an
+			 * error or the normal way, set end_of_ag and stop
+			 * collecting.
+			 */
+			if (error ||
+			    (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
+				    &gfree, &i, ARCH_NOCONVERT)) ||
+			    i == 0) {
+				end_of_ag = 1;
+				break;
+			}
+			/*
+			 * If this chunk has any allocated inodes, save it.
+			 */
+			if (gcnt < XFS_INODES_PER_CHUNK) {
+				INT_SET(irbp->ir_startino, ARCH_CONVERT, gino);
+				INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt);
+				INT_SET(irbp->ir_free, ARCH_CONVERT, gfree);
+				irbp++;
+				icount += XFS_INODES_PER_CHUNK - gcnt;
+			}
+			/*
+			 * Set agino to after this chunk and bump the cursor.
+			 */
+			agino = gino + XFS_INODES_PER_CHUNK;
+			error = xfs_inobt_increment(cur, 0, &tmp);
+		}
+		/*
+		 * Drop the btree buffers and the agi buffer.
+		 * We can't hold any of the locks these represent
+		 * when calling iget.
+		 */
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		xfs_trans_brelse(tp, agbp);
+		/*
+		 * Now format all the good inodes into the user's buffer.
+		 */
+		irbufend = irbp;
+		for (irbp = irbuf; irbp < irbufend && ubleft > 0; irbp++) {
+			/*
+			 * Read-ahead the next chunk's worth of inodes.
+			 */
+			if (&irbp[1] < irbufend) {
+				/*
+				 * Loop over all clusters in the next chunk.
+				 * Do a readahead if there are any allocated
+				 * inodes in that cluster.
+				 */
+				for (agbno = XFS_AGINO_TO_AGBNO(mp,
+							INT_GET(irbp[1].ir_startino, ARCH_CONVERT)),
+				     chunkidx = 0;
+				     chunkidx < XFS_INODES_PER_CHUNK;
+				     chunkidx += nicluster,
+				     agbno += nbcluster) {
+					if (XFS_INOBT_MASKN(chunkidx,
+							    nicluster) &
+					    ~(INT_GET(irbp[1].ir_free, ARCH_CONVERT)))
+						xfs_btree_reada_bufs(mp, agno,
+							agbno, nbcluster);
+				}
+			}
+			/*
+			 * Now process this chunk of inodes.
+			 */
+			for (agino = INT_GET(irbp->ir_startino, ARCH_CONVERT), chunkidx = 0, clustidx = 0;
+			     ubleft > 0 &&
+				INT_GET(irbp->ir_freecount, ARCH_CONVERT) < XFS_INODES_PER_CHUNK;
+			     chunkidx++, clustidx++, agino++) {
+				ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
+				/*
+				 * Recompute agbno if this is the
+				 * first inode of the cluster.
+				 *
+				 * Careful with clustidx.   There can be
+				 * multiple clusters per chunk, a single
+				 * cluster per chunk or a cluster that has
+				 * inodes represented from several different
+				 * chunks (if blocksize is large).
+				 *
+				 * Because of this, the starting clustidx is
+				 * initialized to zero in this loop but must
+				 * later be reset after reading in the cluster
+				 * buffer.
+				 */
+				if ((chunkidx & (nicluster - 1)) == 0) {
+					agbno = XFS_AGINO_TO_AGBNO(mp,
+							INT_GET(irbp->ir_startino, ARCH_CONVERT)) +
+						((chunkidx & nimask) >>
+						 mp->m_sb.sb_inopblog);
+
+					if (flags & BULKSTAT_FG_QUICK) {
+						ino = XFS_AGINO_TO_INO(mp, agno,
+								       agino);
+						bno = XFS_AGB_TO_DADDR(mp, agno,
+								       agbno);
+
+						/*
+						 * Get the inode cluster buffer
+						 */
+						ASSERT(xfs_inode_zone != NULL);
+						ip = kmem_zone_zalloc(xfs_inode_zone,
+								      KM_SLEEP);
+						ip->i_ino = ino;
+						ip->i_mount = mp;
+						if (bp)
+							xfs_trans_brelse(tp, bp);
+						error = xfs_itobp(mp, tp, ip,
+								  &dip, &bp, bno);
+						if (!error)
+							clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
+						kmem_zone_free(xfs_inode_zone, ip);
+						if (XFS_TEST_ERROR(error != 0,
+								   mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
+								   XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
+							bp = NULL;
+							break;
+						}
+					}
+				}
+				/*
+				 * Skip if this inode is free.
+				 */
+				if (XFS_INOBT_MASK(chunkidx) & INT_GET(irbp->ir_free, ARCH_CONVERT))
+					continue;
+				/*
+				 * Count used inodes as free so we can tell
+				 * when the chunk is used up.
+				 */
+				INT_MOD(irbp->ir_freecount, ARCH_CONVERT, +1);
+				ino = XFS_AGINO_TO_INO(mp, agno, agino);
+				bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+				if (flags & BULKSTAT_FG_QUICK) {
+					dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+					      (clustidx << mp->m_sb.sb_inodelog));
+
+					if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT)
+						    != XFS_DINODE_MAGIC
+					    || !XFS_DINODE_GOOD_VERSION(
+						    INT_GET(dip->di_core.di_version, ARCH_CONVERT)))
+						continue;
+				}
+
+				/*
+				 * Get the inode and fill in a single buffer.
+				 * BULKSTAT_FG_QUICK uses dip to fill it in.
+				 * BULKSTAT_FG_IGET uses igets.
+				 * See: xfs_bulkstat_one & dm_bulkstat_one.
+				 * This is also used to count inodes/blks, etc.
+				 * in xfs_qm_quotacheck.
+				 */
+				error = formatter(mp, tp, ino, ubufp, bno, dip,
+					&fmterror);
+				if (fmterror == BULKSTAT_RV_NOTHING)
+					continue;
+				if (fmterror == BULKSTAT_RV_GIVEUP) {
+					ubleft = 0;
+					ASSERT(error);
+					rval = error;
+					break;
+				}
+				if (ubufp)
+					ubufp += statstruct_size;
+				ubleft--;
+				lastino = ino;
+			}
+		}
+
+		if (bp)
+			xfs_trans_brelse(tp, bp);
+
+		/*
+		 * Set up for the next loop iteration.
+		 */
+		if (ubleft > 0) {
+			if (end_of_ag) {
+				agno++;
+				agino = 0;
+			} else
+				agino = XFS_INO_TO_AGINO(mp, lastino);
+		} else
+			break;
+	}
+	/*
+	 * Done; we're either out of filesystem or out of space to put the data.
+	 */
+	kmem_free(irbuf, NBPC);
+#if defined(HAVE_USERACC)
+	if (ubuffer)
+		unuseracc(ubuffer, ubcount * statstruct_size, (B_READ|B_PHYS));
+#endif
+	*ubcountp = ubcount - ubleft;
+	if (agno >= mp->m_sb.sb_agcount) {
+		/*
+		 * If we ran out of filesystem, mark lastino as off
+		 * the end of the filesystem, so the next call
+		 * will return immediately.
+		 */
+		*lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0);
+		*done = 1;
+	} else
+		*lastinop = (xfs_ino_t)lastino;
+
+	return rval;
+}
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ * Special case for non-sequential one inode bulkstat.
+ */
+int					/* error status */
+xfs_bulkstat_single(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_ino_t		*lastinop, /* inode to return */
+	xfs_caddr_t		buffer, /* buffer with inode stats */
+	int			*done)	/* set to 1 when no more stats to get */
+{
+	xfs_bstat_t		bstat;	/* one bulkstat result structure */
+	int			count;	/* count value for bulkstat call */
+	int			error;	/* return value */
+	xfs_ino_t		ino;	/* filesystem inode number */
+	int			res;	/* result from bs1 */
+
+	/*
+	 * Note that requesting valid inode numbers which are not allocated
+	 * to inodes will most likely cause xfs_itobp to generate warning
+	 * messages about bad magic numbers. This is ok. The fact that
+	 * the inode isn't actually an inode is handled by the
+	 * error check below. Done this way to make the usual case faster
+	 * at the expense of the error case.
+	 */
+
+	ino = (xfs_ino_t)*lastinop;
+	error = xfs_bulkstat_one(mp, NULL, ino, &bstat, 0, 0, &res);
+	if (error) {
+		/*
+		 * The special-case path failed; do it the "long" way
+		 * to see if that works.
+		 */
+		(*lastinop)--;
+		count = 1;
+		if (xfs_bulkstat(mp, NULL, lastinop, &count, xfs_bulkstat_one,
+				sizeof(bstat), buffer, BULKSTAT_FG_IGET, done))
+			return error;
+		if (count == 0 || (xfs_ino_t)*lastinop != ino)
+			return error == EFSCORRUPTED ?
+				XFS_ERROR(EINVAL) : error;
+		else
+			return 0;
+	}
+	*done = 0;
+	if (copy_to_user(buffer, &bstat, sizeof(bstat)))
+		return XFS_ERROR(EFAULT);
+	return 0;
+}
+
+/*
+ * Return inode number table for the filesystem.
+ */
+int					/* error status */
+xfs_inumbers(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	*lastino,	/* last inode returned */
+	int		*count,		/* size of buffer/count returned */
+	xfs_caddr_t	ubuffer)	/* buffer with inode descriptions */
+{
+	xfs_buf_t	*agbp;
+	xfs_agino_t	agino;
+	xfs_agnumber_t	agno;
+	int		bcount;
+	xfs_inogrp_t	*buffer;
+	int		bufidx;
+	xfs_btree_cur_t *cur;
+	int		error;
+	__int32_t	gcnt;
+	xfs_inofree_t	gfree;
+	xfs_agino_t	gino;
+	int		i;
+	xfs_ino_t	ino;
+	int		left;
+	int		tmp;
+
+	ino = (xfs_ino_t)*lastino;
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agino = XFS_INO_TO_AGINO(mp, ino);
+	left = *count;
+	*count = 0;
+	bcount = MIN(left, (int)(NBPP / sizeof(*buffer)));
+	buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
+	error = bufidx = 0;
+	cur = NULL;
+	agbp = NULL;
+	while (left > 0 && agno < mp->m_sb.sb_agcount) {
+		if (agbp == NULL) {
+			down_read(&mp->m_peraglock);
+			error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+			up_read(&mp->m_peraglock);
+			if (error) {
+				/*
+				 * If we can't read the AGI of this ag,
+				 * then just skip to the next one.
+				 */
+				ASSERT(cur == NULL);
+				agbp = NULL;
+				agno++;
+				agino = 0;
+				continue;
+			}
+			cur = xfs_btree_init_cursor(mp, tp, agbp, agno,
+				XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+			error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
+			if (error) {
+				xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+				cur = NULL;
+				xfs_trans_brelse(tp, agbp);
+				agbp = NULL;
+				/*
+				 * Move up to the last inode in the current
+				 * chunk.  The lookup_ge will always get
+				 * us the first inode in the next chunk.
+				 */
+				agino += XFS_INODES_PER_CHUNK - 1;
+				continue;
+			}
+		}
+		if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
+			&i, ARCH_NOCONVERT)) ||
+		    i == 0) {
+			xfs_trans_brelse(tp, agbp);
+			agbp = NULL;
+			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+			cur = NULL;
+			agno++;
+			agino = 0;
+			continue;
+		}
+		agino = gino + XFS_INODES_PER_CHUNK - 1;
+		buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
+		buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
+		buffer[bufidx].xi_allocmask = ~gfree;
+		bufidx++;
+		left--;
+		if (bufidx == bcount) {
+			if (copy_to_user(ubuffer, buffer,
+					bufidx * sizeof(*buffer))) {
+				error = XFS_ERROR(EFAULT);
+				break;
+			}
+			ubuffer += bufidx * sizeof(*buffer);
+			*count += bufidx;
+			bufidx = 0;
+		}
+		if (left) {
+			error = xfs_inobt_increment(cur, 0, &tmp);
+			if (error) {
+				xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+				cur = NULL;
+				xfs_trans_brelse(tp, agbp);
+				agbp = NULL;
+				/*
+				 * The agino value has already been bumped.
+				 * Just try to skip up to it.
+				 */
+				agino += XFS_INODES_PER_CHUNK;
+				continue;
+			}
+		}
+	}
+	if (!error) {
+		if (bufidx) {
+			if (copy_to_user(ubuffer, buffer,
+					bufidx * sizeof(*buffer)))
+				error = XFS_ERROR(EFAULT);
+			else
+				*count += bufidx;
+		}
+		*lastino = XFS_AGINO_TO_INO(mp, agno, agino);
+	}
+	kmem_free(buffer, bcount * sizeof(*buffer));
+	if (cur)
+		xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
+					   XFS_BTREE_NOERROR));
+	if (agbp)
+		xfs_trans_brelse(tp, agbp);
+	return error;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_itable.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_itable.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_itable.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_itable.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ITABLE_H__
+#define __XFS_ITABLE_H__
+
+/*
+ * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat
+ * structures (by the dmi library). This is a pointer to a formatter function
+ * that will iget the inode and fill in the appropriate structure.
+ * See xfs_bulkstat_one() and dm_bulkstat_one() in dmi_xfs.c.
+ */
+typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
+			       struct xfs_trans *tp,
+			       xfs_ino_t	ino,
+			       void		*buffer,
+			       xfs_daddr_t	bno,
+			       void		*dip,
+			       int		*stat);
+/*
+ * Values for stat return value.
+ */
+#define BULKSTAT_RV_NOTHING	0
+#define BULKSTAT_RV_DIDONE	1
+#define BULKSTAT_RV_GIVEUP	2
+
+/*
+ * Values for bulkstat flag argument.
+ */
+#define BULKSTAT_FG_IGET	0x1	/* Go through the buffer cache */
+#define BULKSTAT_FG_QUICK	0x2	/* No iget, walk the dinode cluster */
+#define BULKSTAT_FG_VFSLOCKED	0x4	/* Already have vfs lock */
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ */
+int					/* error status */
+xfs_bulkstat(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	*lastino,	/* last inode returned */
+	int		*count,		/* size of buffer/count returned */
+	bulkstat_one_pf formatter,	/* func that'd fill a single buf */
+	size_t		statstruct_size,/* sizeof struct that we're filling */
+	xfs_caddr_t	ubuffer,	/* buffer with inode stats */
+	int		flags,		/* flag to control access method */
+	int		*done);		/* set to 1 when no more stats to get */
+
+int
+xfs_bulkstat_single(
+	xfs_mount_t		*mp,
+	xfs_ino_t		*lastinop,
+	xfs_caddr_t		buffer,
+	int			*done);
+
+int
+xfs_bulkstat_one(
+	xfs_mount_t		*mp,
+	xfs_trans_t		*tp,
+	xfs_ino_t		ino,
+	void			*buffer,
+	xfs_daddr_t		bno,
+	void			*dibuff,
+	int			*stat);
+
+int					/* error status */
+xfs_inumbers(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_ino_t		*last,	/* last inode returned */
+	int			*count, /* size of buffer/count returned */
+	xfs_caddr_t		buffer);/* buffer with inode descriptions */
+
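+/*
+ * Example (illustrative sketch only; 'ubuf' and 'nstats' are hypothetical
+ * caller-supplied values): a caller drives xfs_bulkstat() in a loop,
+ * feeding the returned last-inode cookie back in until *done is set:
+ *
+ *	xfs_ino_t	lastino = 0;
+ *	int		count, done = 0;
+ *
+ *	while (!done) {
+ *		count = nstats;
+ *		error = xfs_bulkstat(mp, NULL, &lastino, &count,
+ *				xfs_bulkstat_one, sizeof(xfs_bstat_t),
+ *				ubuf, BULKSTAT_FG_IGET, &done);
+ *		if (error || count == 0)
+ *			break;
+ *		(consume 'count' xfs_bstat_t entries from ubuf)
+ *	}
+ */
+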
+#endif	/* __XFS_ITABLE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_log.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_log.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_log.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_log.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,3628 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * High level interface routines for log manager
+ */
+
+#include <xfs.h>
+
+
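+/*
+ * Advance a write position by 'bytes': bump the data pointer and the
+ * running offset, and shrink the remaining length to match.
+ */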
+#define xlog_write_adv_cnt(ptr, len, off, bytes) \
+	{ (ptr) += (bytes); \
+	  (len) -= (bytes); \
+	  (off) += (bytes);}
+
+/* Local miscellaneous function prototypes */
+STATIC int	 xlog_bdstrat_cb(struct xfs_buf *);
+STATIC int	 xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
+				    xfs_lsn_t *);
+STATIC xlog_t *	 xlog_alloc_log(xfs_mount_t	*mp,
+				dev_t		log_dev,
+				xfs_daddr_t	blk_offset,
+				int		num_bblks);
+STATIC int	 xlog_space_left(xlog_t *log, int cycle, int bytes);
+STATIC int	 xlog_sync(xlog_t *log, xlog_in_core_t *iclog, uint flags);
+STATIC void	 xlog_unalloc_log(xlog_t *log);
+STATIC int	 xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
+			    int nentries, xfs_log_ticket_t tic,
+			    xfs_lsn_t *start_lsn, uint flags);
+
+/* local state machine functions */
+STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
+STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog);
+static inline void xlog_state_finish_copy(xlog_t	*log,
+					xlog_in_core_t	*iclog,
+					int		first_write,
+					int		bytes);
+STATIC int  xlog_state_get_iclog_space(xlog_t		*log,
+				       int		len,
+				       xlog_in_core_t	**iclog,
+				       xlog_ticket_t	*ticket,
+				       int		*continued_write,
+				       int		*logoffsetp);
+STATIC int  xlog_state_lsn_is_synced(xlog_t		*log,
+				     xfs_lsn_t		lsn,
+				     xfs_log_callback_t *cb,
+				     int		*abortflg);
+STATIC void xlog_state_put_ticket(xlog_t	*log,
+				  xlog_ticket_t *tic);
+STATIC int  xlog_state_release_iclog(xlog_t		*log,
+				     xlog_in_core_t	*iclog);
+STATIC void xlog_state_switch_iclogs(xlog_t		*log,
+				     xlog_in_core_t *iclog,
+				     int		eventual_size);
+STATIC int  xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags);
+STATIC int  xlog_state_sync_all(xlog_t *log, uint flags);
+STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
+
+/* local functions to manipulate grant head */
+STATIC int  xlog_grant_log_space(xlog_t		*log,
+				 xlog_ticket_t	*xtic);
+STATIC void xlog_grant_push_ail(xfs_mount_t	*mp,
+				int		need_bytes);
+STATIC void xlog_regrant_reserve_log_space(xlog_t	 *log,
+					   xlog_ticket_t *ticket);
+STATIC int xlog_regrant_write_log_space(xlog_t		*log,
+					 xlog_ticket_t	*ticket);
+STATIC void xlog_ungrant_log_space(xlog_t	 *log,
+				   xlog_ticket_t *ticket);
+
+
+/* local ticket functions */
+STATIC void		xlog_state_ticket_alloc(xlog_t *log);
+STATIC xlog_ticket_t	*xlog_ticket_get(xlog_t *log,
+					 int	unit_bytes,
+					 int	count,
+					 char	clientid,
+					 uint	flags);
+STATIC void		xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket);
+
+/* local debug functions */
+#if defined(DEBUG) && !defined(XLOG_NOLOG)
+STATIC void	xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
+#ifdef XFSDEBUG
+STATIC void	xlog_verify_disk_cycle_no(xlog_t *log, xlog_in_core_t *iclog);
+#endif
+STATIC void	xlog_verify_grant_head(xlog_t *log, int equals);
+STATIC void	xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
+				  int count, boolean_t syncing);
+STATIC void	xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
+				     xfs_lsn_t tail_lsn);
+#else
+#define xlog_verify_dest_ptr(a,b)
+#define xlog_verify_disk_cycle_no(a,b)
+#define xlog_verify_grant_head(a,b)
+#define xlog_verify_iclog(a,b,c,d)
+#define xlog_verify_tail_lsn(a,b,c)
+#endif
+
+int		xlog_iclogs_empty(xlog_t *log);
+
+#ifdef DEBUG
+int xlog_do_error = 0;
+int xlog_req_num  = 0;
+int xlog_error_mod = 33;
+#endif
+
+#define XLOG_FORCED_SHUTDOWN(log)	(log->l_flags & XLOG_IO_ERROR)
+
+/*
+ * 0 => disable log manager
+ * 1 => enable log manager
+ * 2 => enable log manager and log debugging
+ */
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+int   xlog_debug = 1;
+dev_t xlog_devt	 = 0;
+#endif
+
+#if defined(XFS_LOG_TRACE)
+void
+xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
+{
+	if (! log->l_grant_trace)
+		log->l_grant_trace = ktrace_alloc(1024, KM_SLEEP);
+
+	ktrace_enter(log->l_grant_trace,
+		     (void *)tic,
+		     (void *)log->l_reserve_headq,
+		     (void *)log->l_write_headq,
+		     (void *)((unsigned long)log->l_grant_reserve_cycle),
+		     (void *)((unsigned long)log->l_grant_reserve_bytes),
+		     (void *)((unsigned long)log->l_grant_write_cycle),
+		     (void *)((unsigned long)log->l_grant_write_bytes),
+		     (void *)((unsigned long)log->l_curr_cycle),
+		     (void *)((unsigned long)log->l_curr_block),
+		     (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn, ARCH_NOCONVERT)),
+		     (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn, ARCH_NOCONVERT)),
+		     (void *)string,
+		     (void *)((unsigned long)13),
+		     (void *)((unsigned long)14),
+		     (void *)((unsigned long)15),
+		     (void *)((unsigned long)16));
+}
+
+void
+xlog_trace_tic(xlog_t *log, xlog_ticket_t *tic)
+{
+	if (! log->l_trace)
+		log->l_trace = ktrace_alloc(256, KM_SLEEP);
+
+	ktrace_enter(log->l_trace,
+		     (void *)tic,
+		     (void *)((unsigned long)tic->t_curr_res),
+		     (void *)((unsigned long)tic->t_unit_res),
+		     (void *)((unsigned long)tic->t_ocnt),
+		     (void *)((unsigned long)tic->t_cnt),
+		     (void *)((unsigned long)tic->t_flags),
+		     (void *)((unsigned long)7),
+		     (void *)((unsigned long)8),
+		     (void *)((unsigned long)9),
+		     (void *)((unsigned long)10),
+		     (void *)((unsigned long)11),
+		     (void *)((unsigned long)12),
+		     (void *)((unsigned long)13),
+		     (void *)((unsigned long)14),
+		     (void *)((unsigned long)15),
+		     (void *)((unsigned long)16));
+}
+
+void
+xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
+{
+	pid_t pid;
+
+	pid = current_pid();
+
+	if (!iclog->ic_trace)
+		iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
+	ktrace_enter(iclog->ic_trace,
+		     (void *)((unsigned long)state),
+		     (void *)((unsigned long)pid),
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0);
+}
+
+#else
+#define xlog_trace_loggrant(log,tic,string)
+#define xlog_trace_iclog(iclog,state)
+#endif /* XFS_LOG_TRACE */
+
+/*
+ * NOTES:
+ *
+ *	1. currblock field gets updated at startup and after in-core logs
+ *		are marked with WANT_SYNC.
+ */
+
+/*
+ * This routine is called when a user of a log manager ticket is done with
+ * the reservation.  If the ticket was ever used, then a commit record for
+ * the associated transaction is written out as a log operation header with
+ * no data.  The flag XLOG_TIC_INITED is set when the first write occurs with
+ * a given ticket.  If the ticket was one with a permanent reservation, then
+ * a few operations are done differently.  Permanent reservation tickets by
+ * default don't release the reservation.  They just commit the current
+ * transaction with the belief that the reservation is still needed.  A flag
+ * must be passed in before permanent reservations are actually released.
+ * When these types of tickets are not released, they need to be set into
+ * the inited state again.  By doing this, a start record will be written
+ * out when the next write occurs.
+ */
+xfs_lsn_t
+xfs_log_done(xfs_mount_t	*mp,
+	     xfs_log_ticket_t	xtic,
+	     uint		flags)
+{
+	xlog_t		*log	= mp->m_log;
+	xlog_ticket_t	*ticket = (xfs_log_ticket_t) xtic;
+	xfs_lsn_t	lsn	= 0;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug && xlog_devt == log->l_dev)
+		return 0;
+#endif
+
+	if (XLOG_FORCED_SHUTDOWN(log) ||
+	    /*
+	     * If nothing was ever written, don't write out commit record.
+	     * If we get an error, just continue and give back the log ticket.
+	     */
+	    (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
+	     (xlog_commit_record(mp, ticket, &lsn)))) {
+		lsn = (xfs_lsn_t) -1;
+		if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
+			flags |= XFS_LOG_REL_PERM_RESERV;
+		}
+	}
+
+
+	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
+	    (flags & XFS_LOG_REL_PERM_RESERV)) {
+		/*
+		 * Release ticket if not permanent reservation or a specific
+		 * request has been made to release a permanent reservation.
+		 */
+		xlog_ungrant_log_space(log, ticket);
+		xlog_state_put_ticket(log, ticket);
+	} else {
+		xlog_regrant_reserve_log_space(log, ticket);
+	}
+
+	/* If this ticket was a permanent reservation and we aren't
+	 * trying to release it, reset the inited flags; so next time
+	 * we write, a start record will be written out.
+	 */
+	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
+	    (flags & XFS_LOG_REL_PERM_RESERV) == 0)
+		ticket->t_flags |= XLOG_TIC_INITED;
+
+	return lsn;
+}	/* xfs_log_done */
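+
+/*
+ * Example (illustrative sketch only; 'unit_bytes', 'reg' and 'nentries'
+ * are hypothetical caller-supplied values): a single-use, non-permanent
+ * reservation runs through reserve, write and done:
+ *
+ *	xfs_log_ticket_t tic = NULL;
+ *
+ *	error = xfs_log_reserve(mp, unit_bytes, 1, &tic, XFS_TRANSACTION, 0);
+ *	if (!error) {
+ *		error = xfs_log_write(mp, reg, nentries, tic, &start_lsn);
+ *		lsn = xfs_log_done(mp, tic, 0);
+ *	}
+ */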
+
+
+/*
+ * Force the in-core log to disk.  If flags == XFS_LOG_SYNC,
+ *	the force is done synchronously.
+ *
+ * Asynchronous forces are implemented by setting the WANT_SYNC
+ * bit in the appropriate in-core log and then returning.
+ *
+ * Synchronous forces are implemented with a semaphore.	 All callers
+ * to force a given lsn to disk will wait on a semaphore attached to the
+ * specific in-core log.  When the given in-core log finally completes its
+ * write to disk, that thread will wake up all threads waiting on the
+ * semaphore.
+ */
+int
+xfs_log_force(xfs_mount_t *mp,
+	      xfs_lsn_t	  lsn,
+	      uint	  flags)
+{
+	int	rval;
+	xlog_t *log = mp->m_log;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug && xlog_devt == log->l_dev)
+		return 0;
+#endif
+
+	ASSERT(flags & XFS_LOG_FORCE);
+
+	XFS_STATS_INC(xfsstats.xs_log_force);
+
+	if ((log->l_flags & XLOG_IO_ERROR) == 0) {
+		if (lsn == 0)
+			rval = xlog_state_sync_all(log, flags);
+		else
+			rval = xlog_state_sync(log, lsn, flags);
+	} else {
+		rval = XFS_ERROR(EIO);
+	}
+
+	return rval;
+
+}	/* xfs_log_force */
+
+
+/*
+ * This function will take a log sequence number and check to see if that
+ * lsn has been flushed to disk.  If it has, then the callback function is
+ * called with the callback argument.  If the relevant in-core log has not
+ * been synced to disk, we add the callback to the callback list of the
+ * in-core log.
+ */
+void
+xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
+	       xfs_lsn_t	  lsn,		/* lsn looking for */
+	       xfs_log_callback_t *cb)
+{
+	xlog_t *log = mp->m_log;
+	int	abortflg;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug && xlog_devt == log->l_dev)
+		return;
+#endif
+	cb->cb_next = 0;
+	if (xlog_state_lsn_is_synced(log, lsn, cb, &abortflg))
+		cb->cb_func(cb->cb_arg, abortflg);
+}	/* xfs_log_notify */
+
+
+/*
+ * Initialize log manager data.	 This routine is intended to be called when
+ * a system boots up.  It is not a per filesystem initialization.
+ *
+ * As you can see, we currently do nothing.
+ */
+int
+xfs_log_init(void)
+{
+	return( 0 );
+}
+
+
+/*
+ *  1. Reserve an amount of on-disk log space and return a ticket corresponding
+ *	to the reservation.
+ *  2. Potentially, push buffers at tail of log to disk.
+ *
+ * Each reservation is going to reserve extra space for a log record header.
+ * When writes happen to the on-disk log, we don't subtract the length of the
+ * log record header from any reservation.  By wasting space in each
+ * reservation, we prevent over-allocation problems.
+ */
+int
+xfs_log_reserve(xfs_mount_t	 *mp,
+		int		 unit_bytes,
+		int		 cnt,
+		xfs_log_ticket_t *ticket,
+		__uint8_t	 client,
+		uint		 flags)
+{
+	xlog_t		*log = mp->m_log;
+	xlog_ticket_t	*internal_ticket;
+	int		retval;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug && xlog_devt == log->l_dev)
+		return 0;
+#endif
+	retval = 0;
+	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
+	ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
+
+	if (XLOG_FORCED_SHUTDOWN(log))
+		return XFS_ERROR(EIO);
+
+	XFS_STATS_INC(xfsstats.xs_try_logspace);
+
+	if (*ticket != NULL) {
+		ASSERT(flags & XFS_LOG_PERM_RESERV);
+		internal_ticket = (xlog_ticket_t *)*ticket;
+		xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
+		retval = xlog_regrant_write_log_space(log, internal_ticket);
+	} else {
+		/* may sleep if need to allocate more tickets */
+		internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
+						  client, flags);
+		*ticket = internal_ticket;
+		xlog_grant_push_ail(mp,
+				    (internal_ticket->t_unit_res *
+				     internal_ticket->t_cnt));
+		retval = xlog_grant_log_space(log, internal_ticket);
+	}
+
+	return retval;
+}	/* xfs_log_reserve */
+
+
+/*
+ * Mount a log filesystem
+ *
+ * mp		- ubiquitous xfs mount point structure
+ * log_dev	- device number of on-disk log device
+ * blk_offset	- Start block # where block size is 512 bytes (BBSIZE)
+ * num_bblocks	- Number of BBSIZE blocks in on-disk log
+ *
+ * Return error or zero.
+ */
+int
+xfs_log_mount(xfs_mount_t	*mp,
+	      dev_t		log_dev,
+	      xfs_daddr_t	blk_offset,
+	      int		num_bblks)
+{
+	xlog_t *log;
+
+	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
+		cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
+	else {
+		cmn_err(CE_NOTE,
+			"!Mounting filesystem \"%s\" in no-recovery mode.  Filesystem will be inconsistent.",
+			mp->m_fsname);
+		ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
+	}
+
+	mp->m_log = log = xlog_alloc_log(mp, log_dev, blk_offset, num_bblks);
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug) {
+		cmn_err(CE_NOTE, "log dev: 0x%x", log_dev);
+		return 0;
+	}
+#endif
+	/*
+	 * Skip log recovery on a norecovery mount.  Pretend it all
+	 * just worked.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+		int    error;
+
+		error = xlog_recover(log,XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
+		if (error) {
+			cmn_err(CE_WARN, "XFS: log mount/recovery failed");
+			xlog_unalloc_log(log);
+			return error;
+		}
+	}
+
+	/* Normal transactions can now occur */
+	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+
+	/* End mounting message in xfs_log_mount_finish */
+	return 0;
+}	/* xfs_log_mount */
+
+/*
+ * Finish the recovery of the file system.  This is separate from
+ * the xfs_log_mount() call, because it depends on the code in
+ * xfs_mountfs() to read in the root and real-time bitmap inodes
+ * between calling xfs_log_mount() and here.
+ *
+ * mp		- ubiquitous xfs mount point structure
+ */
+int
+xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags)
+{
+	int	error;
+
+	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
+		error = xlog_recover_finish(mp->m_log, mfsi_flags);
+	else {
+		error = 0;
+		ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
+	}
+
+	return error;
+}
+
+/*
+ * Unmount processing for the log.
+ */
+int
+xfs_log_unmount(xfs_mount_t *mp)
+{
+	int		error;
+
+	error = xfs_log_unmount_write(mp);
+	xfs_log_unmount_dealloc(mp);
+	return (error);
+}
+
+/*
+ * Final log writes as part of unmount.
+ *
+ * Mark the filesystem clean as unmount happens.  Note that during relocation
+ * this routine needs to be executed as part of source-bag while the
+ * deallocation must not be done until source-end.
+ */
+
+/*
+ * Unmount record used to have a string "Unmount filesystem--" in the
+ * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
+ * We just write the magic number now since that particular field isn't
+ * currently architecture converted and "nUmount" looks a bit odd.
+ * As far as I know, there weren't any dependencies on the old behaviour.
+ */
+
+int
+xfs_log_unmount_write(xfs_mount_t *mp)
+{
+	xlog_t		 *log = mp->m_log;
+	xlog_in_core_t	 *iclog;
+#ifdef DEBUG
+	xlog_in_core_t	 *first_iclog;
+#endif
+	xfs_log_iovec_t	 reg[1];
+	xfs_log_ticket_t tic = 0;
+	xfs_lsn_t	 lsn;
+	int		 error;
+	SPLDECL(s);
+
+	/* the data section must be 32 bit size aligned */
+	struct {
+	    __uint16_t magic;
+	    __uint16_t pad1;
+	    __uint32_t pad2; /* may as well make it 64 bits */
+	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug && xlog_devt == log->l_dev)
+		return 0;
+#endif
+
+	/*
+	 * Don't write out the unmount record on read-only mounts, or if we
+	 * are doing a forced umount (typically because of I/O errors).
+	 */
+	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
+		return 0;
+
+	xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+
+#ifdef DEBUG
+	first_iclog = iclog = log->l_iclog;
+	do {
+		if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
+			ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
+			ASSERT(iclog->ic_offset == 0);
+		}
+		iclog = iclog->ic_next;
+	} while (iclog != first_iclog);
+#endif
+	if (! (XLOG_FORCED_SHUTDOWN(log))) {
+		reg[0].i_addr = (void*)&magic;
+		reg[0].i_len  = sizeof(magic);
+
+		error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
+		if (!error) {
+			/* remove inited flag */
+			((xlog_ticket_t *)tic)->t_flags = 0;
+			error = xlog_write(mp, reg, 1, tic, &lsn,
+					   XLOG_UNMOUNT_TRANS);
+			/*
+			 * At this point, we're umounting anyway,
+			 * so there's no point in transitioning log state
+			 * to IOERROR. Just continue...
+			 */
+		}
+
+		if (error) {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"xfs_log_unmount: unmount record failed");
+		}
+
+
+		s = LOG_LOCK(log);
+		iclog = log->l_iclog;
+		iclog->ic_refcnt++;
+		LOG_UNLOCK(log, s);
+		xlog_state_want_sync(log, iclog);
+		(void) xlog_state_release_iclog(log, iclog);
+
+		s = LOG_LOCK(log);
+		if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
+		      iclog->ic_state == XLOG_STATE_DIRTY)) {
+			if (!XLOG_FORCED_SHUTDOWN(log)) {
+				sv_wait(&iclog->ic_forcesema, PMEM,
+					&log->l_icloglock, s);
+			} else {
+				LOG_UNLOCK(log, s);
+			}
+		} else {
+			LOG_UNLOCK(log, s);
+		}
+		if (tic)
+			xlog_state_put_ticket(log, tic);
+	} else {
+		/*
+		 * We're already in forced_shutdown mode, couldn't
+		 * even attempt to write out the unmount transaction.
+		 *
+		 * Go through the motions of sync'ing and releasing
+		 * the iclog, even though no I/O will actually happen,
+		 * we need to wait for other log I/O's that may already
+		 * be in progress.  Do this as a separate section of
+		 * code so we'll know if we ever get stuck here that
+		 * we're in this odd situation of trying to unmount
+		 * a file system that went into forced_shutdown as
+		 * the result of an unmount..
+		 */
+		s = LOG_LOCK(log);
+		iclog = log->l_iclog;
+		iclog->ic_refcnt++;
+		LOG_UNLOCK(log, s);
+
+		xlog_state_want_sync(log, iclog);
+		(void) xlog_state_release_iclog(log, iclog);
+
+		s = LOG_LOCK(log);
+
+		if ( ! (   iclog->ic_state == XLOG_STATE_ACTIVE
+			|| iclog->ic_state == XLOG_STATE_DIRTY
+			|| iclog->ic_state == XLOG_STATE_IOERROR) ) {
+
+				sv_wait(&iclog->ic_forcesema, PMEM,
+					&log->l_icloglock, s);
+		} else {
+			LOG_UNLOCK(log, s);
+		}
+	}
+
+	return 0;
+}	/* xfs_log_unmount_write */
+
+/*
+ * Deallocate log structures for unmount/relocation.
+ */
+void
+xfs_log_unmount_dealloc(xfs_mount_t *mp)
+{
+	xlog_unalloc_log(mp->m_log);
+}
+
+/*
+ * Write region vectors to log.	 The write happens using the space reservation
+ * of the ticket (tic).	 It is not a requirement that all writes for a given
+ * transaction occur with one call to xfs_log_write().
+ */
+int
+xfs_log_write(xfs_mount_t *	mp,
+	      xfs_log_iovec_t	reg[],
+	      int		nentries,
+	      xfs_log_ticket_t	tic,
+	      xfs_lsn_t		*start_lsn)
+{
+	int	error;
+	xlog_t *log = mp->m_log;
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+
+	if (! xlog_debug && xlog_devt == log->l_dev) {
+		*start_lsn = 0;
+		return 0;
+	}
+#endif
+	if (XLOG_FORCED_SHUTDOWN(log))
+		return XFS_ERROR(EIO);
+
+	if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, 0))) {
+		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+	}
+	return (error);
+}	/* xfs_log_write */
+
+
+void
+xfs_log_move_tail(xfs_mount_t	*mp,
+		  xfs_lsn_t	tail_lsn)
+{
+	xlog_ticket_t	*tic;
+	xlog_t		*log = mp->m_log;
+	int		need_bytes, free_bytes, cycle, bytes;
+	SPLDECL(s);
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (!xlog_debug && xlog_devt == log->l_dev)
+		return;
+#endif
+	/* XXXsup tmp */
+	if (XLOG_FORCED_SHUTDOWN(log))
+		return;
+	ASSERT(!XFS_FORCED_SHUTDOWN(mp));
+
+	if (tail_lsn == 0) {
+		/* needed since sync_lsn is 64 bits */
+		s = LOG_LOCK(log);
+		tail_lsn = log->l_last_sync_lsn;
+		LOG_UNLOCK(log, s);
+	}
+
+	s = GRANT_LOCK(log);
+
+	/* A tail_lsn of 1 is also an illegal value; it implies that we
+	 * aren't passing in a legal tail_lsn.
+	 */
+	if (tail_lsn != 1)
+		log->l_tail_lsn = tail_lsn;
+
+	if ((tic = log->l_write_headq)) {
+#ifdef DEBUG
+		if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+			panic("Recovery problem");
+#endif
+		cycle = log->l_grant_write_cycle;
+		bytes = log->l_grant_write_bytes;
+		free_bytes = xlog_space_left(log, cycle, bytes);
+		do {
+			ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+
+			if (free_bytes < tic->t_unit_res)
+				break;
+			free_bytes -= tic->t_unit_res;
+			sv_signal(&tic->t_sema);
+			tic = tic->t_next;
+		} while (tic != log->l_write_headq);
+	}
+	if ((tic = log->l_reserve_headq)) {
+#ifdef DEBUG
+		if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+			panic("Recovery problem");
+#endif
+		cycle = log->l_grant_reserve_cycle;
+		bytes = log->l_grant_reserve_bytes;
+		free_bytes = xlog_space_left(log, cycle, bytes);
+		do {
+			if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+				need_bytes = tic->t_unit_res*tic->t_cnt;
+			else
+				need_bytes = tic->t_unit_res;
+			if (free_bytes < need_bytes)
+				break;
+			free_bytes -= need_bytes;
+			sv_signal(&tic->t_sema);
+			tic = tic->t_next;
+		} while (tic != log->l_reserve_headq);
+	}
+	GRANT_UNLOCK(log, s);
+}	/* xfs_log_move_tail */
+
+/*
+ * Determine if we have a transaction that has gone to disk
+ * that needs to be covered. Log activity needs to be idle (no AIL and
+ * nothing in the iclogs). And, we need to be in the right state indicating
+ * something has gone out.
+ */
+int
+xfs_log_need_covered(xfs_mount_t *mp)
+{
+	SPLDECL(s);
+	int		needed = 0, gen;
+	xlog_t		*log = mp->m_log;
+
+	if (mp->m_frozen || XFS_FORCED_SHUTDOWN(mp))
+		return 0;
+
+	s = LOG_LOCK(log);
+	if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
+		(log->l_covered_state == XLOG_STATE_COVER_NEED2))
+			&& !xfs_trans_first_ail(mp, &gen)
+			&& xlog_iclogs_empty(log)) {
+		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+			log->l_covered_state = XLOG_STATE_COVER_DONE;
+		else {
+			ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
+			log->l_covered_state = XLOG_STATE_COVER_DONE2;
+		}
+		needed = 1;
+	}
+	LOG_UNLOCK(log, s);
+	return(needed);
+}
+
+/******************************************************************************
+ *
+ *	local routines
+ *
+ ******************************************************************************
+ */
+
+/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
+ * The log manager must keep track of the last LR which was committed
+ * to disk.  The lsn of this LR will become the new tail_lsn whenever
+ * xfs_trans_tail_ail returns 0.  If we don't do this, we run into
+ * the situation where stuff could be written into the log but nothing
+ * was ever in the AIL when asked.  Eventually, we panic since the
+ * tail hits the head.
+ *
+ * We may be holding the log iclog lock upon entering this routine.
+ */
+xfs_lsn_t
+xlog_assign_tail_lsn(xfs_mount_t *mp, xlog_in_core_t *iclog)
+{
+	xfs_lsn_t tail_lsn;
+	SPLDECL(s);
+	xlog_t	  *log = mp->m_log;
+
+	tail_lsn = xfs_trans_tail_ail(mp);
+	s = GRANT_LOCK(log);
+	if (tail_lsn != 0)
+		log->l_tail_lsn = tail_lsn;
+	else
+		tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
+	if (iclog)
+		INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, tail_lsn);
+	GRANT_UNLOCK(log, s);
+
+	return tail_lsn;
+}	/* xlog_assign_tail_lsn */
+
+
+/*
+ * Return the space in the log between the tail and the head.  The head
+ * is passed in the cycle/bytes formal parms.  In the special case where
+ * the reserve head has wrapped past the tail, this calculation is no
+ * longer valid.  In this case, just return 0 which means there is no space
+ * in the log.	This works for all places where this function is called
+ * with the reserve head.  Of course, if the write head were to ever
+ * wrap the tail, we should blow up.  Rather than catch this case here,
+ * we depend on other ASSERTions in other parts of the code.   XXXmiken
+ *
+ * This code also handles the case where the reservation head is behind
+ * the tail.  The details of this case are described below, but the end
+ * result is that we return the size of the log as the amount of space left.
+ */
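+/*
+ * A worked example with hypothetical numbers, for illustration only:
+ * in a 1000-block (BBSIZE) log with the tail at cycle 2, block 100 and
+ * the head at cycle 3, block 40, the head is exactly one cycle ahead,
+ * so the space left is tail_bytes - bytes = BBTOB(100) - BBTOB(40),
+ * i.e. 60 blocks' worth.  If instead head and tail are both in cycle 2
+ * with the head at block 150, the space left is
+ * l_logsize - BBTOB(150 - 100).
+ */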
+int
+xlog_space_left(xlog_t *log, int cycle, int bytes)
+{
+	int free_bytes;
+	int tail_bytes;
+	int tail_cycle;
+
+	tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn, ARCH_NOCONVERT));
+	tail_cycle = CYCLE_LSN(log->l_tail_lsn, ARCH_NOCONVERT);
+	if ((tail_cycle == cycle) && (bytes >= tail_bytes)) {
+		free_bytes = log->l_logsize - (bytes - tail_bytes);
+	} else if ((tail_cycle + 1) < cycle) {
+		return 0;
+	} else if (tail_cycle < cycle) {
+		ASSERT(tail_cycle == (cycle - 1));
+		free_bytes = tail_bytes - bytes;
+	} else {
+		/*
+		 * The reservation head is behind the tail.
+		 * This can only happen when the AIL is empty so the tail
+		 * is equal to the head and the l_roundoff value in the
+		 * log structure is taking up the difference between the
+		 * reservation head and the tail.  The bytes accounted for
+		 * by the l_roundoff field are temporarily 'lost' to the
+		 * reservation mechanism, but they are cleaned up when the
+		 * log buffers that created them are reused.  These lost
+		 * bytes are what allow the reservation head to fall behind
+		 * the tail in the case that the log is 'empty'.
+		 * In this case we just want to return the size of the
+		 * log as the amount of space left.
+		 */
+/* This assert does not take into account padding from striped log writes *
+		ASSERT((tail_cycle == (cycle + 1)) ||
+		       ((bytes + log->l_roundoff) >= tail_bytes));
+*/
+		free_bytes = log->l_logsize;
+	}
+	return free_bytes;
+}	/* xlog_space_left */
+
+
+/*
+ * Log function which is called when an io completes.
+ *
+ * The log manager needs its own routine, in order to control what
+ * happens with the buffer after the write completes.
+ */
+void
+xlog_iodone(xfs_buf_t *bp)
+{
+	xlog_in_core_t *iclog;
+	int		aborted;
+
+	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
+	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
+	aborted = 0;
+
+	/*
+	 * Race to shutdown the filesystem if we see an error.
+	 */
+	if (XFS_BUF_GETERROR(bp)) {
+		/* Some versions of cpp barf on the recursive definition of
+		 * ic_log -> hic_fields.ic_log and expand ic_log twice when
+		 * it is passed through two macros.  Workaround for broken cpp
+		 */
+		struct log *l;
+		xfs_ioerror_alert("xlog_iodone",
+				  iclog->ic_log->l_mp, bp, XFS_BUF_ADDR(bp));
+		XFS_BUF_STALE(bp);
+		l = iclog->ic_log;
+		xfs_force_shutdown(l->l_mp, XFS_LOG_IO_ERROR);
+		/*
+		 * This flag will be propagated to the trans-committed
+		 * callback routines to let them know that the log-commit
+		 * didn't succeed.
+		 */
+		aborted = XFS_LI_ABORTED;
+	} else if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		aborted = XFS_LI_ABORTED;
+	}
+	xlog_state_done_syncing(iclog, aborted);
+	if (!(XFS_BUF_ISASYNC(bp))) {
+		/*
+		 * The corresponding psema() is done in bwrite().  If we
+		 * don't vsema() here, we will panic.
+		 */
+		XFS_BUF_V_IODONESEMA(bp);
+	}
+}	/* xlog_iodone */
+
+/*
+ * The bdstrat callback function for log bufs. This gives us a central
+ * place to trap bufs in case we get hit by a log I/O error and need to
+ * shutdown. Actually, in practice, even when we didn't get a log error,
+ * we transition the iclogs to IOERROR state *after* flushing all existing
+ * iclogs to disk. This is because we don't want any more new transactions to be
+ * started or completed afterwards.
+ */
+STATIC int
+xlog_bdstrat_cb(struct xfs_buf *bp)
+{
+	xlog_in_core_t *iclog;
+
+	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
+
+	if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
+		/* Note: on IRIX, bstrat will need a struct bdevsw passed in.
+		 * Fix the following macro if the code is ever merged.
+		 */
+		XFS_bdstrat(bp);
+		return 0;
+	}
+
+	xfs_buftrace("XLOG__BDSTRAT IOERROR", bp);
+	XFS_BUF_ERROR(bp, EIO);
+	XFS_BUF_STALE(bp);
+	xfs_biodone(bp);
+	return (XFS_ERROR(EIO));
+}
+
+/*
+ * Return size of each in-core log record buffer.
+ *
+ * Low-memory machines only get two 16KB buffers; we don't want to waste
+ * memory there.  All other machines get at least two 32KB buffers.
+ * The 32MB threshold is hard coded because we only care about telling
+ * such low-memory systems apart, not about the exact memory size.
+ *
+ * If the filesystem blocksize is too large, we may need to choose a
+ * larger size since the directory code currently logs entire blocks.
+ * XXXmiken XXXcurtis
+ */
+
+STATIC void
+xlog_get_iclog_buffer_size(xfs_mount_t	*mp,
+			   xlog_t	*log)
+{
+	int size;
+	int xhdrs;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	/*
+	 * When logbufs == 0, someone has disabled the log from the FSTAB
+	 * file.  This is not a documented feature.  We need to set xlog_debug
+	 * to zero (this deactivates the log) and set xlog_devt to the
+	 * appropriate dev_t.  Only one filesystem may be affected this way
+	 * since this is just a performance hack to test what we might be able
+	 * to get if the log were not present.
+	 */
+	if (mp->m_logbufs == 0) {
+		xlog_debug = 0;
+		xlog_devt = log->l_dev;
+		log->l_iclog_bufs = XLOG_NUM_ICLOGS;
+	} else
+#endif
+	{
+		/*
+		 * This is the normal path.  If m_logbufs == -1, then the
+		 * admin has chosen to use the system defaults for logbuffers.
+		 */
+		if (mp->m_logbufs == -1)
+			log->l_iclog_bufs = XLOG_NUM_ICLOGS;
+		else
+			log->l_iclog_bufs = mp->m_logbufs;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+		/* We are reactivating a filesystem after it was active */
+		if (log->l_dev == xlog_devt) {
+			xlog_devt = 1;
+			xlog_debug = 1;
+		}
+#endif
+	}
+
+	/*
+	 * Buffer size passed in from mount system call.
+	 */
+	if (mp->m_logbsize != -1) {
+		size = log->l_iclog_size = mp->m_logbsize;
+		log->l_iclog_size_log = 0;
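+		/* Compute log2 of the buffer size; this assumes the
+		 * mount-supplied size is a power of two. */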
+		while (size != 1) {
+			log->l_iclog_size_log++;
+			size >>= 1;
+		}
+
+		if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
+			/* # headers = size / 32K
+			 * one header holds cycles from 32K of data
+			 */
+
+			xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
+			if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
+				xhdrs++;
+			log->l_iclog_hsize = xhdrs << BBSHIFT;
+			log->l_iclog_heads = xhdrs;
+		} else {
+			ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
+			log->l_iclog_hsize = BBSIZE;
+			log->l_iclog_heads = 1;
+		}
+		return;
+	}
+
+	/*
+	 * Special case machines that have less than 32MB of memory.
+	 * All machines with more memory use 32KB buffers.
+	 */
+	if (xfs_physmem <= btoc(32*1024*1024)) {
+		/* Don't change; min configuration */
+		log->l_iclog_size = XLOG_RECORD_BSIZE;		/* 16k */
+		log->l_iclog_size_log = XLOG_RECORD_BSHIFT;
+	} else {
+		log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;	/* 32k */
+		log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
+	}
+
+	/* the default log size is 16k or 32k which is one header sector */
+	log->l_iclog_hsize = BBSIZE;
+	log->l_iclog_heads = 1;
+
+	/*
+	 * For 16KB block sizes, we use 3 32KB buffers.  For 32KB block sizes,
+	 * we use 4 32KB buffers.  For 64KB block sizes, we use 8 32KB buffers.
+	 */
+	if (mp->m_sb.sb_blocksize >= 16*1024) {
+		log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
+		log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
+		if (mp->m_logbufs == -1) {
+			switch (mp->m_sb.sb_blocksize) {
+			    case 16*1024:			/* 16 KB */
+				log->l_iclog_bufs = 3;
+				break;
+			    case 32*1024:			/* 32 KB */
+				log->l_iclog_bufs = 4;
+				break;
+			    case 64*1024:			/* 64 KB */
+				log->l_iclog_bufs = 8;
+				break;
+			    default:
+				xlog_panic("XFS: Illegal blocksize");
+				break;
+			}
+		}
+	}
+}	/* xlog_get_iclog_buffer_size */
+
+
+/*
+ * This routine initializes some of the log structure for a given mount point.
+ * Its primary purpose is to fill in enough of the structure so that
+ * recovery can occur; some other fields are filled in as well.
+ */
+STATIC xlog_t *
+xlog_alloc_log(xfs_mount_t	*mp,
+	       dev_t		log_dev,
+	       xfs_daddr_t	blk_offset,
+	       int		num_bblks)
+{
+	xlog_t			*log;
+	xlog_rec_header_t	*head;
+	xlog_in_core_t		**iclogp;
+	xlog_in_core_t		*iclog, *prev_iclog=NULL;
+	xfs_buf_t		*bp;
+	int			i;
+	int			iclogsize;
+
+	log = (void *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP);
+
+	log->l_mp	   = mp;
+	log->l_dev	   = log_dev;
+	log->l_logsize	   = BBTOB(num_bblks);
+	log->l_logBBstart  = blk_offset;
+	log->l_logBBsize   = num_bblks;
+	log->l_roundoff	   = 0;
+	log->l_covered_state = XLOG_STATE_COVER_IDLE;
+	log->l_flags	   |= XLOG_ACTIVE_RECOVERY;
+
+	log->l_prev_block  = -1;
+	ASSIGN_ANY_LSN(log->l_tail_lsn, 1, 0, ARCH_NOCONVERT);
+	/* log->l_tail_lsn    = 0x100000000LL; cycle = 1; current block = 0 */
+	log->l_last_sync_lsn = log->l_tail_lsn;
+	log->l_curr_cycle  = 1;	    /* 0 is bad since this is initial value */
+	log->l_curr_block  = 0;		/* filled in by xlog_recover */
+	log->l_grant_reserve_bytes = 0;
+	log->l_grant_reserve_cycle = 1;
+	log->l_grant_write_bytes = 0;
+	log->l_grant_write_cycle = 1;
+	log->l_quotaoffs_flag = 0;	/* XFS_LI_QUOTAOFF logitems */
+
+	xlog_get_iclog_buffer_size(mp, log);
+
+	/* Get my locked buffer; mp is needed for pagebuf/Linux only. */
+	bp = log->l_xbuf = XFS_getrbuf(0, mp);
+
+	XFS_BUF_SET_TARGET(bp, mp->m_logdev_targp);
+	XFS_BUF_SET_SIZE(bp, log->l_iclog_size);
+	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
+	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
+	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
+	ASSERT(XFS_BUF_ISBUSY(log->l_xbuf));
+	ASSERT(XFS_BUF_VALUSEMA(log->l_xbuf) <= 0);
+	spinlock_init(&log->l_icloglock, "iclog");
+	spinlock_init(&log->l_grant_lock, "grhead_iclog");
+	initnsema(&log->l_flushsema, 0, "ic-flush");
+	xlog_state_ticket_alloc(log);  /* wait until after icloglock inited */
+
+	/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
+	ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
+
+	iclogp = &log->l_iclog;
+	/*
+	 * The amount of memory to allocate for the iclog structure is
+	 * rather funky due to the way the structure is defined.  It is
+	 * done this way so that we can use different sizes for machines
+	 * with different amounts of memory.  See the definition of
+	 * xlog_in_core_t in xfs_log_priv.h for details.
+	 */
+	iclogsize = log->l_iclog_size;
+	ASSERT(log->l_iclog_size >= 4096);
+	for (i=0; i < log->l_iclog_bufs; i++) {
+		*iclogp = (xlog_in_core_t *)
+			  kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
+		iclog = *iclogp;
+		iclog->hic_data = (xlog_in_core_2_t *)
+			  kmem_alloc(iclogsize, KM_SLEEP);
+
+		iclog->ic_prev = prev_iclog;
+		prev_iclog = iclog;
+		log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
+
+		head = &iclog->ic_header;
+		memset(head, 0, sizeof(xlog_rec_header_t));
+		INT_SET(head->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
+		INT_SET(head->h_version, ARCH_CONVERT,
+			XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
+		INT_SET(head->h_size, ARCH_CONVERT, log->l_iclog_size);
+		/* new fields */
+		INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT);
+		memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+
+		/* My locked buffer; mp is needed for pagebuf/Linux only. */
+		bp = iclog->ic_bp = XFS_getrbuf(0, mp);
+		XFS_BUF_SET_TARGET(bp, mp->m_logdev_targp);
+		XFS_BUF_SET_SIZE(bp, log->l_iclog_size);
+		XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
+		XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
+		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
+
+		iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
+		iclog->ic_state = XLOG_STATE_ACTIVE;
+		iclog->ic_log = log;
+		iclog->ic_callback_tail = &(iclog->ic_callback);
+		iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
+
+		ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
+		ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
+		sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force");
+
+		iclogp = &iclog->ic_next;
+	}
+	*iclogp = log->l_iclog;			/* complete ring */
+	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
+
+	return log;
+}	/* xlog_alloc_log */
+
+
+/*
+ * Write out the commit record of a transaction associated with the given
+ * ticket.  Return the lsn of the commit record.
+ */
+STATIC int
+xlog_commit_record(xfs_mount_t	*mp,
+		   xlog_ticket_t *ticket,
+		   xfs_lsn_t	*commitlsnp)
+{
+	int		error;
+	xfs_log_iovec_t reg[1];
+
+	reg[0].i_addr = 0;
+	reg[0].i_len = 0;
+
+	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
+			       XLOG_COMMIT_TRANS))) {
+		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+	}
+	return (error);
+}	/* xlog_commit_record */
+
+
+/*
+ * Push on the buffer cache code if we ever use more than 75% of the on-disk
+ * log space.  This code pushes on the lsn which would supposedly free up
+ * the 25% which we want to leave free.	 We may need to adopt a policy which
+ * pushes on an lsn which is further along in the log once we reach the high
+ * water mark.	In this manner, we would be creating a low water mark.
+ */
+void
+xlog_grant_push_ail(xfs_mount_t *mp,
+		    int		need_bytes)
+{
+    xlog_t	*log = mp->m_log;	/* pointer to the log */
+    xfs_lsn_t	tail_lsn;		/* lsn of the log tail */
+    xfs_lsn_t	threshold_lsn = 0;	/* lsn we'd like to be at */
+    int		free_blocks;		/* free blocks left to write to */
+    int		free_bytes;		/* free bytes left to write to */
+    int		threshold_block;	/* block in lsn we'd like to be at */
+    int		threshold_cycle;	/* lsn cycle we'd like to be at */
+    int		free_threshold;
+    SPLDECL(s);
+
+    ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
+
+    s = GRANT_LOCK(log);
+    free_bytes = xlog_space_left(log,
+				 log->l_grant_reserve_cycle,
+				 log->l_grant_reserve_bytes);
+    tail_lsn = log->l_tail_lsn;
+    free_blocks = BTOBBT(free_bytes);
+
+    /*
+     * Set the threshold for the minimum number of free blocks in the
+     * log to the maximum of what the caller needs, one quarter of the
+     * log, and 256 blocks.
+     */
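+    /*
+     * For illustration (hypothetical numbers): in a 4096-block log the
+     * quarter-of-the-log term is 1024 blocks, so with a small need_bytes
+     * the threshold below works out to 1024 free blocks.
+     */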
+    free_threshold = BTOBB(need_bytes);
+    free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
+    free_threshold = MAX(free_threshold, 256);
+    if (free_blocks < free_threshold) {
+	threshold_block = BLOCK_LSN(tail_lsn, ARCH_NOCONVERT) + free_threshold;
+	threshold_cycle = CYCLE_LSN(tail_lsn, ARCH_NOCONVERT);
+	if (threshold_block >= log->l_logBBsize) {
+	    threshold_block -= log->l_logBBsize;
+	    threshold_cycle += 1;
+	}
+	ASSIGN_ANY_LSN(threshold_lsn, threshold_cycle,
+		       threshold_block, ARCH_NOCONVERT);
+
+	/* Don't pass in an lsn greater than the lsn of the last
+	 * log record known to be on disk.
+	 */
+	if (XFS_LSN_CMP_ARCH(threshold_lsn, log->l_last_sync_lsn, ARCH_NOCONVERT) > 0)
+	    threshold_lsn = log->l_last_sync_lsn;
+    }
+    GRANT_UNLOCK(log, s);
+
+    /*
+     * Get the transaction layer to kick the dirty buffers out to
+     * disk asynchronously. No point in trying to do this if
+     * the filesystem is shutting down.
+     */
+    if (threshold_lsn &&
+	!XLOG_FORCED_SHUTDOWN(log))
+	    xfs_trans_push_ail(mp, threshold_lsn);
+}	/* xlog_grant_push_ail */
+
+
+/*
+ * Flush out the in-core log (iclog) to the on-disk log in a synchronous or
+ * asynchronous fashion.  Previously, we should have moved the current iclog
+ * ptr in the log to point to the next available iclog.  This allows further
+ * writes to continue while this code syncs out an iclog ready to go.
+ * Before an in-core log can be written out, the data section must be scanned
+ * to save away the 1st word of each BBSIZE block into the header.  We replace
+ * it with the current cycle count.  Each BBSIZE block is tagged with the
+ * cycle count because there is an implicit assumption that drives will
+ * guarantee that entire 512 byte blocks get written at once.  In other words,
+ * we can't have part of a 512 byte block written and part not written.  By
+ * tagging each block, we will know which blocks are valid when recovering
+ * after an unclean shutdown.
+ *
+ * This routine is single threaded on the iclog.  No other thread can be in
+ * this routine with the same iclog.  Changing the contents of the iclog can
+ * therefore be done without grabbing the state machine lock.  Updating the
+ * global log will require grabbing the lock, though.
+ *
+ * The entire log manager uses a logical block numbering scheme.  Only
+ * log_sync (and then only bwrite()) knows about the fact that the log may
+ * not start with block zero on a given device.  The log block start offset
+ * is added immediately before calling bwrite().
+ */
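+/*
+ * A sketch of the cycle tagging, with a hypothetical cycle number:
+ * suppose this iclog's lsn is in cycle 7.  xlog_pack_data() saves the
+ * first word of each 512 byte block into h_cycle_data in the header and
+ * writes 7 in its place.  On recovery, any block whose first word is not
+ * 7 was never written, so the valid portion of the record is known.
+ */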
+
+int
+xlog_sync(xlog_t		*log,
+	  xlog_in_core_t	*iclog,
+	  uint			flags)
+{
+	xfs_caddr_t	dptr;		/* pointer to byte sized element */
+	xfs_buf_t	*bp;
+	int		i;
+	uint		roundup;
+	uint		count;		/* byte count of bwrite */
+	int		split = 0;	/* split write into two regions */
+	int		error;
+
+	XFS_STATS_INC(xfsstats.xs_log_writes);
+	ASSERT(iclog->ic_refcnt == 0);
+
+#ifdef DEBUG
+	if (flags != 0 && (flags & XFS_LOG_SYNC) )
+		xlog_panic("xlog_sync: illegal flag");
+#endif
+
+	/* Round out the log write size */
+	if (iclog->ic_offset & BBMASK) {
+		/* A count of 0 is already accounted for up in
+		 * xlog_state_sync_all().  Once in this routine,
+		 * operations on the iclog are single threaded.
+		 *
+		 * Record the difference between the rounded-up size
+		 * and the actual size.
+		 */
+		count = iclog->ic_offset & BBMASK;
+		iclog->ic_roundoff += BBSIZE - count;
+	}
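+	/* For example (hypothetical numbers): an ic_offset of 700 bytes
+	 * has 700 & BBMASK == 188, so ic_roundoff grows by 512 - 188 = 324
+	 * and the write is padded out to a 1024 byte boundary.
+	 */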
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+		unsigned sunit = BTOBB(log->l_mp->m_sb.sb_logsunit);
+		if (!sunit)
+			sunit = 1;
+
+		count = BTOBB(log->l_iclog_hsize + iclog->ic_offset);
+		if (count & (sunit - 1)) {
+			roundup = sunit - (count & (sunit - 1));
+		} else {
+			roundup = 0;
+		}
+		iclog->ic_offset += BBTOB(roundup);
+	}
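+	/* For example (hypothetical numbers): with a stripe unit of 16
+	 * basic blocks and a header-plus-data count of 70 BBs, roundup is
+	 * 16 - (70 & 15) = 10 BBs, so the record covers 80 BBs in all.
+	 */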
+
+	log->l_roundoff += iclog->ic_roundoff;
+
+	xlog_pack_data(log, iclog);	  /* put cycle number in every block */
+	INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset);	/* real byte length */
+
+	bp	    = iclog->ic_bp;
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1);
+	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
+	XFS_BUF_SET_ADDR(bp, BLOCK_LSN(iclog->ic_header.h_lsn, ARCH_CONVERT));
+
+	/* Count is already rounded up to a BBSIZE above */
+	count = iclog->ic_offset + iclog->ic_roundoff;
+	ASSERT((count & BBMASK) == 0);
+
+	/* Add for LR header */
+	count += log->l_iclog_hsize;
+	XFS_STATS_ADD(xfsstats.xs_log_blocks, BTOBB(count));
+
+	/* Do we need to split this write into 2 parts? */
+	if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
+		split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
+		count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
+		iclog->ic_bwritecnt = 2;	/* split into 2 writes */
+	} else {
+		iclog->ic_bwritecnt = 1;
+	}
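+	/* For example (hypothetical numbers): with l_logBBsize == 1000 and
+	 * a write starting at BB 990 for 30 BBs, the first bwrite covers
+	 * BBs 990-999 and the remaining 20 BBs wrap around to BB 0 in the
+	 * second write below.
+	 */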
+	XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count);
+	XFS_BUF_SET_FSPRIVATE(bp, iclog);	/* save for later */
+	if (flags & XFS_LOG_SYNC) {
+		XFS_BUF_BUSY(bp);
+		XFS_BUF_HOLD(bp);
+	} else {
+		XFS_BUF_BUSY(bp);
+		XFS_BUF_ASYNC(bp);
+	}
+	/*
+	 * Do a disk write cache flush for the log block.
+	 * This is a bit of a sledgehammer, it would be better
+	 * to use a tag barrier here that just prevents reordering.
+	 * It may not be needed to flush the first split block in the log wrap
+	 * case, but do it anyway to be safe -AK
+	 */
+	if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
+		XFS_BUF_FLUSH(bp);
+
+	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
+	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
+
+	xlog_verify_iclog(log, iclog, count, B_TRUE);
+
+	/* account for log which doesn't start at block #0 */
+	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+	/*
+	 * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
+	 * is shutting down.
+	 */
+	XFS_BUF_WRITE(bp);
+
+	if ((error = XFS_bwrite(bp))) {
+		xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
+				  XFS_BUF_ADDR(bp));
+		return (error);
+	}
+	if (split) {
+		bp		= iclog->ic_log->l_xbuf;
+		ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) ==
+							(unsigned long)1);
+		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
+		XFS_BUF_SET_ADDR(bp, 0);	     /* logical 0 */
+		XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+
+					    (__psint_t)count), split);
+		XFS_BUF_SET_FSPRIVATE(bp, iclog);
+		XFS_BUF_BUSY(bp);
+		XFS_BUF_ASYNC(bp);
+		if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
+			XFS_BUF_FLUSH(bp);
+		dptr = XFS_BUF_PTR(bp);
+		/*
+		 * Bump the cycle numbers at the start of each block
+		 * since this part of the buffer is at the start of
+		 * a new cycle.	 Watch out for the header magic number
+		 * case, though.
+		 */
+		for (i=0; i<split; i += BBSIZE) {
+			INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1);
+			if (INT_GET(*(uint *)dptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
+				INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1);
+			dptr += BBSIZE;
+		}
+
+		ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
+		ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
+
+		/* account for internal log which doesn't start at block #0 */
+		XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+		XFS_BUF_WRITE(bp);
+		if ((error = XFS_bwrite(bp))) {
+			xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
+					  bp, XFS_BUF_ADDR(bp));
+			return (error);
+		}
+	}
+	return (0);
+}	/* xlog_sync */
+
+
+/*
+ * Deallocate a log structure.
+ */
+void
+xlog_unalloc_log(xlog_t *log)
+{
+	xlog_in_core_t	*iclog, *next_iclog;
+	xlog_ticket_t	*tic, *next_tic;
+	int		i;
+
+	iclog = log->l_iclog;
+	for (i=0; i<log->l_iclog_bufs; i++) {
+		sv_destroy(&iclog->ic_forcesema);
+		XFS_freerbuf(iclog->ic_bp);
+#ifdef DEBUG
+		if (iclog->ic_trace != NULL) {
+			ktrace_free(iclog->ic_trace);
+		}
+#endif
+		next_iclog = iclog->ic_next;
+		kmem_free(iclog->hic_data, log->l_iclog_size);
+		kmem_free(iclog, sizeof(xlog_in_core_t));
+		iclog = next_iclog;
+	}
+	freesema(&log->l_flushsema);
+	spinlock_destroy(&log->l_icloglock);
+	spinlock_destroy(&log->l_grant_lock);
+
+	/* XXXsup take a look at this again. */
+	if ((log->l_ticket_cnt != log->l_ticket_tcnt)  &&
+	    !XLOG_FORCED_SHUTDOWN(log)) {
+		xfs_fs_cmn_err(CE_WARN, log->l_mp,
+			"xlog_unalloc_log: (cnt: %d, total: %d)",
+			log->l_ticket_cnt, log->l_ticket_tcnt);
+		/* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */
+
+	} else {
+		tic = log->l_unmount_free;
+		while (tic) {
+			next_tic = tic->t_next;
+			kmem_free(tic, NBPP);
+			tic = next_tic;
+		}
+	}
+	XFS_freerbuf(log->l_xbuf);
+#ifdef DEBUG
+	if (log->l_trace != NULL) {
+		ktrace_free(log->l_trace);
+	}
+	if (log->l_grant_trace != NULL) {
+		ktrace_free(log->l_grant_trace);
+	}
+#endif
+	log->l_mp->m_log = NULL;
+	kmem_free(log, sizeof(xlog_t));
+}	/* xlog_unalloc_log */
+
+
+/*
+ * Write some region out to in-core log
+ *
+ * This will be called when writing externally provided regions or when
+ * writing out a commit record for a given transaction.
+ *
+ * General algorithm:
+ *	1. Find total length of this write.  This may include adding to the
+ *		lengths passed in.
+ *	2. Check whether we violate the ticket's reservation.
+ *	3. While writing to this iclog
+ *	    A. Reserve as much space in this iclog as we can get
+ *	    B. If this is the first write, save away start lsn
+ *	    C. While writing this region:
+ *		1. If first write of transaction, write start record
+ *		2. Write log operation header (header per region)
+ *		3. Find out if we can fit entire region into this iclog
+ *		4. Potentially, verify destination bcopy ptr
+ *		5. Bcopy (partial) region
+ *		6. If partial copy, release iclog; otherwise, continue
+ *			copying more regions into current iclog
+ *	4. Mark want sync bit (in simulation mode)
+ *	5. Release iclog for potential flush to on-disk log.
+ *
+ * ERRORS:
+ * 1.	Panic if reservation is overrun.  This should never happen since
+ *	reservation amounts are generated internal to the filesystem.
+ * NOTES:
+ * 1. Tickets are single threaded data structures.
+ * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
+ *	syncing routine.  When a single log_write region needs to span
+ *	multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
+ *	on all log operation writes which don't contain the end of the
+ *	region.	 The XLOG_END_TRANS bit is used for the in-core log
+ *	operation which contains the end of the continued log_write region.
+ * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
+ *	we don't really know exactly how much space will be used.  As a result,
+ *	we don't update ic_offset until the end when we know exactly how many
+ *	bytes have been written out.
+ */
+int
+xlog_write(xfs_mount_t *	mp,
+	   xfs_log_iovec_t	reg[],
+	   int			nentries,
+	   xfs_log_ticket_t	tic,
+	   xfs_lsn_t		*start_lsn,
+	   uint			flags)
+{
+    xlog_t	     *log    = mp->m_log;
+    xlog_ticket_t    *ticket = (xlog_ticket_t *)tic;
+    xlog_op_header_t *logop_head;    /* ptr to log operation header */
+    xlog_in_core_t   *iclog;	     /* ptr to current in-core log */
+    __psint_t	     ptr;	     /* copy address into data region */
+    int		     len;	     /* # xlog_write() bytes left to copy */
+    int		     index;	     /* region index currently copying */
+    int		     log_offset;     /* offset (from 0) into data region */
+    int		     start_rec_copy; /* # bytes to copy for start record */
+    int		     partial_copy;   /* did we split a region? */
+    int		     partial_copy_len;/* # bytes copied if split region */
+    int		     need_copy;	     /* # bytes needed to bcopy this region */
+    int		     copy_len;	     /* # bytes actually bcopy'ing */
+    int		     copy_off;	     /* # bytes from entry start */
+    int		     contwr;	     /* continued write of in-core log? */
+    int		     firstwr = 0;    /* first write of transaction */
+    int		     error;
+
+    partial_copy_len = partial_copy = 0;
+
+    /* Calculate potential maximum space.  Each region gets its own
+     * xlog_op_header_t and may need to be double word aligned.
+     */
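+    /* For example (hypothetical numbers): three regions of 100, 200 and
+     * 12 bytes on a freshly inited ticket need 312 bytes of data plus
+     * four op headers: one per region and one for the start record.
+     */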
+    len = 0;
+    if (ticket->t_flags & XLOG_TIC_INITED)     /* acct for start rec of xact */
+	len += sizeof(xlog_op_header_t);
+
+    for (index = 0; index < nentries; index++) {
+	len += sizeof(xlog_op_header_t);	    /* each region gets >= 1 */
+	len += reg[index].i_len;
+    }
+    contwr = *start_lsn = 0;
+
+    if (ticket->t_curr_res < len) {
+#ifdef DEBUG
+	xlog_panic(
+		"xfs_log_write: reservation ran out. Need to up reservation");
+#else
+	/* Customer configurable panic */
+	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
+		"xfs_log_write: reservation ran out. Need to up reservation");
+	/* If we did not panic, shutdown the filesystem */
+	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+#endif
+    } else
+	ticket->t_curr_res -= len;
+
+    for (index = 0; index < nentries; ) {
+	if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+					       &contwr, &log_offset)))
+		return (error);
+
+	ASSERT(log_offset <= iclog->ic_size - 1);
+	ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset);
+
+	/* start_lsn is the first lsn written to. That's all we need. */
+	if (! *start_lsn)
+	    *start_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
+
+	/* This loop writes out as many regions as can fit in the amount
+	 * of space which was allocated by xlog_state_get_iclog_space().
+	 */
+	while (index < nentries) {
+	    ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
+	    ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
+	    start_rec_copy = 0;
+
+	    /* If this is the first write for the transaction, insert a
+	     * start record.  We can't be trying to commit if we are still
+	     * inited, and we can't have any "partial_copy" either.
+	     */
+	    if (ticket->t_flags & XLOG_TIC_INITED) {
+		logop_head		= (xlog_op_header_t *)ptr;
+		INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
+		logop_head->oh_clientid = ticket->t_clientid;
+		INT_ZERO(logop_head->oh_len, ARCH_CONVERT);
+		logop_head->oh_flags	= XLOG_START_TRANS;
+		INT_ZERO(logop_head->oh_res2, ARCH_CONVERT);
+		ticket->t_flags		&= ~XLOG_TIC_INITED;	/* clear bit */
+		firstwr++;			  /* increment log ops below */
+
+		start_rec_copy = sizeof(xlog_op_header_t);
+		xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
+	    }
+
+	    /* Copy log operation header directly into data section */
+	    logop_head			= (xlog_op_header_t *)ptr;
+	    INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
+	    logop_head->oh_clientid	= ticket->t_clientid;
+	    INT_ZERO(logop_head->oh_res2, ARCH_CONVERT);
+
+	    /* header copied directly */
+	    xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t));
+
+	    /* are we copying a commit or unmount record? */
+	    logop_head->oh_flags = flags;
+
+	    /*
+	     * We've seen logs corrupted with bad transaction client
+	     * ids.  This makes sure that XFS doesn't generate them here.
+	     * If we see one, turn it into an EIO and shut down the filesystem.
+	     */
+	    switch (logop_head->oh_clientid)  {
+	    case XFS_TRANSACTION:
+	    case XFS_VOLUME:
+	    case XFS_LOG:
+		break;
+	    default:
+		xfs_fs_cmn_err(CE_WARN, mp,
+		    "Bad XFS transaction clientid 0x%x in ticket 0x%p",
+		    logop_head->oh_clientid, tic);
+		return XFS_ERROR(EIO);
+	    }
+
+	    /* Partial write last time? => (partial_copy != 0)
+	     * need_copy is the amount we'd like to copy if everything could
+	     * fit in the current bcopy.
+	     */
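+	    /* For example (hypothetical numbers): for a 5000 byte region
+	     * with 3000 bytes already copied, need_copy is 2000; if only
+	     * 1500 bytes remain in this iclog we copy 1500, mark the op
+	     * XLOG_CONTINUE_TRANS, and finish the region in the next iclog.
+	     */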
+	    need_copy = reg[index].i_len - partial_copy_len;
+
+	    copy_off = partial_copy_len;
+	    if (need_copy <= iclog->ic_size - log_offset) { /*complete write */
+		INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len = need_copy);
+		if (partial_copy)
+		    logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
+		partial_copy_len = partial_copy = 0;
+	    } else {					    /* partial write */
+		copy_len = iclog->ic_size - log_offset;
+		INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len);
+		logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
+		if (partial_copy)
+			logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
+		partial_copy_len += copy_len;
+		partial_copy++;
+		len += sizeof(xlog_op_header_t); /* from splitting of region */
+		/* account for new log op header */
+		ticket->t_curr_res -= sizeof(xlog_op_header_t);
+	    }
+	    xlog_verify_dest_ptr(log, ptr);
+
+	    /* copy region */
+	    ASSERT(copy_len >= 0);
+	    bcopy(reg[index].i_addr + copy_off, (xfs_caddr_t)ptr, copy_len);
+	    xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
+
+	    /* make copy_len total bytes copied, including headers */
+	    copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+	    xlog_state_finish_copy(log, iclog, firstwr, (contwr? copy_len : 0));
+	    firstwr = 0;
+	    if (partial_copy) {			/* copied partial region */
+		    /* already marked WANT_SYNC by xlog_state_get_iclog_space */
+		    if ((error = xlog_state_release_iclog(log, iclog)))
+			    return (error);
+		    break;			/* don't increment index */
+	    } else {				/* copied entire region */
+		index++;
+		partial_copy_len = partial_copy = 0;
+
+		if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
+		    xlog_state_want_sync(log, iclog);
+		    if ((error = xlog_state_release_iclog(log, iclog)))
+			   return (error);
+		    if (index == nentries)
+			    return 0;		/* we are done */
+		    else
+			    break;
+		}
+	    } /* if (partial_copy) */
+	} /* while (index < nentries) */
+    } /* for (index = 0; index < nentries; ) */
+    ASSERT(len == 0);
+
+    return (xlog_state_release_iclog(log, iclog));
+}	/* xlog_write */
+
+
+/*****************************************************************************
+ *
+ *		State Machine functions
+ *
+ *****************************************************************************
+ */
+
+/* Clean iclogs starting from the head.	 This ordering must be
+ * maintained, so an iclog doesn't become ACTIVE beyond one that
+ * is SYNCING.	This is also required to maintain the notion that we use
+ * a counting semaphore to hold off would-be writers to the log when every
+ * iclog is trying to sync to disk.
+ *
+ * State Change: DIRTY -> ACTIVE
+ */
+void
+xlog_state_clean_log(xlog_t *log)
+{
+	xlog_in_core_t	*iclog;
+	int changed = 0;
+
+	iclog = log->l_iclog;
+	do {
+		if (iclog->ic_state == XLOG_STATE_DIRTY) {
+			iclog->ic_state = XLOG_STATE_ACTIVE;
+			iclog->ic_offset       = 0;
+			iclog->ic_callback	= 0;   /* don't need to free */
+			/*
+			 * If the number of ops in this iclog indicates it just
+			 * contains the dummy transaction, we can
+			 * change state into IDLE (the second time around).
+			 * Otherwise we should change the state to NEED a dummy.
+			 * We don't need to cover the dummy itself.
+			 */
+			if (!changed &&
+			   (INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT) == XLOG_COVER_OPS)) {
+				changed = 1;
+			} else {
+				/* We have two dirty iclogs, so start over.
+				 * This could also mean the number of ops
+				 * indicates this is not the dummy going out.
+				 */
+				changed = 2;
+			}
+			INT_ZERO(iclog->ic_header.h_num_logops, ARCH_CONVERT);
+			bzero(iclog->ic_header.h_cycle_data,
+			      sizeof(iclog->ic_header.h_cycle_data));
+			INT_ZERO(iclog->ic_header.h_lsn, ARCH_CONVERT);
+		} else if (iclog->ic_state == XLOG_STATE_ACTIVE)
+			/* do nothing */;
+		else
+			break;	/* stop cleaning */
+		iclog = iclog->ic_next;
+	} while (iclog != log->l_iclog);
+
+	/* log is locked when we are called */
+	/*
+	 * Change state for the dummy log recording.
+	 * We usually go to NEED, but we go to NEED2 if the changed flag
+	 * indicates we are done writing the dummy record.
+	 * If we are done with the second dummy record (DONE2), then
+	 * we go to IDLE.
+	 */
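+	/*
+	 * A sketch of the covered-state transitions driven below:
+	 *   IDLE/NEED/NEED2 -> NEED   (the log was dirtied again)
+	 *   DONE  -> NEED2 if changed == 1, else NEED
+	 *   DONE2 -> IDLE  if changed == 1, else NEED
+	 */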
+	if (changed) {
+		switch (log->l_covered_state) {
+		case XLOG_STATE_COVER_IDLE:
+		case XLOG_STATE_COVER_NEED:
+		case XLOG_STATE_COVER_NEED2:
+			log->l_covered_state = XLOG_STATE_COVER_NEED;
+			break;
+
+		case XLOG_STATE_COVER_DONE:
+			if (changed == 1)
+				log->l_covered_state = XLOG_STATE_COVER_NEED2;
+			else
+				log->l_covered_state = XLOG_STATE_COVER_NEED;
+			break;
+
+		case XLOG_STATE_COVER_DONE2:
+			if (changed == 1)
+				log->l_covered_state = XLOG_STATE_COVER_IDLE;
+			else
+				log->l_covered_state = XLOG_STATE_COVER_NEED;
+			break;
+
+		default:
+			ASSERT(0);
+		}
+	}
+}	/* xlog_state_clean_log */
+
+STATIC xfs_lsn_t
+xlog_get_lowest_lsn(
+	xlog_t		*log)
+{
+	xlog_in_core_t	*lsn_log;
+	xfs_lsn_t	lowest_lsn, lsn;
+
+	lsn_log = log->l_iclog;
+	lowest_lsn = 0;
+	do {
+	    if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
+		lsn = INT_GET(lsn_log->ic_header.h_lsn, ARCH_CONVERT);
+		if ((lsn && !lowest_lsn) ||
+		    (XFS_LSN_CMP_ARCH(lsn, lowest_lsn, ARCH_NOCONVERT) < 0)) {
+			lowest_lsn = lsn;
+		}
+	    }
+	    lsn_log = lsn_log->ic_next;
+	} while (lsn_log != log->l_iclog);
+	return(lowest_lsn);
+}
+
+
+STATIC void
+xlog_state_do_callback(
+	xlog_t		*log,
+	int		aborted,
+	xlog_in_core_t	*ciclog)
+{
+	xlog_in_core_t	   *iclog;
+	xlog_in_core_t	   *first_iclog;	/* used to know when we've
+						 * processed all iclogs once */
+	xfs_log_callback_t *cb, *cb_next;
+	int		   flushcnt = 0;
+	xfs_lsn_t	   lowest_lsn;
+	int		   ioerrors;	/* counter: iclogs with errors */
+	int		   loopdidcallbacks; /* flag: inner loop did callbacks*/
+	int		   funcdidcallbacks; /* flag: function did callbacks */
+	int		   repeats;	/* for issuing console warnings if
+					 * looping too many times */
+	SPLDECL(s);
+
+	s = LOG_LOCK(log);
+	first_iclog = iclog = log->l_iclog;
+	ioerrors = 0;
+	funcdidcallbacks = 0;
+	repeats = 0;
+
+	do {
+		/*
+		 * Scan all iclogs starting with the one pointed to by the
+		 * log.	 Reset this starting point each time the log is
+		 * unlocked (during callbacks).
+		 *
+		 * Keep looping through iclogs until one full pass is made
+		 * without running any callbacks.
+		 */
+		first_iclog = log->l_iclog;
+		iclog = log->l_iclog;
+		loopdidcallbacks = 0;
+		repeats++;
+
+		do {
+
+			/* skip all iclogs in the ACTIVE & DIRTY states */
+			if (iclog->ic_state &
+			    (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
+				iclog = iclog->ic_next;
+				continue;
+			}
+
+			/*
+			 * Between marking a filesystem SHUTDOWN and stopping
+			 * the log, we do flush all iclogs to disk (if there
+			 * wasn't a log I/O error). So, we do want things to
+			 * go smoothly in case of just a SHUTDOWN w/o a
+			 * LOG_IO_ERROR.
+			 */
+			if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
+				/*
+				 * Can only perform callbacks in order.	 Since
+				 * this iclog is not in the DONE_SYNC/
+				 * DO_CALLBACK state, we skip the rest and
+				 * just try to clean up.  If we set our iclog
+				 * to DO_CALLBACK, we will not process it when
+				 * we retry since a previous iclog is in the
+				 * CALLBACK and the state cannot change since
+				 * we are holding the LOG_LOCK.
+				 */
+				if (!(iclog->ic_state &
+					(XLOG_STATE_DONE_SYNC |
+						 XLOG_STATE_DO_CALLBACK))) {
+					if (ciclog && (ciclog->ic_state ==
+							XLOG_STATE_DONE_SYNC)) {
+						ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
+					}
+					break;
+				}
+				/*
+				 * We now have an iclog that is in either the
+				 * DO_CALLBACK or DONE_SYNC states.  The other
+				 * states (WANT_SYNC, SYNCING, or CALLBACK)
+				 * were caught by the if above and are skipped
+				 * here, i.e. we aren't doing their callbacks.
+				 */
+
+				/*
+				 * We will do one more check here to see if we
+				 * have chased our tail around.
+				 */
+
+				lowest_lsn = xlog_get_lowest_lsn(log);
+				if (lowest_lsn && (
+					XFS_LSN_CMP_ARCH(
+						lowest_lsn,
+						INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT),
+						ARCH_NOCONVERT
+					)<0)) {
+					iclog = iclog->ic_next;
+					continue; /* Leave this iclog for
+						   * another thread */
+				}
+
+				iclog->ic_state = XLOG_STATE_CALLBACK;
+
+				LOG_UNLOCK(log, s);
+
+				/* l_last_sync_lsn field protected by
+				 * GRANT_LOCK. Don't worry about iclog's lsn.
+				 * No one else can be here except us.
+				 */
+				s = GRANT_LOCK(log);
+				ASSERT(XFS_LSN_CMP_ARCH(
+						log->l_last_sync_lsn,
+						INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT),
+						ARCH_NOCONVERT
+					)<=0);
+				log->l_last_sync_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
+				GRANT_UNLOCK(log, s);
+
+				/*
+				 * Keep processing entries in the callback list
+				 * until we come around and it is empty.  We
+				 * need to atomically see that the list is
+				 * empty and change the state to DIRTY so that
+				 * we don't miss any more callbacks being added.
+				 */
+				s = LOG_LOCK(log);
+			} else {
+				ioerrors++;
+			}
+			cb = iclog->ic_callback;
+
+			while (cb != 0) {
+				iclog->ic_callback_tail = &(iclog->ic_callback);
+				iclog->ic_callback = 0;
+				LOG_UNLOCK(log, s);
+
+				/* perform callbacks in the order given */
+				for (; cb != 0; cb = cb_next) {
+					cb_next = cb->cb_next;
+					cb->cb_func(cb->cb_arg, aborted);
+				}
+				s = LOG_LOCK(log);
+				cb = iclog->ic_callback;
+			}
+
+			loopdidcallbacks++;
+			funcdidcallbacks++;
+
+			ASSERT(iclog->ic_callback == 0);
+			if (!(iclog->ic_state & XLOG_STATE_IOERROR))
+				iclog->ic_state = XLOG_STATE_DIRTY;
+
+			/* wake up threads waiting in xfs_log_force() */
+			sv_broadcast(&iclog->ic_forcesema);
+
+			iclog = iclog->ic_next;
+		} while (first_iclog != iclog);
+		if (repeats && (repeats % 10) == 0) {
+			xfs_fs_cmn_err(CE_WARN, log->l_mp,
+				"xlog_state_do_callback: looping %d\n", repeats);
+		}
+	} while (!ioerrors && loopdidcallbacks);
+
+	/*
+	 * Make one last-gasp attempt to see if iclogs are being left in
+	 * limbo.
+	 */
+#ifdef DEBUG
+	if (funcdidcallbacks) {
+		first_iclog = iclog = log->l_iclog;
+		do {
+			ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
+			/*
+			 * Terminate the loop if iclogs are found in states
+			 * which will cause other threads to clean up iclogs.
+			 *
+			 * SYNCING - i/o completion will go through logs
+			 * DONE_SYNC - interrupt thread should be waiting for
+			 *		LOG_LOCK
+			 * IOERROR - give up hope all ye who enter here
+			 */
+			if (iclog->ic_state == XLOG_STATE_SYNCING ||
+			    iclog->ic_state == XLOG_STATE_DONE_SYNC ||
+			    iclog->ic_state == XLOG_STATE_IOERROR )
+				break;
+			iclog = iclog->ic_next;
+		} while (first_iclog != iclog);
+	}
+#endif
+
+	/*
+	 * Transition from DIRTY to ACTIVE if applicable. NOP if
+	 * STATE_IOERROR.
+	 */
+	xlog_state_clean_log(log);
+
+	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) {
+		flushcnt = log->l_flushcnt;
+		log->l_flushcnt = 0;
+	}
+	LOG_UNLOCK(log, s);
+	while (flushcnt--)
+		vsema(&log->l_flushsema);
+}	/* xlog_state_do_callback */
+
+
+/*
+ * Finish transitioning this iclog to the dirty state.
+ *
+ * Make sure that we completely execute this routine only when this is
+ * the last call to the iclog.	There is a good chance that iclog flushes,
+ * when we reach the end of the physical log, get turned into 2 separate
+ * calls to bwrite.  Hence, one iclog flush could generate two calls to this
+ * routine.  By using the reference count bwritecnt, we guarantee that only
+ * the second completion goes through.
+ *
+ * Callbacks could take time, so they are done outside the scope of the
+ * global state machine log lock.  Assume that the calls to cvsema won't
+ * take a long time.  At least we know they won't sleep.
+ */
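+/*
+ * For example: a split log write sets ic_bwritecnt to 2 in xlog_sync(),
+ * so the first I/O completion below just decrements the count and
+ * returns; only the second completion transitions the iclog to
+ * DONE_SYNC and runs the callbacks.
+ */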
+void
+xlog_state_done_syncing(
+	xlog_in_core_t	*iclog,
+	int		aborted)
+{
+	xlog_t		   *log = iclog->ic_log;
+	SPLDECL(s);
+
+	s = LOG_LOCK(log);
+
+	ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
+	       iclog->ic_state == XLOG_STATE_IOERROR);
+	ASSERT(iclog->ic_refcnt == 0);
+	ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
+
+	/*
+	 * If we got an error, either on the first buffer, or in the case of
+	 * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
+	 * and none should ever be attempted to be written to disk
+	 * again.
+	 */
+	if (iclog->ic_state != XLOG_STATE_IOERROR) {
+		if (--iclog->ic_bwritecnt == 1) {
+			LOG_UNLOCK(log, s);
+			return;
+		}
+		iclog->ic_state = XLOG_STATE_DONE_SYNC;
+	}
+
+	/*
+	 * Someone could be sleeping on the next iclog even though it is
+	 * in the ACTIVE state.	 We kick off one thread to force the
+	 * iclog buffer out.
+	 */
+	if (iclog->ic_next->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
+		sv_signal(&iclog->ic_next->ic_forcesema);
+	LOG_UNLOCK(log, s);
+	xlog_state_do_callback(log, aborted, iclog);	/* also cleans log */
+}	/* xlog_state_done_syncing */
+
+
+/*
+ * Update counters atomically now that bcopy is done.
+ */
+/* ARGSUSED */
+static inline void
+xlog_state_finish_copy(xlog_t		*log,
+		       xlog_in_core_t	*iclog,
+		       int		first_write,
+		       int		copy_bytes)
+{
+	SPLDECL(s);
+
+	s = LOG_LOCK(log);
+
+	if (first_write)
+		INT_MOD(iclog->ic_header.h_num_logops, ARCH_CONVERT, +1);
+	INT_MOD(iclog->ic_header.h_num_logops, ARCH_CONVERT, +1);
+	iclog->ic_offset += copy_bytes;
+
+	LOG_UNLOCK(log, s);
+}	/* xlog_state_finish_copy */
+
+
+
+/*
+ * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
+ * sleep.  The flush semaphore is set to the number of in-core buffers and
+ * decremented around disk syncing.  Therefore, if all buffers are syncing,
+ * this semaphore will cause new writes to sleep until a sync completes.
+ * Otherwise, this code just does p() followed by v().	This approximates
+ * a sleep/wakeup except we can't race.
+ *
+ * The in-core logs are used in a circular fashion. They are not used
+ * out-of-order even when an iclog past the head is free.
+ *
+ * return:
+ *	* log_offset where xlog_write() can start writing into the in-core
+ *		log's data space.
+ *	* in-core log pointer to which xlog_write() should write.
+ *	* boolean indicating this is a continued write to an in-core log.
+ *		If this is the last write, then the in-core log's offset field
+ *		needs to be incremented, depending on the amount of data which
+ *		is copied.
+ */
+int
+xlog_state_get_iclog_space(xlog_t	  *log,
+			   int		  len,
+			   xlog_in_core_t **iclogp,
+			   xlog_ticket_t  *ticket,
+			   int		  *continued_write,
+			   int		  *logoffsetp)
+{
+	SPLDECL(s);
+	int		  log_offset;
+	xlog_rec_header_t *head;
+	xlog_in_core_t	  *iclog;
+	int		  error;
+
+	xlog_state_do_callback(log, 0, NULL);	/* also cleans log */
+
+restart:
+	s = LOG_LOCK(log);
+	if (XLOG_FORCED_SHUTDOWN(log)) {
+		LOG_UNLOCK(log, s);
+		return XFS_ERROR(EIO);
+	}
+
+	iclog = log->l_iclog;
+	if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) {
+		log->l_flushcnt++;
+		LOG_UNLOCK(log, s);
+		xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH);
+		XFS_STATS_INC(xfsstats.xs_log_noiclogs);
+		/* Ensure that log writes happen */
+		psema(&log->l_flushsema, PINOD);
+		goto restart;
+	}
+	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+	head = &iclog->ic_header;
+
+	iclog->ic_refcnt++;			/* prevents sync */
+	log_offset = iclog->ic_offset;
+
+	/* On the 1st write to an iclog, figure out lsn.  This works
+	 * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
+	 * committing to.  If the offset is set, that's how many blocks
+	 * must be written.
+	 */
+	if (log_offset == 0) {
+		ticket->t_curr_res -= log->l_iclog_hsize;
+		INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle);
+		ASSIGN_LSN(head->h_lsn, log, ARCH_CONVERT);
+		ASSERT(log->l_curr_block >= 0);
+
+		/* round off error from last write with this iclog */
+		ticket->t_curr_res -= iclog->ic_roundoff;
+		log->l_roundoff -= iclog->ic_roundoff;
+		iclog->ic_roundoff = 0;
+	}
+
+	/* If there is enough room to write everything, then do it.  Otherwise,
+	 * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
+	 * bit is on, so this will get flushed out.  Don't update ic_offset
+	 * until you know exactly how many bytes get copied.  Therefore, wait
+	 * until later to update ic_offset.
+	 *
+	 * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
+	 * can fit into remaining data section.
+	 */
+	if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
+		xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
+
+		/* If I'm the only one writing to this iclog, sync it to disk */
+		if (iclog->ic_refcnt == 1) {
+			LOG_UNLOCK(log, s);
+			if ((error = xlog_state_release_iclog(log, iclog)))
+				return (error);
+		} else {
+			iclog->ic_refcnt--;
+			LOG_UNLOCK(log, s);
+		}
+		goto restart;
+	}
+
+	/* Do we have enough room to write the full amount in the remainder
+	 * of this iclog?  Or must we continue a write on the next iclog and
+	 * mark this iclog as completely taken?	 In the case where we switch
+	 * iclogs (to mark it taken), this particular iclog will release/sync
+	 * to disk in xlog_write().
+	 */
+	if (len <= iclog->ic_size - iclog->ic_offset) {
+		*continued_write = 0;
+		iclog->ic_offset += len;
+	} else {
+		*continued_write = 1;
+		xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
+	}
+	*iclogp = iclog;
+
+	ASSERT(iclog->ic_offset <= iclog->ic_size);
+	LOG_UNLOCK(log, s);
+
+	*logoffsetp = log_offset;
+	return 0;
+}	/* xlog_state_get_iclog_space */
+
+/*
+ * Atomically get the log space required for a log ticket.
+ *
+ * Once a ticket gets put onto the reserveq, it will only return after
+ * the needed reservation is satisfied.
+ */
+STATIC int
+xlog_grant_log_space(xlog_t	   *log,
+		     xlog_ticket_t *tic)
+{
+	int		 free_bytes;
+	int		 need_bytes;
+	SPLDECL(s);
+#ifdef DEBUG
+	xfs_lsn_t	 tail_lsn;
+#endif
+
+#ifdef DEBUG
+	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+		panic("grant Recovery problem");
+#endif
+
+	/* Is there space or do we need to sleep? */
+	s = GRANT_LOCK(log);
+	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter");
+
+	/* something is already sleeping; insert new transaction at end */
+	if (log->l_reserve_headq) {
+		XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
+		xlog_trace_loggrant(log, tic,
+				    "xlog_grant_log_space: sleep 1");
+		/*
+		 * Gotta check this before going to sleep, while we're
+		 * holding the grant lock.
+		 */
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto error_return;
+
+		XFS_STATS_INC(xfsstats.xs_sleep_logspace);
+		sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+		/*
+		 * If we got an error, and the filesystem is shutting down,
+		 * we'll catch it down below. So just continue...
+		 */
+		xlog_trace_loggrant(log, tic,
+				    "xlog_grant_log_space: wake 1");
+		s = GRANT_LOCK(log);
+	}
+	if (tic->t_flags & XFS_LOG_PERM_RESERV)
+		need_bytes = tic->t_unit_res*tic->t_ocnt;
+	else
+		need_bytes = tic->t_unit_res;
+
+redo:
+	if (XLOG_FORCED_SHUTDOWN(log))
+		goto error_return;
+
+	free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle,
+				     log->l_grant_reserve_bytes);
+	if (free_bytes < need_bytes) {
+		if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
+			XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
+		xlog_trace_loggrant(log, tic,
+				    "xlog_grant_log_space: sleep 2");
+		XFS_STATS_INC(xfsstats.xs_sleep_logspace);
+		sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+
+		if (XLOG_FORCED_SHUTDOWN(log)) {
+			s = GRANT_LOCK(log);
+			goto error_return;
+		}
+
+		xlog_trace_loggrant(log, tic,
+				    "xlog_grant_log_space: wake 2");
+		xlog_grant_push_ail(log->l_mp, need_bytes);
+		s = GRANT_LOCK(log);
+		goto redo;
+	} else if (tic->t_flags & XLOG_TIC_IN_Q)
+		XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
+
+	/* we've got enough space */
+	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w');
+	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'r');
+#ifdef DEBUG
+	tail_lsn = log->l_tail_lsn;
+	/*
+	 * Check to make sure the grant write head didn't just overlap the
+	 * tail.  If the cycles are the same, we can't be overlapping.
+	 * Otherwise, make sure that the cycles differ by exactly one and
+	 * check the byte count.
+	 */
+	if (CYCLE_LSN(tail_lsn, ARCH_NOCONVERT) != log->l_grant_write_cycle) {
+		ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn, ARCH_NOCONVERT));
+		ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn, ARCH_NOCONVERT)));
+	}
+#endif
+	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit");
+	xlog_verify_grant_head(log, 1);
+	GRANT_UNLOCK(log, s);
+	return 0;
+
+ error_return:
+	if (tic->t_flags & XLOG_TIC_IN_Q)
+		XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
+	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret");
+	/*
+	 * If we are failing, make sure the ticket doesn't have any
+	 * current reservations. We don't want to add this back when
+	 * the ticket/transaction gets cancelled.
+	 */
+	tic->t_curr_res = 0;
+	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+	GRANT_UNLOCK(log, s);
+	return XFS_ERROR(EIO);
+}	/* xlog_grant_log_space */
+
+
+/*
+ * Replenish the byte reservation required by moving the grant write head.
+ */
+STATIC int
+xlog_regrant_write_log_space(xlog_t	   *log,
+			     xlog_ticket_t *tic)
+{
+	SPLDECL(s);
+	int		free_bytes, need_bytes;
+	xlog_ticket_t	*ntic;
+#ifdef DEBUG
+	xfs_lsn_t	tail_lsn;
+#endif
+
+	tic->t_curr_res = tic->t_unit_res;
+
+	if (tic->t_cnt > 0)
+		return (0);
+
+#ifdef DEBUG
+	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+		panic("regrant Recovery problem");
+#endif
+
+	s = GRANT_LOCK(log);
+	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter");
+
+	if (XLOG_FORCED_SHUTDOWN(log))
+		goto error_return;
+
+	/* If there are other waiters on the queue then give them a
+	 * chance at logspace before us.  Wake up the first waiters;
+	 * if we do not wake up all the waiters then go to sleep waiting
+	 * for more free space, otherwise try to get some space for
+	 * this transaction.
+	 */
+
+	if ((ntic = log->l_write_headq)) {
+		free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
+					     log->l_grant_write_bytes);
+		do {
+			ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
+
+			if (free_bytes < ntic->t_unit_res)
+				break;
+			free_bytes -= ntic->t_unit_res;
+			sv_signal(&ntic->t_sema);
+			ntic = ntic->t_next;
+		} while (ntic != log->l_write_headq);
+
+		if (ntic != log->l_write_headq) {
+			if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
+				XLOG_INS_TICKETQ(log->l_write_headq, tic);
+
+			xlog_trace_loggrant(log, tic,
+				    "xlog_regrant_write_log_space: sleep 1");
+			XFS_STATS_INC(xfsstats.xs_sleep_logspace);
+			sv_wait(&tic->t_sema, PINOD|PLTWAIT,
+				&log->l_grant_lock, s);
+
+			/* If we're shutting down, this tic is already
+			 * off the queue */
+			if (XLOG_FORCED_SHUTDOWN(log)) {
+				s = GRANT_LOCK(log);
+				goto error_return;
+			}
+
+			xlog_trace_loggrant(log, tic,
+				    "xlog_regrant_write_log_space: wake 1");
+			xlog_grant_push_ail(log->l_mp, tic->t_unit_res);
+			s = GRANT_LOCK(log);
+		}
+	}
+
+	need_bytes = tic->t_unit_res;
+
+redo:
+	if (XLOG_FORCED_SHUTDOWN(log))
+		goto error_return;
+
+	free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
+				     log->l_grant_write_bytes);
+	if (free_bytes < need_bytes) {
+		if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
+			XLOG_INS_TICKETQ(log->l_write_headq, tic);
+		XFS_STATS_INC(xfsstats.xs_sleep_logspace);
+		sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+
+		/* If we're shutting down, this tic is already off the queue */
+		if (XLOG_FORCED_SHUTDOWN(log)) {
+			s = GRANT_LOCK(log);
+			goto error_return;
+		}
+
+		xlog_trace_loggrant(log, tic,
+				    "xlog_regrant_write_log_space: wake 2");
+		xlog_grant_push_ail(log->l_mp, need_bytes);
+		s = GRANT_LOCK(log);
+		goto redo;
+	} else if (tic->t_flags & XLOG_TIC_IN_Q)
+		XLOG_DEL_TICKETQ(log->l_write_headq, tic);
+
+	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); /* we've got enough space */
+#ifdef DEBUG
+	tail_lsn = log->l_tail_lsn;
+	if (CYCLE_LSN(tail_lsn, ARCH_NOCONVERT) != log->l_grant_write_cycle) {
+		ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn, ARCH_NOCONVERT));
+		ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn, ARCH_NOCONVERT)));
+	}
+#endif
+
+	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit");
+	xlog_verify_grant_head(log, 1);
+	GRANT_UNLOCK(log, s);
+	return (0);
+
+ error_return:
+	if (tic->t_flags & XLOG_TIC_IN_Q)
+		XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
+	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret");
+	/*
+	 * If we are failing, make sure the ticket doesn't have any
+	 * current reservations. We don't want to add this back when
+	 * the ticket/transaction gets cancelled.
+	 */
+	tic->t_curr_res = 0;
+	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+	GRANT_UNLOCK(log, s);
+	return XFS_ERROR(EIO);
+}	/* xlog_regrant_write_log_space */
+
+
+/* The first cnt-1 times through here we don't need to
+ * move the grant write head because the permanent
+ * reservation has reserved cnt times the unit amount.
+ * Release part of current permanent unit reservation and
+ * reset current reservation to be one unit's worth.  Also
+ * move grant reservation head forward.
+ */
+STATIC void
+xlog_regrant_reserve_log_space(xlog_t	     *log,
+			       xlog_ticket_t *ticket)
+{
+	SPLDECL(s);
+
+	xlog_trace_loggrant(log, ticket,
+			    "xlog_regrant_reserve_log_space: enter");
+	if (ticket->t_cnt > 0)
+		ticket->t_cnt--;
+
+	s = GRANT_LOCK(log);
+	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
+	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
+	ticket->t_curr_res = ticket->t_unit_res;
+	xlog_trace_loggrant(log, ticket,
+			    "xlog_regrant_reserve_log_space: sub current res");
+	xlog_verify_grant_head(log, 1);
+
+	/* just return if we still have some of the pre-reserved space */
+	if (ticket->t_cnt > 0) {
+		GRANT_UNLOCK(log, s);
+		return;
+	}
+
+	XLOG_GRANT_ADD_SPACE(log, ticket->t_unit_res, 'r');
+	xlog_trace_loggrant(log, ticket,
+			    "xlog_regrant_reserve_log_space: exit");
+	xlog_verify_grant_head(log, 0);
+	GRANT_UNLOCK(log, s);
+	ticket->t_curr_res = ticket->t_unit_res;
+}	/* xlog_regrant_reserve_log_space */
+
+
+/*
+ * Give back the space left from a reservation.
+ *
+ * All the information we need to make a correct determination of space left
+ * is present.	For non-permanent reservations, things are quite easy.	The
+ * count should have been decremented to zero.	We only need to deal with the
+ * space remaining in the current reservation part of the ticket.  If the
+ * ticket contains a permanent reservation, there may be left over space which
+ * needs to be released.  A count of N means that N-1 refills of the current
+ * reservation can be done before we need to ask for more space.  The first
+ * one goes to fill up the first current reservation.  Once we run out of
+ * space, the count will stay at zero and the only space remaining will be
+ * in the current reservation field.
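+ *
+ * Worked example (illustrative): ungranting a permanent ticket with
+ * t_cnt == 3, t_unit_res == U and t_curr_res == C releases C + 2*U in
+ * total: t_cnt is first decremented to 2, C is subtracted from both
+ * grant heads, and the remaining 2 units are released in one step.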
+ */
+STATIC void
+xlog_ungrant_log_space(xlog_t	     *log,
+		       xlog_ticket_t *ticket)
+{
+	SPLDECL(s);
+
+	if (ticket->t_cnt > 0)
+		ticket->t_cnt--;
+
+	s = GRANT_LOCK(log);
+	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter");
+
+	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
+	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
+
+	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current");
+
+	/* If this is a permanent reservation ticket, we may be able to free
+	 * up more space based on the remaining count.
+	 */
+	if (ticket->t_cnt > 0) {
+		ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
+		XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'w');
+		XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'r');
+	}
+
+	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit");
+	xlog_verify_grant_head(log, 1);
+	GRANT_UNLOCK(log, s);
+	xfs_log_move_tail(log->l_mp, 1);
+}	/* xlog_ungrant_log_space */
+
+
+/*
+ * If the lsn is not found or the iclog with the lsn is in the callback
+ * state, we need to call the function directly.  This is done outside
+ * this function's scope.  Otherwise, we insert the callback at the end
+ * of the iclog's callback list.
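+ *
+ * A caller that sees a return value of 1 is therefore expected to
+ * invoke cb->cb_func(cb->cb_arg, abortflg) itself.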
+ */
+int
+xlog_state_lsn_is_synced(xlog_t		    *log,
+			 xfs_lsn_t	    lsn,
+			 xfs_log_callback_t *cb,
+			 int		    *abortflg)
+{
+	xlog_in_core_t *iclog;
+	SPLDECL(s);
+	int	      lsn_is_synced = 1;
+
+	*abortflg = 0;
+	s = LOG_LOCK(log);
+
+	iclog = log->l_iclog;
+	do {
+		if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) {
+			iclog = iclog->ic_next;
+			continue;
+		} else {
+			if (iclog->ic_state & XLOG_STATE_DIRTY) /* call it*/
+				break;
+
+			if (iclog->ic_state & XLOG_STATE_IOERROR) {
+				*abortflg = XFS_LI_ABORTED;
+				break;
+			}
+			/* insert callback onto end of list */
+			cb->cb_next = 0;
+			*(iclog->ic_callback_tail) = cb;
+			iclog->ic_callback_tail = &(cb->cb_next);
+			lsn_is_synced = 0;
+			break;
+		}
+	} while (iclog != log->l_iclog);
+
+	LOG_UNLOCK(log, s);
+	return lsn_is_synced;
+}	/* xlog_state_lsn_is_synced */
+
+
+/*
+ * Atomically put back used ticket.
+ */
+void
+xlog_state_put_ticket(xlog_t	    *log,
+		      xlog_ticket_t *tic)
+{
+	unsigned long s;
+
+	s = LOG_LOCK(log);
+	xlog_ticket_put(log, tic);
+	LOG_UNLOCK(log, s);
+}	/* xlog_state_put_ticket */
+
+void xlog_sync_sched(
+	void	*v)
+{
+	xlog_in_core_t	*iclog = (xlog_in_core_t *)v;
+	xlog_t		*log  = iclog->ic_log;
+
+	xlog_sync(log, iclog, 0);
+}
+
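+/*
+ * xlog_mode selects how a ready iclog is pushed out in
+ * xlog_state_release_iclog(): 0 calls xlog_sync() directly, while 1
+ * defers the write to the pagebuf task queue via xlog_sync_sched().
+ */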
+int xlog_mode = 1;
+
+/*
+ * Flush iclog to disk if this is the last reference to the given iclog and
+ * the WANT_SYNC bit is set.
+ *
+ * When this function is entered, the iclog is not necessarily in the
+ * WANT_SYNC state.  It may be sitting around waiting to get filled.
+ */
+int
+xlog_state_release_iclog(xlog_t		*log,
+			 xlog_in_core_t *iclog)
+{
+	SPLDECL(s);
+	int		sync = 0;	/* do we sync? */
+
+	xlog_assign_tail_lsn(log->l_mp, 0);
+
+	s = LOG_LOCK(log);
+
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		LOG_UNLOCK(log, s);
+		return XFS_ERROR(EIO);
+	}
+
+	ASSERT(iclog->ic_refcnt > 0);
+	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
+	       iclog->ic_state == XLOG_STATE_WANT_SYNC);
+
+	if (--iclog->ic_refcnt == 0 &&
+	    iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+		sync++;
+		iclog->ic_state = XLOG_STATE_SYNCING;
+		INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, log->l_tail_lsn);
+		xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
+		/* cycle incremented when incrementing curr_block */
+	}
+
+	LOG_UNLOCK(log, s);
+
+	/*
+	 * We let the log lock go, so it's possible that we hit a log I/O
+	 * error or some other SHUTDOWN condition that marks the iclog
+	 * as XLOG_STATE_IOERROR before the bwrite. However, we know that
+	 * this iclog has consistent data, so we ignore IOERROR
+	 * flags after this point.
+	 */
+	if (sync) {
+		INIT_TQUEUE(&iclog->ic_write_sched,
+			xlog_sync_sched, (void *) iclog);
+		switch (xlog_mode) {
+		case 0:
+			return xlog_sync(log, iclog, 0);
+		case 1:
+			pagebuf_queue_task(&iclog->ic_write_sched);
+		}
+	}
+	return (0);
+
+}	/* xlog_state_release_iclog */
+
+
+/*
+ * This routine will mark the current iclog in the ring as WANT_SYNC
+ * and move the current iclog pointer to the next iclog in the ring.
+ * When this routine is called from xlog_state_get_iclog_space(), the
+ * exact size of the iclog has not yet been determined.  All we know is
+ * that we have run out of space in this log record.
+ */
+STATIC void
+xlog_state_switch_iclogs(xlog_t		*log,
+			 xlog_in_core_t *iclog,
+			 int		eventual_size)
+{
+	uint roundup;
+
+	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+	if (!eventual_size)
+		eventual_size = iclog->ic_offset;
+	iclog->ic_state = XLOG_STATE_WANT_SYNC;
+	INT_SET(iclog->ic_header.h_prev_block, ARCH_CONVERT, log->l_prev_block);
+	log->l_prev_block = log->l_curr_block;
+	log->l_prev_cycle = log->l_curr_cycle;
+
+	/* roll log?: ic_offset changed later */
+	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
+
+	/* Round up to next log-sunit */
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+		if (log->l_curr_block & (log->l_mp->m_lstripemask - 1)) {
+			roundup = log->l_mp->m_lstripemask -
+				(log->l_curr_block &
+				 (log->l_mp->m_lstripemask - 1));
+		} else {
+			roundup = 0;
+		}
+		log->l_curr_block += roundup;
+	}
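+	/*
+	 * e.g. (illustrative numbers): with m_lstripemask == 8, a
+	 * curr_block of 13 is rounded up by 3 to the next stripe
+	 * boundary at block 16.
+	 */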
+
+	if (log->l_curr_block >= log->l_logBBsize) {
+		log->l_curr_cycle++;
+		if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
+			log->l_curr_cycle++;
+		log->l_curr_block -= log->l_logBBsize;
+		ASSERT(log->l_curr_block >= 0);
+	}
+	ASSERT(iclog == log->l_iclog);
+	log->l_iclog = iclog->ic_next;
+}	/* xlog_state_switch_iclogs */
+
+
+/*
+ * Write out all data in the in-core log as of this exact moment in time.
+ *
+ * Data may be written to the in-core log during this call.  However,
+ * we don't guarantee this data will be written out.  A change from past
+ * implementation means this routine will *not* write out zero length LRs.
+ *
+ * Basically, we try to perform an intelligent scan of the in-core logs.
+ * If we determine there is no flushable data, we just return.	There is no
+ * flushable data if:
+ *
+ *	1. the current iclog is active and has no data; the previous iclog
+ *		is in the active or dirty state.
+ *	2. the current iclog is dirty, and the previous iclog is in the
+ *		active or dirty state.
+ *
+ * We may sleep (call psema) if:
+ *
+ *	1. the current iclog is in neither the active nor the dirty state.
+ *	2. the current iclog is dirty, and the previous iclog is in
+ *		neither the active nor the dirty state.
+ *	3. the current iclog is active, and there is another thread writing
+ *		to this particular iclog.
+ *	4. a) the current iclog is active and has no other writers
+ *	   b) when we return from flushing out this iclog, it is still
+ *		in neither the active nor the dirty state.
+ */
+STATIC int
+xlog_state_sync_all(xlog_t *log, uint flags)
+{
+	xlog_in_core_t	*iclog;
+	xfs_lsn_t	lsn;
+	SPLDECL(s);
+
+	s = LOG_LOCK(log);
+
+	iclog = log->l_iclog;
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		LOG_UNLOCK(log, s);
+		return XFS_ERROR(EIO);
+	}
+
+	/* If the head iclog is neither active nor dirty, we just attach
+	 * ourselves to the head and go to sleep.
+	 */
+	if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+	    iclog->ic_state == XLOG_STATE_DIRTY) {
+		/*
+		 * If the head is dirty or (active and empty), then
+		 * we need to look at the previous iclog.  If the previous
+		 * iclog is active or dirty we are done.  There is nothing
+		 * to sync out.	 Otherwise, we attach ourselves to the
+		 * previous iclog and go to sleep.
+		 */
+		if (iclog->ic_state == XLOG_STATE_DIRTY ||
+		    (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) {
+			iclog = iclog->ic_prev;
+			if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+			    iclog->ic_state == XLOG_STATE_DIRTY)
+				goto no_sleep;
+			else
+				goto maybe_sleep;
+		} else {
+			if (iclog->ic_refcnt == 0) {
+				/* We are the only one with access to this
+				 * iclog.  Flush it out now.  There should
+				 * be a roundoff of zero to show that someone
+				 * has already taken care of the roundoff from
+				 * the previous sync.
+				 */
+				ASSERT(iclog->ic_roundoff == 0);
+				iclog->ic_refcnt++;
+				lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
+				xlog_state_switch_iclogs(log, iclog, 0);
+				LOG_UNLOCK(log, s);
+
+				if (xlog_state_release_iclog(log, iclog))
+					return XFS_ERROR(EIO);
+				s = LOG_LOCK(log);
+				if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn &&
+				    iclog->ic_state != XLOG_STATE_DIRTY)
+					goto maybe_sleep;
+				else
+					goto no_sleep;
+			} else {
+				/* Someone else is writing to this iclog.
+				 * Use its call to flush out the data.	However,
+				 * the other thread may not force out this LR,
+				 * so we mark it WANT_SYNC.
+				 */
+				xlog_state_switch_iclogs(log, iclog, 0);
+				goto maybe_sleep;
+			}
+		}
+	}
+
+	/* By the time we come around again, the iclog could've been filled
+	 * which would give it another lsn.  If we have a new lsn, just
+	 * return because the relevant data has been flushed.
+	 */
+maybe_sleep:
+	if (flags & XFS_LOG_SYNC) {
+		/*
+		 * We must check if we're shutting down here, before
+		 * we wait, while we're holding the LOG_LOCK.
+		 * Then we check again after waking up, in case our
+		 * sleep was disturbed by bad news.
+		 */
+		if (iclog->ic_state & XLOG_STATE_IOERROR) {
+			LOG_UNLOCK(log, s);
+			return XFS_ERROR(EIO);
+		}
+		XFS_STATS_INC(xfsstats.xs_log_force_sleep);
+		sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s);
+		/*
+		 * No need to grab the log lock here since we're
+		 * only deciding whether or not to return EIO
+		 * and the memory read should be atomic.
+		 */
+		if (iclog->ic_state & XLOG_STATE_IOERROR)
+			return XFS_ERROR(EIO);
+
+	} else {
+
+no_sleep:
+		LOG_UNLOCK(log, s);
+	}
+	return 0;
+}	/* xlog_state_sync_all */
+
+
+/*
+ * Used by code which implements synchronous log forces.
+ *
+ * Find in-core log with lsn.
+ *	If it is in the DIRTY state, just return.
+ *	If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
+ *		state and go to sleep or return.
+ *	If it is in any other state, go to sleep or return.
+ *
+ * If filesystem activity goes to zero, the iclog will get flushed only by
+ * bdflush().
+ */
+int
+xlog_state_sync(xlog_t	  *log,
+		xfs_lsn_t lsn,
+		uint	  flags)
+{
+    xlog_in_core_t	*iclog;
+    int			already_slept = 0;
+    SPLDECL(s);
+
+
+try_again:
+    s = LOG_LOCK(log);
+    iclog = log->l_iclog;
+
+    if (iclog->ic_state & XLOG_STATE_IOERROR) {
+	    LOG_UNLOCK(log, s);
+	    return XFS_ERROR(EIO);
+    }
+
+    do {
+	if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) {
+	    iclog = iclog->ic_next;
+	    continue;
+	}
+
+	if (iclog->ic_state == XLOG_STATE_DIRTY) {
+		LOG_UNLOCK(log, s);
+		return 0;
+	}
+
+	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+		/*
+		 * We sleep here if we haven't already slept (e.g.
+		 * this is the first time we've looked at the correct
+		 * iclog buf) and the buffer before us is going to
+		 * be sync'ed.	We have to do that to ensure that the
+		 * log records go out in the proper order.  When it's
+		 * done, someone waiting on this buffer will be woken up
+		 * (maybe us) to flush this buffer out.
+		 *
+		 * Otherwise, we mark the buffer WANT_SYNC, and bump
+		 * up the refcnt so we can release the log (which drops
+		 * the ref count).  The state switch keeps new transaction
+		 * commits from using this buffer.  When the current commits
+		 * finish writing into the buffer, the refcount will drop to
+		 * zero and the buffer will go out then.
+		 */
+		if (!already_slept &&
+		    (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC |
+						 XLOG_STATE_SYNCING))) {
+			ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
+			XFS_STATS_INC(xfsstats.xs_log_force_sleep);
+			sv_wait(&iclog->ic_prev->ic_forcesema, PSWP,
+				&log->l_icloglock, s);
+			already_slept = 1;
+			goto try_again;
+		} else {
+			iclog->ic_refcnt++;
+			xlog_state_switch_iclogs(log, iclog, 0);
+			LOG_UNLOCK(log, s);
+			if (xlog_state_release_iclog(log, iclog))
+				return XFS_ERROR(EIO);
+			s = LOG_LOCK(log);
+		}
+	}
+
+	if ((flags & XFS_LOG_SYNC) && /* sleep */
+	    !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
+
+		/*
+		 * Don't wait on the forcesema if we know that we've
+		 * gotten a log write error.
+		 */
+		if (iclog->ic_state & XLOG_STATE_IOERROR) {
+			LOG_UNLOCK(log, s);
+			return XFS_ERROR(EIO);
+		}
+		XFS_STATS_INC(xfsstats.xs_log_force_sleep);
+		sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s);
+		/*
+		 * No need to grab the log lock here since we're
+		 * only deciding whether or not to return EIO
+		 * and the memory read should be atomic.
+		 */
+		if (iclog->ic_state & XLOG_STATE_IOERROR)
+			return XFS_ERROR(EIO);
+	} else {		/* just return */
+		LOG_UNLOCK(log, s);
+	}
+	return 0;
+
+    } while (iclog != log->l_iclog);
+
+    LOG_UNLOCK(log, s);
+    return (0);
+}	/* xlog_state_sync */
+
+
+/*
+ * Called when we want to mark the current iclog as being ready to sync to
+ * disk.
+ */
+void
+xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
+{
+	SPLDECL(s);
+
+	s = LOG_LOCK(log);
+
+	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+		xlog_state_switch_iclogs(log, iclog, 0);
+	} else {
+		ASSERT(iclog->ic_state &
+			(XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
+	}
+
+	LOG_UNLOCK(log, s);
+}	/* xlog_state_want_sync */
+
+
+
+/*****************************************************************************
+ *
+ *		TICKET functions
+ *
+ *****************************************************************************
+ */
+
+/*
+ *	Algorithm doesn't take into account page size. ;-(
+ */
+STATIC void
+xlog_state_ticket_alloc(xlog_t *log)
+{
+	xlog_ticket_t	*t_list;
+	xlog_ticket_t	*next;
+	xfs_caddr_t	buf;
+	uint		i = (NBPP / sizeof(xlog_ticket_t)) - 2;
+	SPLDECL(s);
+
+	/*
+	 * The kmem_zalloc may sleep, so we shouldn't be holding the
+	 * global lock.	 XXXmiken: may want to use zone allocator.
+	 */
+	buf = (xfs_caddr_t) kmem_zalloc(NBPP, 0);
+
+	s = LOG_LOCK(log);
+
+	/* Attach 1st ticket to Q, so we can keep track of allocated memory */
+	t_list = (xlog_ticket_t *)buf;
+	t_list->t_next = log->l_unmount_free;
+	log->l_unmount_free = t_list++;
+	log->l_ticket_cnt++;
+	log->l_ticket_tcnt++;
+
+	/* Next ticket becomes first ticket attached to ticket free list */
+	if (log->l_freelist != NULL) {
+		ASSERT(log->l_tail != NULL);
+		log->l_tail->t_next = t_list;
+	} else {
+		log->l_freelist = t_list;
+	}
+	log->l_ticket_cnt++;
+	log->l_ticket_tcnt++;
+
+	/* Cycle through rest of alloc'ed memory, building up free Q */
+	for ( ; i > 0; i--) {
+		next = t_list + 1;
+		t_list->t_next = next;
+		t_list = next;
+		log->l_ticket_cnt++;
+		log->l_ticket_tcnt++;
+	}
+	t_list->t_next = 0;
+	log->l_tail = t_list;
+	LOG_UNLOCK(log, s);
+}	/* xlog_state_ticket_alloc */
+
+
+/*
+ * Put ticket into free list
+ *
+ * Assumption: log lock is held around this call.
+ */
+STATIC void
+xlog_ticket_put(xlog_t		*log,
+		xlog_ticket_t	*ticket)
+{
+	sv_destroy(&ticket->t_sema);
+
+	/*
+	 * Don't think caching will make that much difference.  It's
+	 * more important to make debugging easier.
+	 */
+#if 0
+	/* real code will want to use LIFO for caching */
+	ticket->t_next = log->l_freelist;
+	log->l_freelist = ticket;
+	/* no need to clear fields */
+#else
+	/* When we debug, it is easier if tickets are cycled */
+	ticket->t_next	   = 0;
+	if (log->l_tail != 0) {
+		log->l_tail->t_next = ticket;
+	} else {
+		ASSERT(log->l_freelist == 0);
+		log->l_freelist = ticket;
+	}
+	log->l_tail	    = ticket;
+#endif
+	log->l_ticket_cnt++;
+}	/* xlog_ticket_put */
+
+
+/*
+ * Grab a ticket off the freelist or allocate some more
+ */
+xlog_ticket_t *
+xlog_ticket_get(xlog_t		*log,
+		int		unit_bytes,
+		int		cnt,
+		char		client,
+		uint		xflags)
+{
+	xlog_ticket_t	*tic;
+	SPLDECL(s);
+
+ alloc:
+	if (log->l_freelist == NULL)
+		xlog_state_ticket_alloc(log);		/* potentially sleep */
+
+	s = LOG_LOCK(log);
+	if (log->l_freelist == NULL) {
+		LOG_UNLOCK(log, s);
+		goto alloc;
+	}
+	tic		= log->l_freelist;
+	log->l_freelist = tic->t_next;
+	if (log->l_freelist == NULL)
+		log->l_tail = NULL;
+	log->l_ticket_cnt--;
+	LOG_UNLOCK(log, s);
+
+	/*
+	 * Permanent reservations have up to 'cnt'-1 active log operations
+	 * in the log.	A unit in this case is the amount of space for one
+	 * of these log operations.  Normal reservations have a cnt of 1
+	 * and their unit amount is the total amount of space required.
+	 * The following line of code adds one log record header length
+	 * for each part of an operation which may fall on a different
+	 * log record.
+	 *
+	 * One more XLOG_HEADER_SIZE is added to account for possible
+	 * round off errors when syncing a LR to disk.	The bytes are
+	 * subtracted if the thread using this ticket is the first writer
+	 * to a new LR.
+	 *
+	 * We add an extra log header for the possibility that the commit
+	 * record is the first data written to a new log record.  In this
+	 * case it is separate from the rest of the transaction data and
+	 * will be charged for the log record header.
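+	 *
+	 * Worked example (illustrative numbers): with 16k log records
+	 * (XLOG_RECORD_BSIZE), a unit_bytes of 40000 gives
+	 * XLOG_BTOLRBB(40000) == 3, so 5 (== 3 + 2) log record headers
+	 * of l_iclog_hsize bytes each are added to the reservation.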
+	 */
+	unit_bytes += log->l_iclog_hsize * (XLOG_BTOLRBB(unit_bytes) + 2);
+
+	tic->t_unit_res		= unit_bytes;
+	tic->t_curr_res		= unit_bytes;
+	tic->t_cnt		= cnt;
+	tic->t_ocnt		= cnt;
+	tic->t_tid		= (xlog_tid_t)((__psint_t)tic & 0xffffffff);
+	tic->t_clientid		= client;
+	tic->t_flags		= XLOG_TIC_INITED;
+	if (xflags & XFS_LOG_PERM_RESERV)
+		tic->t_flags |= XLOG_TIC_PERM_RESERV;
+	sv_init(&(tic->t_sema), SV_DEFAULT, "logtick");
+
+	return tic;
+}	/* xlog_ticket_get */
+
+
+/******************************************************************************
+ *
+ *		Log debug routines
+ *
+ ******************************************************************************
+ */
+#if defined(DEBUG) && !defined(XLOG_NOLOG)
+/*
+ * Make sure that the destination ptr is within the valid data region of
+ * one of the iclogs.  This uses backup pointers stored in a different
+ * part of the log in case we trash the log structure.
+ */
+void
+xlog_verify_dest_ptr(xlog_t	*log,
+		     __psint_t	ptr)
+{
+	int i;
+	int good_ptr = 0;
+
+	for (i=0; i < log->l_iclog_bufs; i++) {
+		if (ptr >= (__psint_t)log->l_iclog_bak[i] &&
+		    ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size)
+			good_ptr++;
+	}
+	if (! good_ptr)
+		xlog_panic("xlog_verify_dest_ptr: invalid ptr");
+}	/* xlog_verify_dest_ptr */
+
+
+#ifdef XFSDEBUG
+/* check split LR write */
+STATIC void
+xlog_verify_disk_cycle_no(xlog_t	 *log,
+			  xlog_in_core_t *iclog)
+{
+    xfs_buf_t	*bp;
+    uint	cycle_no;
+    xfs_daddr_t i;
+
+    if (BLOCK_LSN(iclog->ic_header.h_lsn, ARCH_CONVERT) < 10) {
+	cycle_no = CYCLE_LSN(iclog->ic_header.h_lsn, ARCH_CONVERT);
+	bp = xlog_get_bp(1, log->l_mp);
+	ASSERT(bp);
+	for (i = 0; i < BLOCK_LSN(iclog->ic_header.h_lsn, ARCH_CONVERT); i++) {
+	    xlog_bread(log, i, 1, bp);
+	    if (GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT) != cycle_no)
+		xlog_warn("XFS: xlog_verify_disk_cycle_no: bad cycle no");
+	}
+	xlog_put_bp(bp);
+    }
+}	/* xlog_verify_disk_cycle_no */
+#endif
+
+STATIC void
+xlog_verify_grant_head(xlog_t *log, int equals)
+{
+    if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) {
+	if (equals)
+	    ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes);
+	else
+	    ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes);
+    } else {
+	ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle);
+	ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes);
+    }
+}	/* xlog_verify_grant_head */
+
+/* check if it will fit */
+STATIC void
+xlog_verify_tail_lsn(xlog_t	    *log,
+		     xlog_in_core_t *iclog,
+		     xfs_lsn_t	    tail_lsn)
+{
+    int blocks;
+
+    if (CYCLE_LSN(tail_lsn, ARCH_NOCONVERT) == log->l_prev_cycle) {
+	blocks =
+	    log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn, ARCH_NOCONVERT));
+	if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
+	    xlog_panic("xlog_verify_tail_lsn: ran out of log space");
+    } else {
+	ASSERT(CYCLE_LSN(tail_lsn, ARCH_NOCONVERT)+1 == log->l_prev_cycle);
+
+	if (BLOCK_LSN(tail_lsn, ARCH_NOCONVERT) == log->l_prev_block)
+	    xlog_panic("xlog_verify_tail_lsn: tail wrapped");
+
+	blocks = BLOCK_LSN(tail_lsn, ARCH_NOCONVERT) - log->l_prev_block;
+	if (blocks < BTOBB(iclog->ic_offset) + 1)
+	    xlog_panic("xlog_verify_tail_lsn: ran out of log space");
+    }
+}	/* xlog_verify_tail_lsn */
+
+/*
+ * Perform a number of checks on the iclog before writing to disk.
+ *
+ * 1. Make sure the iclogs are still circular
+ * 2. Make sure we have a good magic number
+ * 3. Make sure we don't have magic numbers in the data
+ * 4. Check fields of each log operation header for:
+ *	A. Valid client identifier
+ *	B. tid ptr value falls in valid ptr space (user space code)
+ *	C. Length in log record header is correct according to the
+ *		individual operation headers within record.
+ * 5. When a bwrite will occur within 5 blocks of the front of the physical
+ *	log, check the preceding blocks of the physical log to make sure all
+ *	the cycle numbers agree with the current cycle number.
+ */
+STATIC void
+xlog_verify_iclog(xlog_t	 *log,
+		  xlog_in_core_t *iclog,
+		  int		 count,
+		  boolean_t	 syncing)
+{
+	xlog_op_header_t	*ophead;
+	xlog_in_core_t		*icptr;
+	xfs_caddr_t		ptr;
+	xfs_caddr_t		base_ptr;
+	__psint_t		field_offset;
+	__uint8_t		clientid;
+	int			len, i, j, k, op_len;
+	int			idx;
+	SPLDECL(s);
+
+	union ich {
+		xlog_rec_ext_header_t	hic_xheader;
+		char			hic_sector[XLOG_HEADER_SIZE];
+	} *xhdr;
+
+	/* check validity of iclog pointers */
+	s = LOG_LOCK(log);
+	icptr = log->l_iclog;
+	for (i=0; i < log->l_iclog_bufs; i++) {
+		if (icptr == 0)
+			xlog_panic("xlog_verify_iclog: illegal ptr");
+		icptr = icptr->ic_next;
+	}
+	if (icptr != log->l_iclog)
+		xlog_panic("xlog_verify_iclog: corrupt iclog ring");
+	LOG_UNLOCK(log, s);
+
+	/* check log magic numbers */
+	ptr = (xfs_caddr_t) &(iclog->ic_header);
+	if (INT_GET(*(uint *)ptr, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM)
+		xlog_panic("xlog_verify_iclog: illegal magic num");
+
+	for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&(iclog->ic_header))+count;
+	     ptr += BBSIZE) {
+		if (INT_GET(*(uint *)ptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
+			xlog_panic("xlog_verify_iclog: unexpected magic num");
+	}
+
+	/* check fields */
+	len = INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT);
+	ptr = iclog->ic_datap;
+	base_ptr = ptr;
+	ophead = (xlog_op_header_t *)ptr;
+	xhdr = (union ich*)&iclog->ic_header;
+	for (i = 0; i < len; i++) {
+		ophead = (xlog_op_header_t *)ptr;
+
+		/* clientid is only 1 byte */
+		field_offset = (__psint_t)
+			       ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
+		if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+			clientid = ophead->oh_clientid;
+		} else {
+			idx = BTOBB((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
+			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
+				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				clientid = GET_CLIENT_ID(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
+			} else {
+				clientid = GET_CLIENT_ID(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
+			}
+		}
+		if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
+			cmn_err(CE_WARN, "xlog_verify_iclog: illegal clientid %d op 0x%p offset 0x%x", clientid, ophead, field_offset);
+
+		/* check length */
+		field_offset = (__psint_t)
+			       ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
+		if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+			op_len = INT_GET(ophead->oh_len, ARCH_CONVERT);
+		} else {
+			idx = BTOBB((__psint_t)&ophead->oh_len -
+				    (__psint_t)iclog->ic_datap);
+			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
+				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				op_len = INT_GET(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
+			} else {
+				op_len = INT_GET(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
+			}
+		}
+		ptr += sizeof(xlog_op_header_t) + op_len;
+	}
+}	/* xlog_verify_iclog */
+#endif /* DEBUG && !XLOG_NOLOG */
+
+/*
+ * Mark all iclogs IOERROR. LOG_LOCK is held by the caller.
+ */
+STATIC int
+xlog_state_ioerror(
+	xlog_t	*log)
+{
+	xlog_in_core_t	*iclog, *ic;
+
+	iclog = log->l_iclog;
+	if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
+		/*
+		 * Mark all the incore logs IOERROR.
+		 * From now on, no log flushes will result.
+		 */
+		ic = iclog;
+		do {
+			ic->ic_state = XLOG_STATE_IOERROR;
+			ic = ic->ic_next;
+		} while (ic != iclog);
+		return (0);
+	}
+	/*
+	 * Return non-zero if the state transition has already happened.
+	 */
+	return (1);
+}
+
+/*
+ * This is called from xfs_force_shutdown, when we're forcibly
+ * shutting down the filesystem, typically because of an IO error.
+ * Our main objectives here are to make sure that:
+ *	a. the filesystem gets marked 'SHUTDOWN' for all interested
+ *	   parties to find out, 'atomically'.
+ *	b. those who're sleeping on log reservations, pinned objects and
+ *	    other resources get woken up and told the bad news.
+ *	c. nothing new gets queued up after (a) and (b) are done.
+ *	d. if !logerror, flush the iclogs to disk, then seal them off
+ *	   for business.
+ */
+int
+xfs_log_force_umount(
+	struct xfs_mount	*mp,
+	int			logerror)
+{
+	xlog_ticket_t	*tic;
+	xlog_t		*log;
+	int		retval;
+	SPLDECL(s);
+	SPLDECL(s2);
+
+	log = mp->m_log;
+
+	/*
+	 * If this happens during log recovery, don't worry about
+	 * locking; the log isn't open for business yet.
+	 */
+	if (!log ||
+	    log->l_flags & XLOG_ACTIVE_RECOVERY) {
+		mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
+		XFS_BUF_DONE(mp->m_sb_bp);
+		return (0);
+	}
+
+	/*
+	 * Somebody could've already done the hard work for us.
+	 * No need to get locks for this.
+	 */
+	if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
+		ASSERT(XLOG_FORCED_SHUTDOWN(log));
+		return (1);
+	}
+	retval = 0;
+	/*
+	 * We must hold both the GRANT lock and the LOG lock,
+	 * before we mark the filesystem SHUTDOWN and wake
+	 * everybody up to tell the bad news.
+	 */
+	s = GRANT_LOCK(log);
+	s2 = LOG_LOCK(log);
+	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
+	XFS_BUF_DONE(mp->m_sb_bp);
+	/*
+	 * This flag is sort of redundant because of the mount flag, but
+	 * it's good to maintain the separation between the log and the rest
+	 * of XFS.
+	 */
+	log->l_flags |= XLOG_IO_ERROR;
+
+	/*
+	 * If we hit a log error, we want to mark all the iclogs IOERROR
+	 * while we're still holding the loglock.
+	 */
+	if (logerror)
+		retval = xlog_state_ioerror(log);
+	LOG_UNLOCK(log, s2);
+
+	/*
+	 * We don't want anybody waiting for log reservations
+	 * after this. That means we have to wake up everybody
+	 * queued up on reserve_headq as well as write_headq.
+	 * In addition, we make sure in xlog_{re}grant_log_space
+	 * that we don't enqueue anything once the SHUTDOWN flag
+	 * is set, and this action is protected by the GRANTLOCK.
+	 */
+	if ((tic = log->l_reserve_headq)) {
+		do {
+			sv_signal(&tic->t_sema);
+			tic = tic->t_next;
+		} while (tic != log->l_reserve_headq);
+	}
+
+	if ((tic = log->l_write_headq)) {
+		do {
+			sv_signal(&tic->t_sema);
+			tic = tic->t_next;
+		} while (tic != log->l_write_headq);
+	}
+	GRANT_UNLOCK(log, s);
+
+	if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
+		ASSERT(!logerror);
+		/*
+		 * Force the incore logs to disk before shutting the
+		 * log down completely.
+		 */
+		xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC);
+		s2 = LOG_LOCK(log);
+		retval = xlog_state_ioerror(log);
+		LOG_UNLOCK(log, s2);
+	}
+	/*
+	 * Wake up everybody waiting on xfs_log_force.
+	 * Callback all log item committed functions as if the
+	 * log writes were completed.
+	 */
+	xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
+
+#ifdef XFSERRORDEBUG
+	{
+		xlog_in_core_t	*iclog;
+
+		s = LOG_LOCK(log);
+		iclog = log->l_iclog;
+		do {
+			ASSERT(iclog->ic_callback == 0);
+			iclog = iclog->ic_next;
+		} while (iclog != log->l_iclog);
+		LOG_UNLOCK(log, s);
+	}
+#endif
+	/* return non-zero if log IOERROR transition had already happened */
+	return (retval);
+}
+
+int
+xlog_iclogs_empty(xlog_t *log)
+{
+	xlog_in_core_t	*iclog;
+
+	iclog = log->l_iclog;
+	do {
+		if (INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT))
+			return(0);
+		iclog = iclog->ic_next;
+	} while (iclog != log->l_iclog);
+	return(1);
+}
+
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_log.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_log.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_log.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_log.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_LOG_H__
+#define __XFS_LOG_H__
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define LSN_FIELD_CYCLE(arch) (((arch)==ARCH_NOCONVERT)?1:0)
+#define LSN_FIELD_BLOCK(arch) (((arch)==ARCH_NOCONVERT)?0:1)
+#else
+#define LSN_FIELD_CYCLE(arch) (0)
+#define LSN_FIELD_BLOCK(arch) (1)
+#endif
+
+/* get lsn fields */
+
+#define CYCLE_LSN(lsn,arch) (INT_GET(((uint *)&(lsn))[LSN_FIELD_CYCLE(arch)], arch))
+#define BLOCK_LSN(lsn,arch) (INT_GET(((uint *)&(lsn))[LSN_FIELD_BLOCK(arch)], arch))
+/* this is used in a spot where we might otherwise double-endian-flip */
+#define CYCLE_LSN_NOCONV(lsn,arch) (((uint *)&(lsn))[LSN_FIELD_CYCLE(arch)])
+
+#ifdef __KERNEL__
+/*
+ * By comparing each component, we don't have to worry about extra
+ * endian issues in treating two 32 bit numbers as one 64 bit number
+ */
+static
+#ifdef __GNUC__
+# if !((__GNUC__ == 2) && (__GNUC_MINOR__ == 95))
+__inline__
+#endif
+#endif
+xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2, xfs_arch_t arch)
+{
+	if (CYCLE_LSN(lsn1, arch) != CYCLE_LSN(lsn2, arch))
+		return (CYCLE_LSN(lsn1, arch)<CYCLE_LSN(lsn2, arch))? -999 : 999;
+
+	if (BLOCK_LSN(lsn1, arch) != BLOCK_LSN(lsn2, arch))
+		return (BLOCK_LSN(lsn1, arch)<BLOCK_LSN(lsn2, arch))? -999 : 999;
+
+	return 0;
+}
+
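+/*
+ * For example (illustrative values): XFS_LSN_CMP() orders an LSN from
+ * cycle 4, block 900 before one from cycle 5, block 10; the cycle
+ * number dominates and block numbers only break ties within a cycle.
+ */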
+#define XFS_LSN_CMP_ARCH(x,y,arch)	_lsn_cmp(x, y, arch)
+#define XFS_LSN_CMP(x,y) XFS_LSN_CMP_ARCH(x,y,ARCH_NOCONVERT)
+#define XFS_LSN_DIFF_ARCH(x,y,arch)	_lsn_cmp(x, y, arch)
+#define XFS_LSN_DIFF(x,y) XFS_LSN_DIFF_ARCH(x,y,ARCH_NOCONVERT)
+
+/*
+ * Macros, structures, prototypes for interface to the log manager.
+ */
+
+/*
+ * Flags to xfs_log_mount
+ */
+#define XFS_LOG_RECOVER		0x1
+
+/*
+ * Flags to xfs_log_done()
+ */
+#define XFS_LOG_REL_PERM_RESERV 0x1
+
+
+/*
+ * Flags to xfs_log_reserve()
+ *
+ *	XFS_LOG_SLEEP:	 If space is not available, sleep (default)
+ *	XFS_LOG_NOSLEEP: If space is not available, return error
+ *	XFS_LOG_PERM_RESERV: Permanent reservation.  When writes are
+ *		performed against this type of reservation, the reservation
+ *		is not decreased.  Long running transactions should use this.
+ */
+#define XFS_LOG_SLEEP		0x0
+#define XFS_LOG_NOSLEEP		0x1
+#define XFS_LOG_PERM_RESERV	0x2
+#define XFS_LOG_RESV_ALL	(XFS_LOG_NOSLEEP|XFS_LOG_PERM_RESERV)
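+
+/*
+ * Illustrative use (hypothetical caller; unit_bytes and ticket are
+ * made-up names): a long running transaction takes a permanent
+ * reservation so the same ticket can be regranted:
+ *
+ *	error = xfs_log_reserve(mp, unit_bytes, 3, &ticket,
+ *				XFS_TRANSACTION, XFS_LOG_PERM_RESERV);
+ */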
+
+
+/*
+ * Flags to xfs_log_force()
+ *
+ *	XFS_LOG_SYNC:	Synchronous force in-core log to disk
+ *	XFS_LOG_FORCE:	Start in-core log write now.
+ *	XFS_LOG_URGE:	Start write within some window of time.
+ *
+ * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set.
+ */
+#define XFS_LOG_SYNC		0x1
+#define XFS_LOG_FORCE		0x2
+#define XFS_LOG_URGE		0x4
+
+#endif	/* __KERNEL__ */
+
+
+/* Log Clients */
+#define XFS_TRANSACTION		0x69
+#define XFS_VOLUME		0x2
+#define XFS_LOG			0xaa
+
+typedef struct xfs_log_iovec {
+	xfs_caddr_t		i_addr;		/* beginning address of region */
+	int			i_len;		/* length in bytes of region */
+} xfs_log_iovec_t;
+
+typedef void* xfs_log_ticket_t;
+
+/*
+ * Structure used to pass callback function and the function's argument
+ * to the log manager.
+ */
+typedef struct xfs_log_callback {
+	struct xfs_log_callback *cb_next;
+	void			(*cb_func)(void *, int);
+	void			*cb_arg;
+} xfs_log_callback_t;
+
+
+#ifdef __KERNEL__
+/* Log manager interfaces */
+struct xfs_mount;
+xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
+		       xfs_log_ticket_t ticket,
+		       uint		flags);
+int	  xfs_log_force(struct xfs_mount *mp,
+			xfs_lsn_t	 lsn,
+			uint		 flags);
+int	  xfs_log_init(void);
+int	  xfs_log_mount(struct xfs_mount *mp,
+			dev_t		 log_dev,
+			xfs_daddr_t		 start_block,
+			int		 num_bblocks);
+int	  xfs_log_mount_finish(struct xfs_mount *mp, int);
+void	  xfs_log_move_tail(struct xfs_mount	*mp,
+			    xfs_lsn_t		tail_lsn);
+void	  xfs_log_notify(struct xfs_mount	*mp,
+			 xfs_lsn_t		lsn,
+			 xfs_log_callback_t	*callback_entry);
+int	  xfs_log_reserve(struct xfs_mount *mp,
+			  int		   length,
+			  int		   count,
+			  xfs_log_ticket_t *ticket,
+			  __uint8_t	   clientid,
+			  uint		   flags);
+int	  xfs_log_write(struct xfs_mount *mp,
+			xfs_log_iovec_t	 region[],
+			int		 nentries,
+			xfs_log_ticket_t ticket,
+			xfs_lsn_t	 *start_lsn);
+int	  xfs_log_unmount(struct xfs_mount *mp);
+int	  xfs_log_unmount_write(struct xfs_mount *mp);
+void	  xfs_log_unmount_dealloc(struct xfs_mount *mp);
+int	  xfs_log_force_umount(struct xfs_mount *mp, int logerror);
+int	  xfs_log_need_covered(struct xfs_mount *mp);
+
+void	  xlog_iodone(struct xfs_buf *);
+
+#endif
+
+
+extern int xlog_debug;		/* set to 1 to enable real log */
+
+
+#endif	/* __XFS_LOG_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_log_priv.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_log_priv.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_log_priv.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_log_priv.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_LOG_PRIV_H__
+#define __XFS_LOG_PRIV_H__
+
+#if defined(XFS_ALL_TRACE)
+#define XFS_LOG_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_LOG_TRACE
+#endif
+
+struct xfs_buf;
+struct ktrace;
+struct log;
+struct xfs_buf_cancel;
+struct xfs_mount;
+
+/*
+ * Macros, structures, prototypes for internal log manager use.
+ */
+
+#define XLOG_NUM_ICLOGS		2
+#define XLOG_MAX_ICLOGS		8
+#define XLOG_CALLBACK_SIZE	10
+#define XLOG_HEADER_MAGIC_NUM	0xFEEDbabe	/* Illegal cycle number */
+#define XLOG_VERSION_1		1
+#define XLOG_VERSION_2		2		/* Large IClogs, Log sunit */
+#define XLOG_VERSION_OKBITS	(XLOG_VERSION_1 | XLOG_VERSION_2)
+#define XLOG_RECORD_BSIZE	(16*1024)	/* eventually 32k */
+#define XLOG_BIG_RECORD_BSIZE	(32*1024)	/* 32k buffers */
+#define XLOG_MAX_RECORD_BSIZE	(256*1024)
+#define XLOG_HEADER_CYCLE_SIZE	(32*1024)	/* cycle data in header */
+#define XLOG_RECORD_BSHIFT	14		/* 16384 == 1 << 14 */
+#define XLOG_BIG_RECORD_BSHIFT	15		/* 32k == 1 << 15 */
+#define XLOG_MAX_RECORD_BSHIFT	18		/* 256k == 1 << 18 */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_BTOLRBB)
+int xlog_btolrbb(int b);
+#define XLOG_BTOLRBB(b)		xlog_btolrbb(b)
+#else
+#define XLOG_BTOLRBB(b)		(((b)+XLOG_RECORD_BSIZE-1) >> XLOG_RECORD_BSHIFT)
+#endif
+
+#define XLOG_HEADER_SIZE	512
+
+#define XLOG_TOTAL_REC_SHIFT(log) \
+	BTOBB(XLOG_MAX_ICLOGS << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \
+	 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+
+/*
+ *  set lsns
+ */
+
+#define ASSIGN_LSN_CYCLE(lsn,cycle,arch) \
+    INT_SET(((uint *)&(lsn))[LSN_FIELD_CYCLE(arch)], arch, (cycle));
+#define ASSIGN_LSN_BLOCK(lsn,block,arch) \
+    INT_SET(((uint *)&(lsn))[LSN_FIELD_BLOCK(arch)], arch, (block));
+#define ASSIGN_ANY_LSN(lsn,cycle,block,arch)  \
+    { \
+	ASSIGN_LSN_CYCLE(lsn,cycle,arch); \
+	ASSIGN_LSN_BLOCK(lsn,block,arch); \
+    }
+#define ASSIGN_LSN(lsn,log,arch) \
+    ASSIGN_ANY_LSN(lsn,(log)->l_curr_cycle,(log)->l_curr_block,arch);
+
+#define XLOG_SET(f,b)		(((f) & (b)) == (b))
+
+#define GET_CYCLE(ptr, arch) \
+    (INT_GET(*(uint *)(ptr), arch) == XLOG_HEADER_MAGIC_NUM ? \
+	 INT_GET(*((uint *)(ptr)+1), arch) : \
+	 INT_GET(*(uint *)(ptr), arch) \
+    )
+
+#define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
+
+
+#ifdef __KERNEL__
+/*
+ * get client id from packed copy.
+ *
+ * this hack is here because the xlog_pack code copies four bytes
+ * of xlog_op_header containing the fields oh_clientid, oh_flags
+ * and oh_res2 into the packed copy.
+ *
+ * later on this four byte chunk is treated as an int and the
+ * client id is pulled out.
+ *
+ * this has endian issues, of course.
+ */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define GET_CLIENT_ID(i,arch) \
+    ((i) & 0xff)
+#else
+#define GET_CLIENT_ID(i,arch) \
+    ((i) >> 24)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_GRANT_SUB_SPACE)
+void xlog_grant_sub_space(struct log *log, int bytes, int type);
+#define XLOG_GRANT_SUB_SPACE(log,bytes,type)	\
+	xlog_grant_sub_space(log,bytes,type)
+#else
+#define XLOG_GRANT_SUB_SPACE(log,bytes,type)				\
+    {									\
+	if (type == 'w') {						\
+		(log)->l_grant_write_bytes -= (bytes);			\
+		if ((log)->l_grant_write_bytes < 0) {			\
+			(log)->l_grant_write_bytes += (log)->l_logsize; \
+			(log)->l_grant_write_cycle--;			\
+		}							\
+	} else {							\
+		(log)->l_grant_reserve_bytes -= (bytes);		\
+		if ((log)->l_grant_reserve_bytes < 0) {			\
+			(log)->l_grant_reserve_bytes += (log)->l_logsize;\
+			(log)->l_grant_reserve_cycle--;			\
+		}							\
+	 }								\
+    }
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_GRANT_ADD_SPACE)
+void xlog_grant_add_space(struct log *log, int bytes, int type);
+#define XLOG_GRANT_ADD_SPACE(log,bytes,type)	\
+	xlog_grant_add_space(log,bytes,type)
+#else
+#define XLOG_GRANT_ADD_SPACE(log,bytes,type)				\
+    {									\
+	if (type == 'w') {						\
+		(log)->l_grant_write_bytes += (bytes);			\
+		if ((log)->l_grant_write_bytes > (log)->l_logsize) {	\
+			(log)->l_grant_write_bytes -= (log)->l_logsize; \
+			(log)->l_grant_write_cycle++;			\
+		}							\
+	} else {							\
+		(log)->l_grant_reserve_bytes += (bytes);		\
+		if ((log)->l_grant_reserve_bytes > (log)->l_logsize) {	\
+			(log)->l_grant_reserve_bytes -= (log)->l_logsize;\
+			(log)->l_grant_reserve_cycle++;			\
+		}							\
+	 }								\
+    }
+#endif
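+/*
+ * Wraparound example (illustrative numbers): with l_logsize == 100000,
+ * subtracting 3000 bytes from a grant head sitting at byte 1000 wraps
+ * it back to byte 98000 and decrements its cycle; adding space past
+ * the end of the log wraps forward and increments the cycle.
+ */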
+#define XLOG_INS_TICKETQ(q,tic)				\
+    {							\
+	if (q) {					\
+		(tic)->t_next	    = (q);		\
+		(tic)->t_prev	    = (q)->t_prev;	\
+		(q)->t_prev->t_next = (tic);		\
+		(q)->t_prev	    = (tic);		\
+	} else {					\
+		(tic)->t_prev = (tic)->t_next = (tic);	\
+		(q) = (tic);				\
+	}						\
+	(tic)->t_flags |= XLOG_TIC_IN_Q;		\
+    }
+#define XLOG_DEL_TICKETQ(q,tic)				\
+    {							\
+	if ((tic) == (tic)->t_next) {			\
+		(q) = NULL;				\
+	} else {					\
+		(q) = (tic)->t_next;			\
+		(tic)->t_next->t_prev = (tic)->t_prev;	\
+		(tic)->t_prev->t_next = (tic)->t_next;	\
+	}						\
+	(tic)->t_next = (tic)->t_prev = NULL;		\
+	(tic)->t_flags &= ~XLOG_TIC_IN_Q;		\
+    }
+
+
+#define GRANT_LOCK(log)		mutex_spinlock(&(log)->l_grant_lock)
+#define GRANT_UNLOCK(log, s)	mutex_spinunlock(&(log)->l_grant_lock, s)
+#define LOG_LOCK(log)		mutex_spinlock(&(log)->l_icloglock)
+#define LOG_UNLOCK(log, s)	mutex_spinunlock(&(log)->l_icloglock, s)
+
+#define xlog_panic(s)		{cmn_err(CE_PANIC, s); }
+#define xlog_exit(s)		{cmn_err(CE_PANIC, s); }
+#define xlog_warn(s)		{cmn_err(CE_WARN, s); }
+
+/*
+ * In core log state
+ */
+#define XLOG_STATE_ACTIVE    0x0001 /* Current IC log being written to */
+#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
+#define XLOG_STATE_SYNCING   0x0004 /* This IC log is syncing */
+#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
+#define XLOG_STATE_DO_CALLBACK \
+			     0x0010 /* Process callback functions */
+#define XLOG_STATE_CALLBACK  0x0020 /* Callback functions now */
+#define XLOG_STATE_DIRTY     0x0040 /* Dirty IC log, not ready for ACTIVE status*/
+#define XLOG_STATE_IOERROR   0x0080 /* IO error happened in sync'ing log */
+#define XLOG_STATE_ALL	     0x7FFF /* All possible valid flags */
+#define XLOG_STATE_NOTUSED   0x8000 /* This IC log not being used */
+#endif	/* __KERNEL__ */
+
+/*
+ * Flags to log operation header
+ *
+ * The first write of a new transaction will be preceded with a start
+ * record, XLOG_START_TRANS.  Once a transaction is committed, a commit
+ * record is written, XLOG_COMMIT_TRANS.  If a single region cannot fit
+ * into the remainder of the current active in-core log, it is split up
+ * into multiple regions.  Each partial region is marked with
+ * XLOG_CONTINUE_TRANS until the last one, which gets marked with
+ * XLOG_END_TRANS.
+ *
+ */
+#define XLOG_START_TRANS	0x01	/* Start a new transaction */
+#define XLOG_COMMIT_TRANS	0x02	/* Commit this transaction */
+#define XLOG_CONTINUE_TRANS	0x04	/* Cont this trans into new region */
+#define XLOG_WAS_CONT_TRANS	0x08	/* Cont this trans into new region */
+#define XLOG_END_TRANS		0x10	/* End a continued transaction */
+#define XLOG_UNMOUNT_TRANS	0x20	/* Unmount a filesystem transaction */
+#define XLOG_SKIP_TRANS		(XLOG_COMMIT_TRANS | XLOG_CONTINUE_TRANS | \
+				 XLOG_WAS_CONT_TRANS | XLOG_END_TRANS | \
+				 XLOG_UNMOUNT_TRANS)
+
+#ifdef __KERNEL__
+/*
+ * Flags to log ticket
+ */
+#define XLOG_TIC_INITED		0x1	/* has been initialized */
+#define XLOG_TIC_PERM_RESERV	0x2	/* permanent reservation */
+#define XLOG_TIC_IN_Q		0x4
+#endif	/* __KERNEL__ */
+
+#define XLOG_UNMOUNT_TYPE	0x556e	/* Un for Unmount */
+
+/*
+ * Flags for log structure
+ */
+#define XLOG_CHKSUM_MISMATCH	0x1	/* used only during recovery */
+#define XLOG_ACTIVE_RECOVERY	0x2	/* in the middle of recovery */
+#define XLOG_RECOVERY_NEEDED	0x4	/* log was recovered */
+#define XLOG_IO_ERROR		0x8	/* log hit an I/O error, and being
+					   shutdown */
+typedef __uint32_t xlog_tid_t;
+
+
+#ifdef __KERNEL__
+/*
+ * Below are states for covering allocation transactions.
+ * By covering, we mean changing the h_tail_lsn in the last on-disk
+ * log write such that no allocation transactions will be re-done during
+ * recovery after a system crash. Recovery starts at the last on-disk
+ * log write.
+ *
+ * These states are used to insert dummy log entries to cover
+ * space allocation transactions which can undo non-transactional changes
+ * after a crash. Writes to a file with space
+ * already allocated do not result in any transactions. Allocations
+ * might include space beyond the EOF. So if we just push the EOF a
+ * little, the last transaction for the file could contain the wrong
+ * size. If there is no file system activity, after an allocation
+ * transaction, and the system crashes, the allocation transaction
+ * will get replayed and the file will be truncated. This could
+ * be hours/days/... after the allocation occurred.
+ *
+ * The fix for this is to do two dummy transactions when the
+ * system is idle.  We need two dummy transactions because the h_tail_lsn
+ * in the log record header needs to point beyond the last possible
+ * non-dummy transaction. The first dummy changes the h_tail_lsn to
+ * the first transaction before the dummy. The second dummy causes
+ * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn.
+ *
+ * These dummy transactions get committed when everything
+ * is idle (after there has been some activity).
+ *
+ * There are 5 states used to control this.
+ *
+ *  IDLE -- no logging has been done on the file system or
+ *		we are done covering previous transactions.
+ *  NEED -- logging has occurred and we need a dummy transaction
+ *		when the log becomes idle.
+ *  DONE -- we were in the NEED state and have committed a dummy
+ *		transaction.
+ *  NEED2 -- we detected that a dummy transaction has gone to the
+ *		on-disk log with no other transactions.
+ *  DONE2 -- we committed a dummy transaction when in the NEED2 state.
+ *
+ * There are two places where we switch states:
+ *
+ * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2.
+ *	We commit the dummy transaction and switch to DONE or DONE2,
+ *	respectively. In all other states, we don't do anything.
+ *
+ * 2.) When we finish writing the on-disk log (xlog_state_clean_log).
+ *
+ *	No matter what state we are in, if this isn't the dummy
+ *	transaction going out, the next state is NEED.
+ *	So, if we aren't in the DONE or DONE2 states, the next state
+ *	is NEED. We can't be finishing a write of the dummy record
+ *	unless it was committed and the state switched to DONE or DONE2.
+ *
+ *	If we are in the DONE state and this was a write of the
+ *		dummy transaction, we move to NEED2.
+ *
+ *	If we are in the DONE2 state and this was a write of the
+ *		dummy transaction, we move to IDLE.
+ *
+ *
+ * Writing only one dummy transaction is not enough, because it can get
+ * appended to the same log record as a file space allocation.  When this
+ * happens, the log recovery code replays the space allocation and a file
+ * could be truncated.
+ * This is why we have the NEED2 and DONE2 states before going idle.
+ */
+
+#define XLOG_STATE_COVER_IDLE	0
+#define XLOG_STATE_COVER_NEED	1
+#define XLOG_STATE_COVER_DONE	2
+#define XLOG_STATE_COVER_NEED2	3
+#define XLOG_STATE_COVER_DONE2	4
+
+#define XLOG_COVER_OPS		5
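+
+/*
+ * A typical covering sequence, as a sketch of the states above:
+ *
+ *	IDLE  -> NEED	(a real transaction is logged)
+ *	NEED  -> DONE	(xfs_sync sees an idle log, commits dummy #1)
+ *	DONE  -> NEED2	(dummy #1 reaches the on-disk log alone)
+ *	NEED2 -> DONE2	(xfs_sync commits dummy #2)
+ *	DONE2 -> IDLE	(dummy #2 reaches the on-disk log)
+ */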
+
+typedef struct xlog_ticket {
+	sv_t		   t_sema;	 /* sleep on this semaphore	 :20 */
+	struct xlog_ticket *t_next;	 /*				 : 4 */
+	struct xlog_ticket *t_prev;	 /*				 : 4 */
+	xlog_tid_t	   t_tid;	 /* transaction identifier	 : 4 */
+	int		   t_curr_res;	 /* current reservation in bytes : 4 */
+	int		   t_unit_res;	 /* unit reservation in bytes	 : 4 */
+	__uint8_t	   t_ocnt;	 /* original count		 : 1 */
+	__uint8_t	   t_cnt;	 /* current count		 : 1 */
+	__uint8_t	   t_clientid;	 /* who does this belong to;	 : 1 */
+	__uint8_t	   t_flags;	 /* properties of reservation	 : 1 */
+} xlog_ticket_t;
+#endif
+
+
+typedef struct xlog_op_header {
+	xlog_tid_t oh_tid;	/* transaction id of operation	:  4 b */
+	int	   oh_len;	/* bytes in data region		:  4 b */
+	__uint8_t  oh_clientid; /* who sent me this		:  1 b */
+	__uint8_t  oh_flags;	/*				:  1 b */
+	ushort	   oh_res2;	/* 32 bit align			:  2 b */
+} xlog_op_header_t;
+
+
+/* valid values for h_fmt */
+#define XLOG_FMT_UNKNOWN  0
+#define XLOG_FMT_LINUX_LE 1
+#define XLOG_FMT_LINUX_BE 2
+#define XLOG_FMT_IRIX_BE  3
+
+/* our fmt */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define XLOG_FMT XLOG_FMT_LINUX_LE
+#else
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define XLOG_FMT XLOG_FMT_LINUX_BE
+#else
+#error unknown byte order
+#endif
+#endif
+
+typedef struct xlog_rec_header {
+	uint	  h_magicno;	/* log record (LR) identifier		:  4 */
+	uint	  h_cycle;	/* write cycle of log			:  4 */
+	int	  h_version;	/* LR version				:  4 */
+	int	  h_len;	/* len in bytes; should be 64-bit aligned: 4 */
+	xfs_lsn_t h_lsn;	/* lsn of this LR			:  8 */
+	xfs_lsn_t h_tail_lsn;	/* lsn of 1st LR w/ buffers not committed: 8 */
+	uint	  h_chksum;	/* may not be used; non-zero if used	:  4 */
+	int	  h_prev_block; /* block number to previous LR		:  4 */
+	int	  h_num_logops; /* number of log operations in this LR	:  4 */
+	uint	  h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
+	/* new fields */
+	int	  h_fmt;	/* format of log record			:  4 */
+	uuid_t	  h_fs_uuid;	/* uuid of FS				: 16 */
+	int	  h_size;	/* iclog size				:  4 */
+} xlog_rec_header_t;
+
+typedef struct xlog_rec_ext_header {
+	uint	  xh_cycle;	/* write cycle of log			: 4 */
+	uint	  xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /*	: 256 */
+} xlog_rec_ext_header_t;
+#ifdef __KERNEL__
+/*
+ * - A log record header is 512 bytes.	There is plenty of room to grow the
+ *	xlog_rec_header_t into the reserved space.
+ * - ic_data follows, so a write to disk can start at the beginning of
+ *	the iclog.
+ * - ic_forcesema is used to implement synchronous forcing of the iclog to disk.
+ * - ic_next is the pointer to the next iclog in the ring.
+ * - ic_bp is a pointer to the buffer used to write this incore log to disk.
+ * - ic_log is a pointer back to the global log structure.
+ * - ic_callback is a linked list of callback function/argument pairs to be
+ *	called after an iclog finishes writing.
+ * - ic_size is the full size of the header plus data.
+ * - ic_offset is the current number of bytes written to in this iclog.
+ * - ic_refcnt is bumped when someone is writing to the log.
+ * - ic_state is the state of the iclog.
+ */
+typedef struct xlog_iclog_fields {
+	sv_t			ic_forcesema;
+	struct xlog_in_core	*ic_next;
+	struct xlog_in_core	*ic_prev;
+	struct xfs_buf		*ic_bp;
+	struct log		*ic_log;
+	xfs_log_callback_t	*ic_callback;
+	xfs_log_callback_t	**ic_callback_tail;
+#ifdef DEBUG
+	struct ktrace		*ic_trace;
+#endif
+	int			ic_size;
+	int			ic_offset;
+	int			ic_refcnt;
+	int			ic_roundoff;
+	int			ic_bwritecnt;
+	ushort_t		ic_state;
+	char			*ic_datap;	/* pointer to iclog data */
+	struct tq_struct	ic_write_sched;
+} xlog_iclog_fields_t;
+
+typedef struct xlog_in_core2 {
+	union {
+		xlog_rec_header_t hic_header;
+		xlog_rec_ext_header_t hic_xheader;
+		char		  hic_sector[XLOG_HEADER_SIZE];
+	} ic_h;
+} xlog_in_core_2_t;
+
+typedef struct xlog_in_core {
+	xlog_iclog_fields_t	hic_fields;
+	xlog_in_core_2_t	*hic_data;
+} xlog_in_core_t;
+
+/*
+ * Defines to save our code from this glop.
+ */
+#define ic_forcesema	hic_fields.ic_forcesema
+#define ic_write_sched	hic_fields.ic_write_sched
+#define ic_next		hic_fields.ic_next
+#define ic_prev		hic_fields.ic_prev
+#define ic_bp		hic_fields.ic_bp
+#define ic_log		hic_fields.ic_log
+#define ic_callback	hic_fields.ic_callback
+#define ic_callback_tail hic_fields.ic_callback_tail
+#define ic_trace	hic_fields.ic_trace
+#define ic_size		hic_fields.ic_size
+#define ic_offset	hic_fields.ic_offset
+#define ic_refcnt	hic_fields.ic_refcnt
+#define ic_roundoff	hic_fields.ic_roundoff
+#define ic_bwritecnt	hic_fields.ic_bwritecnt
+#define ic_state	hic_fields.ic_state
+#define ic_datap	hic_fields.ic_datap
+#define ic_header	hic_data->ic_h.hic_header
+
+/*
+ * The reservation head lsn is not made up of a cycle number and block number.
+ * Instead, it uses a cycle number and byte number.  Logs don't expect to
+ * overflow 31 bits worth of byte offset, so using a byte number will mean
+ * that round off problems won't occur when releasing partial reservations.
+ */
+typedef struct log {
+    /* The following block of fields are changed while holding icloglock */
+    sema_t		l_flushsema;	/* iclog flushing semaphore */
+    int			l_flushcnt;	/* # of procs waiting on this sema */
+    int			l_ticket_cnt;	/* free ticket count */
+    int			l_ticket_tcnt;	/* total ticket count */
+    int			l_covered_state;/* state of "covering disk log entries" */
+    xlog_ticket_t	*l_freelist;	/* free list of tickets */
+    xlog_ticket_t	*l_unmount_free;/* kmem_free these addresses */
+    xlog_ticket_t	*l_tail;	/* free list of tickets */
+    xlog_in_core_t	*l_iclog;	/* head log queue	*/
+    lock_t		l_icloglock;	/* grab to change iclog state */
+    xfs_lsn_t		l_tail_lsn;	/* lsn of 1st LR w/ unflush buffers */
+    xfs_lsn_t		l_last_sync_lsn;/* lsn of last LR on disk */
+    struct xfs_mount	*l_mp;		/* mount point */
+    struct xfs_buf	*l_xbuf;	/* extra buffer for log wrapping */
+    dev_t		l_dev;		/* dev_t of log */
+    xfs_daddr_t		l_logBBstart;	/* start block of log */
+    int			l_logsize;	/* size of log in bytes */
+    int			l_logBBsize;	/* size of log in 512 byte chunks */
+    int			l_roundoff;	/* round off error of all iclogs */
+    int			l_curr_cycle;	/* Cycle number of log writes */
+    int			l_prev_cycle;	/* Cycle # b4 last block increment */
+    int			l_curr_block;	/* current logical block of log */
+    int			l_prev_block;	/* previous logical block of log */
+    int			l_iclog_size;	 /* size of log in bytes */
+    int			l_iclog_size_log;/* log power size of log */
+    int			l_iclog_bufs;	 /* number of iclog buffers */
+
+    /* The following fields are used for debugging; need to hold icloglock */
+    char		*l_iclog_bak[XLOG_MAX_ICLOGS];
+
+    /* The following block of fields are changed while holding grant_lock */
+    lock_t		l_grant_lock;		/* protects below fields */
+    xlog_ticket_t	*l_reserve_headq;	/* */
+    xlog_ticket_t	*l_write_headq;		/* */
+    int			l_grant_reserve_cycle;	/* */
+    int			l_grant_reserve_bytes;	/* */
+    int			l_grant_write_cycle;	/* */
+    int			l_grant_write_bytes;	/* */
+
+    /* The following fields don't need locking */
+#ifdef DEBUG
+    struct ktrace	*l_trace;
+    struct ktrace	*l_grant_trace;
+#endif
+    uint		l_flags;
+    uint		l_quotaoffs_flag;/* XFS_DQ_*, if QUOTAOFFs found */
+    struct xfs_buf_cancel **l_buf_cancel_table;
+    int			l_iclog_hsize;	/* size of iclog header */
+    int			l_iclog_heads;	/* number of iclog header sectors */
+} xlog_t;
+
+
+/* common routines */
+extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp,
+				      xlog_in_core_t *iclog);
+extern int	 xlog_find_head(xlog_t *log, xfs_daddr_t *head_blk);
+extern int	 xlog_find_tail(xlog_t	*log,
+				xfs_daddr_t *head_blk,
+				xfs_daddr_t *tail_blk,
+				int readonly);
+extern int	 xlog_print_find_oldest(xlog_t *log, xfs_daddr_t *last_blk);
+extern int	 xlog_recover(xlog_t *log, int readonly);
+extern int	 xlog_recover_finish(xlog_t *log, int mfsi_flags);
+extern void	 xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog);
+extern struct xfs_buf *xlog_get_bp(int,xfs_mount_t *);
+extern void	 xlog_put_bp(struct xfs_buf *);
+extern int	 xlog_bread(xlog_t *, xfs_daddr_t blkno, int bblks, struct xfs_buf *bp);
+extern void	 xlog_recover_process_iunlinks(xlog_t *log);
+
+#define XLOG_TRACE_GRAB_FLUSH  1
+#define XLOG_TRACE_REL_FLUSH   2
+#define XLOG_TRACE_SLEEP_FLUSH 3
+#define XLOG_TRACE_WAKE_FLUSH  4
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_LOG_PRIV_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_log_recover.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_log_recover.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_log_recover.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_log_recover.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,3704 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_log_recover.h>
+
+STATIC int	xlog_find_zeroed(struct log *log, xfs_daddr_t *blk_no);
+
+STATIC int	xlog_clear_stale_blocks(xlog_t	*log, xfs_lsn_t tail_lsn);
+STATIC void	xlog_recover_insert_item_backq(xlog_recover_item_t **q,
+					       xlog_recover_item_t *item);
+
+#if defined(DEBUG)
+STATIC void	xlog_recover_check_summary(xlog_t *log);
+STATIC void	xlog_recover_check_ail(xfs_mount_t *mp, xfs_log_item_t *lip,
+				       int gen);
+#else
+#define xlog_recover_check_summary(log)
+#define xlog_recover_check_ail(mp, lip, gen)
+#endif /* DEBUG */
+
+
+xfs_buf_t *
+xlog_get_bp(int num_bblks,xfs_mount_t *mp)
+{
+	xfs_buf_t   *bp;
+
+	ASSERT(num_bblks > 0);
+
+	bp = XFS_ngetrbuf(BBTOB(num_bblks),mp);
+	return bp;
+}	/* xlog_get_bp */
+
+
+void
+xlog_put_bp(xfs_buf_t *bp)
+{
+	XFS_nfreerbuf(bp);
+}	/* xlog_put_bp */
+
+
+/*
+ * nbblks should be uint, but oh well.	Just want to catch that 32-bit length.
+ */
+int
+xlog_bread(xlog_t	*log,
+	   xfs_daddr_t	blk_no,
+	   int		nbblks,
+	   xfs_buf_t	*bp)
+{
+	int error;
+
+	ASSERT(log);
+	ASSERT(nbblks > 0);
+	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+	ASSERT(bp);
+
+	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
+	XFS_BUF_READ(bp);
+	XFS_BUF_BUSY(bp);
+	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
+
+	xfsbdstrat(log->l_mp, bp);
+	if ((error = xfs_iowait(bp))) {
+		xfs_ioerror_alert("xlog_bread", log->l_mp,
+				  bp, XFS_BUF_ADDR(bp));
+		return (error);
+	}
+	return error;
+}	/* xlog_bread */
+
+
+/*
+ * Write out the buffer at the given block for the given number of blocks.
+ * The buffer is kept locked across the write and is returned locked.
+ * This can only be used for synchronous log writes.
+ */
+int
+xlog_bwrite(
+	xlog_t	*log,
+	int	blk_no,
+	int	nbblks,
+	xfs_buf_t	*bp)
+{
+	int	error;
+
+	ASSERT(nbblks > 0);
+	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+
+	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
+	XFS_BUF_ZEROFLAGS(bp);
+	XFS_BUF_BUSY(bp);
+	XFS_BUF_HOLD(bp);
+	XFS_BUF_PSEMA(bp, PRIBIO);
+	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
+
+	if ((error = xfs_bwrite(log->l_mp, bp)))
+		xfs_ioerror_alert("xlog_bwrite", log->l_mp,
+				  bp, XFS_BUF_ADDR(bp));
+
+	return (error);
+}	/* xlog_bwrite */
+
+#ifdef DEBUG
+/*
+ * check log record header for recovery
+ */
+
+static void
+xlog_header_check_dump(xfs_mount_t *mp, xlog_rec_header_t *head)
+{
+    int b;
+
+    printk("xlog_header_check_dump:\n	 SB : uuid = ");
+    for (b=0;b<16;b++) printk("%02x",((unsigned char *)&mp->m_sb.sb_uuid)[b]);
+    printk(", fmt = %d\n",XLOG_FMT);
+    printk("	log: uuid = ");
+    for (b=0;b<16;b++) printk("%02x",((unsigned char *)&head->h_fs_uuid)[b]);
+    printk(", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
+}
+#endif
+
+/*
+ * check log record header for recovery
+ */
+
+STATIC int
+xlog_header_check_recover(xfs_mount_t *mp, xlog_rec_header_t *head)
+{
+    ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
+
+    /*
+     * IRIX doesn't write the h_fmt field and leaves it zeroed
+     * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
+     * a dirty log created in IRIX.
+     */
+
+    if (INT_GET(head->h_fmt, ARCH_CONVERT) != XLOG_FMT) {
+	xlog_warn("XFS: dirty log written in incompatible format - can't recover");
+#ifdef DEBUG
+	xlog_header_check_dump(mp, head);
+#endif
+	return XFS_ERROR(EFSCORRUPTED);
+    } else if (!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid)) {
+	xlog_warn("XFS: dirty log entry has mismatched uuid - can't recover");
+#ifdef DEBUG
+	xlog_header_check_dump(mp, head);
+#endif
+	return XFS_ERROR(EFSCORRUPTED);
+    }
+
+    return 0;
+}
+
+/*
+ * check the log record header against this mount's uuid
+ */
+
+STATIC int
+xlog_header_check_mount(xfs_mount_t *mp, xlog_rec_header_t *head)
+{
+    ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
+
+    if (uuid_is_nil(&head->h_fs_uuid)) {
+
+	/*
+	 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
+	 * h_fs_uuid is nil, we assume this log was last mounted
+	 * by IRIX and continue.
+	 */
+
+	xlog_warn("XFS: nil uuid in log - IRIX style log");
+
+    } else if (!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid)) {
+	xlog_warn("XFS: log has mismatched uuid - can't recover");
+#ifdef DEBUG
+	xlog_header_check_dump(mp, head);
+#endif
+	return XFS_ERROR(EFSCORRUPTED);
+    }
+
+    return 0;
+}
+
+STATIC void
+xlog_recover_iodone(
+	struct xfs_buf	*bp)
+{
+	xfs_mount_t	*mp;
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
+
+	if (XFS_BUF_GETERROR(bp)) {
+		/*
+		 * We're not going to bother about retrying
+		 * this during recovery. One strike!
+		 */
+		mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
+		xfs_ioerror_alert("xlog_recover_iodone",
+				  mp, bp, XFS_BUF_ADDR(bp));
+		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+	}
+	XFS_BUF_SET_FSPRIVATE(bp, NULL);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	xfs_biodone(bp);
+}
+
+/*
+ * This routine finds (to an approximation) the first block in the physical
+ * log which contains the given cycle.	It uses a binary search algorithm.
+ * Note that the algorithm cannot be perfect because the disk will not
+ * necessarily be perfect.
+ */
+int
+xlog_find_cycle_start(xlog_t	*log,
+		      xfs_buf_t *bp,
+		      xfs_daddr_t	first_blk,
+		      xfs_daddr_t	*last_blk,
+		      uint	cycle)
+{
+	xfs_daddr_t mid_blk;
+	uint	mid_cycle;
+	int	error;
+
+	mid_blk = BLK_AVG(first_blk, *last_blk);
+	while (mid_blk != first_blk && mid_blk != *last_blk) {
+		if ((error = xlog_bread(log, mid_blk, 1, bp)))
+			return error;
+		mid_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
+		if (mid_cycle == cycle) {
+			*last_blk = mid_blk;
+			/* last_half_cycle == mid_cycle */
+		} else {
+			first_blk = mid_blk;
+			/* first_half_cycle == mid_cycle */
+		}
+		mid_blk = BLK_AVG(first_blk, *last_blk);
+	}
+	ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
+	       (mid_blk == *last_blk && mid_blk-1 == first_blk));
+
+	return 0;
+}	/* xlog_find_cycle_start */
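+
+/*
+ * Editorial illustration (not from the original sources; assumes
+ * BLK_AVG rounds down): for an 8-block log stamped with cycles
+ *
+ *	blk:   0  1  2  3  4  5  6  7
+ *	cycle: 2  2  2  1  1  1  1  1
+ *
+ * a search for cycle 1 with first_blk = 0 and *last_blk = 7 goes
+ *	mid = 3 (cycle 1) -> last  = 3
+ *	mid = 1 (cycle 2) -> first = 1
+ *	mid = 2 (cycle 2) -> first = 2
+ * and terminates with *last_blk = 3, the first block of cycle 1.
+ */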
+
+
+/*
+ * Check that the range of blocks does not contain the cycle number
+ * given.  The scan needs to occur from front to back and the ptr into the
+ * region must be updated since a later routine will need to perform another
+ * test.  If the region is completely good, we end up returning the same
+ * last block number.
+ *
+ * Set new_blk to -1 if we encounter no errors.  This is an invalid block number
+ * since we don't ever expect logs to get this large.
+ */
+
+STATIC int
+xlog_find_verify_cycle( xlog_t		*log,
+			xfs_daddr_t	start_blk,
+			int		nbblks,
+			uint		stop_on_cycle_no,
+			xfs_daddr_t	*new_blk)
+{
+	xfs_daddr_t		i, j;
+	uint			cycle;
+	xfs_buf_t		*bp;
+	char			*buf	    = NULL;
+	int			error	    = 0;
+	xfs_daddr_t		bufblks;
+
+	bufblks = 1 << ffs(nbblks);
+
+	while (!(bp = xlog_get_bp(bufblks, log->l_mp))) {
+		/* can't get enough memory to do everything in one big buffer */
+		bufblks >>= 1;
+		if (!bufblks)
+			return ENOMEM;
+	}
+
+
+	for (i = start_blk; i < start_blk + nbblks; i += bufblks)  {
+		int bcount = min(bufblks, (start_blk + nbblks - i));
+
+		if ((error = xlog_bread(log, i, bcount, bp)))
+			goto out;
+
+		buf = XFS_BUF_PTR(bp);
+		for (j = 0; j < bcount; j++) {
+			cycle = GET_CYCLE(buf, ARCH_CONVERT);
+			if (cycle == stop_on_cycle_no) {
+				*new_blk = i+j;
+				goto out;
+			}
+
+			buf += BBSIZE;
+		}
+	}
+
+	*new_blk = -1;
+
+out:
+	xlog_put_bp(bp);
+
+	return error;
+}	/* xlog_find_verify_cycle */
+
+
+/*
+ * Potentially backup over partial log record write.
+ *
+ * In the typical case, last_blk is the number of the block directly after
+ * a good log record.  Therefore, we subtract one to get the block number
+ * of the last block in the given buffer.  extra_bblks contains the number
+ * of blocks we would have read on a previous read.  This happens when the
+ * last log record is split over the end of the physical log.
+ *
+ * extra_bblks is the number of blocks potentially verified on a previous
+ * call to this routine.
+ */
+
+STATIC int
+xlog_find_verify_log_record(xlog_t	*log,
+			    xfs_daddr_t start_blk,
+			    xfs_daddr_t *last_blk,
+			    int		extra_bblks)
+{
+    xfs_daddr_t		i;
+    xfs_buf_t		*bp;
+    char		*buf	    = NULL;
+    xlog_rec_header_t	*head	    = NULL;
+    int			error	    = 0;
+    int			smallmem    = 0;
+    int			num_blks    = *last_blk - start_blk;
+    int			xhdrs;
+
+    ASSERT(start_blk != 0 || *last_blk != start_blk);
+
+    if (!(bp = xlog_get_bp(num_blks, log->l_mp))) {
+	if (!(bp = xlog_get_bp(1, log->l_mp)))
+	    return ENOMEM;
+	smallmem = 1;
+	buf = XFS_BUF_PTR(bp);
+    } else {
+	if ((error = xlog_bread(log, start_blk, num_blks, bp)))
+	    goto out;
+	buf = XFS_BUF_PTR(bp) + (num_blks - 1) * BBSIZE;
+    }
+
+
+    for (i=(*last_blk)-1; i>=0; i--) {
+	if (i < start_blk) {
+	    /* legal log record not found */
+	    xlog_warn("XFS: Log inconsistent (didn't find previous header)");
+	    ASSERT(0);
+	    error = XFS_ERROR(EIO);
+	    goto out;
+	}
+
+	if (smallmem && (error = xlog_bread(log, i, 1, bp)))
+	    goto out;
+	head = (xlog_rec_header_t*)buf;
+
+	if (INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
+	    break;
+
+	if (!smallmem)
+	    buf -= BBSIZE;
+    }
+
+    /*
+     * We hit the beginning of the physical log & still no header.  Return
+     * to caller.  If caller can handle a return of -1, then this routine
+     * will be called again for the end of the physical log.
+     */
+    if (i == -1) {
+	error = -1;
+	goto out;
+    }
+
+    /* We have the final block of the good log (the first block
+     * of the log record _before_ the head), so we check the uuid.
+     */
+
+    if ((error = xlog_header_check_mount(log->l_mp, head)))
+	goto out;
+
+    /*
+     * We may have found a log record header before we expected one.
+     * last_blk will be the 1st block # with a given cycle #.  We may end
+     * up reading an entire log record.	 In this case, we don't want to
+     * reset last_blk.	Only when last_blk points in the middle of a log
+     * record do we update last_blk.
+     */
+    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+	int h_size = INT_GET(head->h_size, ARCH_CONVERT);
+	xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
+	if (h_size % XLOG_HEADER_CYCLE_SIZE)
+		xhdrs++;
+    } else {
+	xhdrs = 1;
+    }
+
+    if (*last_blk - i + extra_bblks
+		!= BTOBB(INT_GET(head->h_len, ARCH_CONVERT))+xhdrs)
+	    *last_blk = i;
+
+out:
+    xlog_put_bp(bp);
+
+    return error;
+}	/* xlog_find_verify_log_record */
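+
+/*
+ * Editorial example with hypothetical numbers: suppose *last_blk is 20,
+ * the first header found scanning backwards is at block 17, and that
+ * record covers 5 blocks of data (BTOBB(h_len) == 5) plus one header
+ * block (xhdrs == 1).  Then *last_blk - i + extra_bblks == 20 - 17 + 0
+ * == 3, which is not 5 + 1, so block 20 points into the middle of that
+ * record and *last_blk is pulled back to 17, the start of the last
+ * complete record.
+ */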
+
+/*
+ * Head is defined to be the point of the log where the next log write
+ * could go.  This means that incomplete LR writes at the end are
+ * eliminated when calculating the head.  We aren't guaranteed that previous
+ * LR have complete transactions.  We only know that a cycle number of
+ * current cycle number -1 won't be present in the log if we start writing
+ * from our current block number.
+ *
+ * last_blk contains the block number of the first block with a given
+ * cycle number.
+ *
+ * Also called from xfs_log_print.c
+ *
+ * Return: zero if normal, non-zero if error.
+ */
+int
+xlog_find_head(xlog_t  *log,
+	       xfs_daddr_t *return_head_blk)
+{
+    xfs_buf_t	*bp;
+    xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
+    int	    num_scan_bblks;
+    uint    first_half_cycle, last_half_cycle;
+    uint    stop_on_cycle;
+    int	    error, log_bbnum = log->l_logBBsize;
+
+    /* Is the end of the log device zeroed? */
+    if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
+	*return_head_blk = first_blk;
+
+	/* is the whole lot zeroed? */
+	if (!first_blk) {
+	    /* Linux XFS shouldn't generate totally zeroed logs -
+	     * mkfs etc write a dummy unmount record to a fresh
+	     * log so we can store the uuid in there
+	     */
+	    xlog_warn("XFS: totally zeroed log\n");
+	}
+
+	return 0;
+    } else if (error) {
+	xlog_warn("XFS: empty log check failed");
+	return error;
+    }
+
+    first_blk = 0;				/* get cycle # of 1st block */
+    bp = xlog_get_bp(1,log->l_mp);
+    if (!bp)
+	return ENOMEM;
+    if ((error = xlog_bread(log, 0, 1, bp)))
+	goto bp_err;
+    first_half_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
+
+    last_blk = head_blk = log_bbnum-1;		/* get cycle # of last block */
+    if ((error = xlog_bread(log, last_blk, 1, bp)))
+	goto bp_err;
+    last_half_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
+    ASSERT(last_half_cycle != 0);
+
+    /*
+     * If the 1st half cycle number is equal to the last half cycle number,
+     * then the entire log is stamped with the same cycle number.  In this
+     * case, head_blk can't be set to zero (which makes sense).	 The below
+     * math doesn't work out properly with head_blk equal to zero.  Instead,
+     * we set it to log_bbnum which is an illegal block number, but this
+     * value makes the math correct.  If head_blk doesn't change through
+     * all the tests below, *head_blk is set to zero at the very end rather
+     * than log_bbnum.	In a sense, log_bbnum and zero are the same block
+     * in a circular file.
+     */
+    if (first_half_cycle == last_half_cycle) {
+	/*
+	 * In this case we believe that the entire log should have cycle
+	 * number last_half_cycle.  We need to scan backwards from the
+	 * end verifying that there are no holes still containing
+	 * last_half_cycle - 1.	 If we find such a hole, then the start
+	 * of that hole will be the new head.  The simple case looks like
+	 *	  x | x ... | x - 1 | x
+	 * Another case that fits this picture would be
+	 *	  x | x + 1 | x ... | x
+	 * In this case the head really is somewhere at the end of the
+	 * log, as one of the latest writes at the beginning was incomplete.
+	 * One more case is
+	 *	  x | x + 1 | x ... | x - 1 | x
+	 * This is really the combination of the above two cases, and the
+	 * head has to end up at the start of the x-1 hole at the end of
+	 * the log.
+	 *
+	 * In the 256k log case, we will read from the beginning to the
+	 * end of the log and search for cycle numbers equal to x-1.  We
+	 * don't worry about the x+1 blocks that we encounter, because
+	 * we know that they cannot be the head since the log started with
+	 * x.
+	 */
+	head_blk = log_bbnum;
+	stop_on_cycle = last_half_cycle - 1;
+    } else {
+	/*
+	 * In this case we want to find the first block with cycle number
+	 * matching last_half_cycle.  We expect the log to be some
+	 * variation on
+	 *	  x + 1 ... | x ...
+	 * The first block with cycle number x (last_half_cycle) will be
+	 * where the new head belongs.	First we do a binary search for
+	 * the first occurrence of last_half_cycle.  The binary search
+	 * may not be totally accurate, so then we scan back from there
+	 * looking for occurrences of last_half_cycle before us.  If
+	 * that backwards scan wraps around the beginning of the log,
+	 * then we look for occurrences of last_half_cycle - 1 at the
+	 * end of the log.  The cases we're looking for look like
+	 *	  x + 1 ... | x | x + 1 | x ...
+	 *				 ^ binary search stopped here
+	 * or
+	 *	  x + 1 ... | x ... | x - 1 | x
+	 *	  <---------> less than scan distance
+	 */
+	stop_on_cycle = last_half_cycle;
+	if ((error = xlog_find_cycle_start(log, bp, first_blk,
+					  &head_blk, last_half_cycle)))
+	    goto bp_err;
+    }
+
+    /*
+     * Now validate the answer.	 Scan back some number of maximum possible
+     * blocks and make sure each one has the expected cycle number.  The
+     * maximum is determined by the total possible amount of buffering
+     * in the in-core log.  The following number can be made tighter if
+     * we actually look at the block size of the filesystem.
+     */
+    num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
+    if (head_blk >= num_scan_bblks) {
+	/*
+	 * We are guaranteed that the entire check can be performed
+	 * in one buffer.
+	 */
+	start_blk = head_blk - num_scan_bblks;
+	if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks,
+					 stop_on_cycle, &new_blk)))
+	    goto bp_err;
+	if (new_blk != -1)
+	    head_blk = new_blk;
+    } else {			/* need to read 2 parts of log */
+	/*
+	 * We are going to scan backwards in the log in two parts.  First
+	 * we scan the physical end of the log.	 In this part of the log,
+	 * we are looking for blocks with cycle number last_half_cycle - 1.
+	 * If we find one, then we know that the log starts there, as we've
+	 * found a hole that didn't get written in going around the end
+	 * of the physical log.	 The simple case for this is
+	 *	  x + 1 ... | x ... | x - 1 | x
+	 *	  <---------> less than scan distance
+	 * If all of the blocks at the end of the log have cycle number
+	 * last_half_cycle, then we check the blocks at the start of the
+	 * log looking for occurrences of last_half_cycle.  If we find one,
+	 * then our current estimate for the location of the first
+	 * occurrence of last_half_cycle is wrong and we move back to the
+	 * hole we've found.  This case looks like
+	 *	  x + 1 ... | x | x + 1 | x ...
+	 *				 ^ binary search stopped here
+	 * Another case we need to handle that only occurs in 256k logs is
+	 *	  x + 1 ... | x ... | x+1 | x ...
+	 *		     ^ binary search stops here
+	 * In a 256k log, the scan at the end of the log will see the x+1
+	 * blocks.  We need to skip past those since that is certainly not
+	 * the head of the log.	 By searching for last_half_cycle-1 we
+	 * accomplish that.
+	 */
+	start_blk = log_bbnum - num_scan_bblks + head_blk;
+	ASSERT(head_blk <= INT_MAX && (xfs_daddr_t) num_scan_bblks-head_blk >= 0);
+	if ((error = xlog_find_verify_cycle(log, start_blk,
+			num_scan_bblks-(int)head_blk, (stop_on_cycle - 1),
+			&new_blk)))
+		goto bp_err;
+	if (new_blk != -1) {
+	    head_blk = new_blk;
+	    goto bad_blk;
+	}
+
+	/*
+	 * Scan beginning of log now.  The last part of the physical log
+	 * is good.  This scan needs to verify that it doesn't find the
+	 * last_half_cycle.
+	 */
+	start_blk = 0;
+	ASSERT(head_blk <= INT_MAX);
+	if ((error = xlog_find_verify_cycle(log, start_blk, (int) head_blk,
+					 stop_on_cycle, &new_blk)))
+	    goto bp_err;
+	if (new_blk != -1)
+	    head_blk = new_blk;
+    }
+
+bad_blk:
+    /*
+     * Now we need to make sure head_blk is not pointing to a block in
+     * the middle of a log record.
+     */
+    num_scan_bblks = BTOBB(XLOG_MAX_RECORD_BSIZE);
+    if (head_blk >= num_scan_bblks) {
+	start_blk = head_blk - num_scan_bblks;	/* don't read head_blk */
+
+	/* start ptr at last block ptr before head_blk */
+	if ((error = xlog_find_verify_log_record(log,
+						 start_blk,
+						 &head_blk,
+						 0)) == -1) {
+	    error = XFS_ERROR(EIO);
+	    goto bp_err;
+	} else if (error)
+	    goto bp_err;
+    } else {
+	start_blk = 0;
+	ASSERT(head_blk <= INT_MAX);
+	if ((error = xlog_find_verify_log_record(log,
+						 start_blk,
+						 &head_blk,
+						 0)) == -1) {
+	    /* We hit the beginning of the log during our search */
+	    start_blk = log_bbnum - num_scan_bblks + head_blk;
+	    new_blk = log_bbnum;
+	    ASSERT(start_blk <= INT_MAX && (xfs_daddr_t) log_bbnum-start_blk >= 0);
+	    ASSERT(head_blk <= INT_MAX);
+	    if ((error = xlog_find_verify_log_record(log,
+						     start_blk,
+						     &new_blk,
+						     (int)head_blk)) == -1) {
+		error = XFS_ERROR(EIO);
+		goto bp_err;
+	    } else if (error)
+		goto bp_err;
+	    if (new_blk != log_bbnum)
+		head_blk = new_blk;
+	} else if (error)
+	    goto bp_err;
+    }
+
+    xlog_put_bp(bp);
+    if (head_blk == log_bbnum)
+	    *return_head_blk = 0;
+    else
+	    *return_head_blk = head_blk;
+    /*
+     * When returning here, we have a good block number.  Bad block
+     * means that during a previous crash, we didn't have a clean break
+     * from cycle number N to cycle number N-1.	 In this case, we need
+     * to find the first block with cycle number N-1.
+     */
+    return 0;
+
+bp_err:
+	xlog_put_bp(bp);
+
+	if (error)
+	    xlog_warn("XFS: failed to find log head");
+
+	return error;
+}	/* xlog_find_head */
+
+/*
+ * Find the sync block number or the tail of the log.
+ *
+ * This will be the block number of the last record to have its
+ * associated buffers synced to disk.  Every log record header has
+ * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
+ * to get a sync block number.	The only concern is to figure out which
+ * log record header to believe.
+ *
+ * The following algorithm uses the log record header with the largest
+ * lsn.	 The entire log record does not need to be valid.  We only care
+ * that the header is valid.
+ *
+ * We could speed up the search by using the current head_blk buffer, but it is not
+ * available.
+ */
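+/*
+ * Editorial note (a sketch, not a definition from this file): an
+ * xfs_lsn_t packs the cycle number into the high 32 bits and the
+ * block number into the low 32 bits, roughly
+ *
+ *	lsn   = ((xfs_lsn_t)cycle << 32) | block;
+ *	cycle = CYCLE_LSN(lsn, arch);
+ *	block = BLOCK_LSN(lsn, arch);
+ *
+ * which is why *tail_blk can be read straight out of h_tail_lsn below.
+ */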
+int
+xlog_find_tail(xlog_t  *log,
+	       xfs_daddr_t *head_blk,
+	       xfs_daddr_t *tail_blk,
+	       int readonly)
+{
+	xlog_rec_header_t	*rhead;
+	xlog_op_header_t	*op_head;
+	xfs_buf_t		*bp;
+	int			error, i, found;
+	xfs_daddr_t		umount_data_blk;
+	xfs_daddr_t		after_umount_blk;
+	xfs_lsn_t		tail_lsn;
+	int			hblks;
+
+	found = error = 0;
+
+	/*
+	 * Find previous log record
+	 */
+	if ((error = xlog_find_head(log, head_blk)))
+		return error;
+
+	bp = xlog_get_bp(1,log->l_mp);
+	if (!bp)
+		return ENOMEM;
+	if (*head_blk == 0) {				/* special case */
+		if ((error = xlog_bread(log, 0, 1, bp)))
+			goto bread_err;
+		if (GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT) == 0) {
+			*tail_blk = 0;
+			/* leave all other log inited values alone */
+			goto exit;
+		}
+	}
+
+	/*
+	 * Search backwards looking for log record header block
+	 */
+	ASSERT(*head_blk < INT_MAX);
+	for (i=(int)(*head_blk)-1; i>=0; i--) {
+		if ((error = xlog_bread(log, i, 1, bp)))
+			goto bread_err;
+		if (INT_GET(*(uint *)(XFS_BUF_PTR(bp)), ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM) {
+			found = 1;
+			break;
+		}
+	}
+	/*
+	 * If we haven't found the log record header block, start looking
+	 * again from the end of the physical log.  XXXmiken: There should be
+	 * a check here to make sure we didn't search more than N blocks in
+	 * the previous code.
+	 */
+	if (!found) {
+		for (i=log->l_logBBsize-1; i>=(int)(*head_blk); i--) {
+			if ((error = xlog_bread(log, i, 1, bp)))
+				goto bread_err;
+			if (INT_GET(*(uint*)(XFS_BUF_PTR(bp)), ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM) {
+				found = 2;
+				break;
+			}
+		}
+	}
+	if (!found) {
+		xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
+		ASSERT(0);
+		xlog_put_bp(bp);
+		return XFS_ERROR(EIO);
+	}
+
+	/* find blk_no of tail of log */
+	rhead = (xlog_rec_header_t *)XFS_BUF_PTR(bp);
+	*tail_blk = BLOCK_LSN(rhead->h_tail_lsn, ARCH_CONVERT);
+
+	/*
+	 * Reset log values according to the state of the log when we
+	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
+	 * one because the next write starts a new cycle rather than
+	 * continuing the cycle of the last good log record.  At this
+	 * point we have guaranteed that all partial log records have been
+	 * accounted for.  Therefore, we know that the last good log record
+	 * written was complete and ended exactly on the end boundary
+	 * of the physical log.
+	 */
+	log->l_prev_block = i;
+	log->l_curr_block = (int)*head_blk;
+	log->l_curr_cycle = INT_GET(rhead->h_cycle, ARCH_CONVERT);
+	if (found == 2)
+		log->l_curr_cycle++;
+	log->l_tail_lsn = INT_GET(rhead->h_tail_lsn, ARCH_CONVERT);
+	log->l_last_sync_lsn = INT_GET(rhead->h_lsn, ARCH_CONVERT);
+	log->l_grant_reserve_cycle = log->l_curr_cycle;
+	log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
+	log->l_grant_write_cycle = log->l_curr_cycle;
+	log->l_grant_write_bytes = BBTOB(log->l_curr_block);
+
+	/*
+	 * Look for unmount record.  If we find it, then we know there
+	 * was a clean unmount.	 Since 'i' could be the last block in
+	 * the physical log, we convert to a log block before comparing
+	 * to the head_blk.
+	 *
+	 * Save the current tail lsn to use to pass to
+	 * xlog_clear_stale_blocks() below.  We won't want to clear the
+	 * unmount record if there is one, so we pass the lsn of the
+	 * unmount record rather than the block after it.
+	 */
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+		int	h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
+		int	h_version = INT_GET(rhead->h_version, ARCH_CONVERT);
+		if ((h_version & XLOG_VERSION_2) &&
+		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+			if (h_size % XLOG_HEADER_CYCLE_SIZE)
+				hblks++;
+		} else {
+			hblks = 1;
+		}
+	} else {
+		hblks = 1;
+	}
+	after_umount_blk = (i + hblks +
+		(int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT))) % log->l_logBBsize;
+	tail_lsn = log->l_tail_lsn;
+	if (*head_blk == after_umount_blk && INT_GET(rhead->h_num_logops, ARCH_CONVERT) == 1) {
+		umount_data_blk = (i + hblks) % log->l_logBBsize;
+		if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
+			goto bread_err;
+		}
+		op_head = (xlog_op_header_t *)XFS_BUF_PTR(bp);
+		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
+			/*
+			 * Set tail and last sync so that newly written
+			 * log records will point recovery to after the
+			 * current unmount record.
+			 */
+			ASSIGN_ANY_LSN(log->l_tail_lsn, log->l_curr_cycle,
+					after_umount_blk, ARCH_NOCONVERT);
+			ASSIGN_ANY_LSN(log->l_last_sync_lsn, log->l_curr_cycle,
+					after_umount_blk, ARCH_NOCONVERT);
+			*tail_blk = after_umount_blk;
+		}
+	}
+
+#ifdef __KERNEL__
+	/*
+	 * Make sure that there are no blocks in front of the head
+	 * with the same cycle number as the head.  This can happen
+	 * because we allow multiple outstanding log writes concurrently,
+	 * and the later writes might make it out before earlier ones.
+	 *
+	 * We use the lsn from before modifying it so that we'll never
+	 * overwrite the unmount record after a clean unmount.
+	 *
+	 * Do this only if we are going to recover the filesystem
+	 */
+	if (!readonly)
+		error = xlog_clear_stale_blocks(log, tail_lsn);
+#endif
+
+bread_err:
+exit:
+	xlog_put_bp(bp);
+
+	if (error)
+		xlog_warn("XFS: failed to locate log tail");
+
+	return error;
+}	/* xlog_find_tail */
+
+
+/*
+ * Is the log zeroed at all?
+ *
+ * The last binary search should be changed to perform an X block read
+ * once X becomes small enough.	 You can then search linearly through
+ * the X blocks.  This will cut down on the number of reads we need to do.
+ *
+ * If the log is partially zeroed, this routine will pass back the blkno
+ * of the first block with cycle number 0.  It won't have a complete LR
+ * preceding it.
+ *
+ * Return:
+ *	0  => the log is completely written to
+ *	-1 => use *blk_no as the first block of the log
+ *	>0 => error has occurred
+ */
+int
+xlog_find_zeroed(struct log	*log,
+		 xfs_daddr_t	*blk_no)
+{
+	xfs_buf_t	*bp;
+	uint		first_cycle, last_cycle;
+	xfs_daddr_t	new_blk, last_blk, start_blk;
+	xfs_daddr_t	num_scan_bblks;
+	int		error, log_bbnum = log->l_logBBsize;
+
+	error = 0;
+	/* check totally zeroed log */
+	bp = xlog_get_bp(1,log->l_mp);
+	if (!bp)
+		return ENOMEM;
+	if ((error = xlog_bread(log, 0, 1, bp)))
+		goto bp_err;
+	first_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
+	if (first_cycle == 0) {		/* completely zeroed log */
+		*blk_no = 0;
+		xlog_put_bp(bp);
+		return -1;
+	}
+
+	/* check partially zeroed log */
+	if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
+		goto bp_err;
+	last_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
+	if (last_cycle != 0) {		/* log completely written to */
+		xlog_put_bp(bp);
+		return 0;
+	} else if (first_cycle != 1) {
+		/*
+		 * If the cycle of the last block is zero, the cycle of
+		 * the first block must be 1. If it's not, maybe we're
+		 * not looking at a log... Bail out.
+		 */
+		xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
+		xlog_put_bp(bp);
+		return XFS_ERROR(EINVAL);
+	}
+
+	/* we have a partially zeroed log */
+	last_blk = log_bbnum-1;
+	if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
+		goto bp_err;
+
+	/*
+	 * Validate the answer.	 Because there is no way to guarantee that
+	 * the entire log is made up of log records which are the same size,
+	 * we scan over the defined maximum blocks.  At this point, the maximum
+	 * is not chosen to mean anything special.   XXXmiken
+	 */
+	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
+	ASSERT(num_scan_bblks <= INT_MAX);
+
+	if (last_blk < num_scan_bblks)
+		num_scan_bblks = last_blk;
+	start_blk = last_blk - num_scan_bblks;
+
+	/*
+	 * We search for any instances of cycle number 0 that occur before
+	 * our current estimate of the head.  What we're trying to detect is
+	 *	  1 ... | 0 | 1 | 0...
+	 *			 ^ binary search ends here
+	 */
+	if ((error = xlog_find_verify_cycle(log, start_blk,
+					 (int)num_scan_bblks, 0, &new_blk)))
+		goto bp_err;
+	if (new_blk != -1)
+		last_blk = new_blk;
+
+	/*
+	 * Potentially backup over partial log record write.  We don't need
+	 * to search the end of the log because we know it is zero.
+	 */
+	if ((error = xlog_find_verify_log_record(log, start_blk,
+				&last_blk, 0)) == -1) {
+	    error = XFS_ERROR(EIO);
+	    goto bp_err;
+	} else if (error)
+	    goto bp_err;
+
+	*blk_no = last_blk;
+bp_err:
+	xlog_put_bp(bp);
+	if (error)
+		return error;
+	return -1;
+}	/* xlog_find_zeroed */
+
+/*
+ * This is simply a subroutine used by xlog_clear_stale_blocks() below
+ * to initialize a buffer full of empty log record headers and write
+ * them into the log.
+ */
+STATIC int
+xlog_write_log_records(
+	xlog_t	*log,
+	int	cycle,
+	int	start_block,
+	int	blocks,
+	int	tail_cycle,
+	int	tail_block)
+{
+	xlog_rec_header_t	*recp;
+	int			i, j;
+	int			end_block = start_block + blocks;
+	int			error	    = 0;
+	xfs_buf_t		*bp;
+	char			*buf;
+	int			bufblks;
+
+	bufblks = 1 << ffs(blocks);
+	while (!(bp = xlog_get_bp(bufblks, log->l_mp))) {
+		bufblks >>= 1;
+		if (!bufblks)
+			return ENOMEM;
+	}
+
+	buf = XFS_BUF_PTR(bp);
+	recp = (xlog_rec_header_t*)buf;
+
+	memset(buf, 0, BBSIZE);
+	INT_SET(recp->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
+	INT_SET(recp->h_cycle, ARCH_CONVERT, cycle);
+	INT_SET(recp->h_version, ARCH_CONVERT,
+			XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
+	ASSIGN_ANY_LSN(recp->h_tail_lsn, tail_cycle, tail_block, ARCH_CONVERT);
+
+	for (i = start_block; i < end_block; i += bufblks) {
+		int bcount = min(bufblks, end_block - start_block);
+		/* with plenty of memory, we duplicate the block
+		 * right through the buffer and modify each entry
+		 */
+		ASSIGN_ANY_LSN(recp->h_lsn, cycle, i, ARCH_CONVERT);
+		for (j = 1; j < bcount; j++) {
+			buf += BBSIZE;
+			recp = (xlog_rec_header_t*)buf;
+			memcpy(buf, XFS_BUF_PTR(bp), BBSIZE);
+			ASSIGN_ANY_LSN(recp->h_lsn, cycle, i+j, ARCH_CONVERT);
+		}
+		/* then write the whole lot out at once */
+		error = xlog_bwrite(log, start_block, bcount, bp);
+		if (error)
+			break;
+		start_block += bcount;
+		buf = XFS_BUF_PTR(bp);
+		recp = (xlog_rec_header_t*)buf;
+	}
+	xlog_put_bp(bp);
+
+	return error;
+}
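+
+/*
+ * Editorial note: bufblks = 1 << ffs(blocks) above derives a power-of-
+ * two buffer size from the lowest set bit of blocks (e.g. blocks == 5
+ * gives bufblks == 2), so the loop may take several passes; bcount is
+ * clamped each pass so the writes never run past end_block.
+ */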
+
+/*
+ * This routine is called to blow away any incomplete log writes out
+ * in front of the log head.  We do this so that we won't become confused
+ * if we come up, write only a little bit more, and then crash again.
+ * If we leave the partial log records out there, this situation could
+ * cause us to think those partial writes are valid blocks since they
+ * have the current cycle number.  We get rid of them by overwriting them
+ * with empty log records with the old cycle number rather than the
+ * current one.
+ *
+ * The tail lsn is passed in rather than taken from
+ * the log so that we will not write over the unmount record after a
+ * clean unmount in a 512 block log.  Doing so would leave the log without
+ * any valid log records in it until a new one was written.  If we crashed
+ * during that time we would not be able to recover.
+ */
+STATIC int
+xlog_clear_stale_blocks(
+	xlog_t		*log,
+	xfs_lsn_t	tail_lsn)
+{
+	int			tail_cycle;
+	int			head_cycle;
+	int			tail_block;
+	int			head_block;
+	int			tail_distance;
+	int			max_distance;
+	int			distance;
+	int			error;
+
+	tail_cycle = CYCLE_LSN(tail_lsn, ARCH_NOCONVERT);
+	tail_block = BLOCK_LSN(tail_lsn, ARCH_NOCONVERT);
+	head_cycle = log->l_curr_cycle;
+	head_block = log->l_curr_block;
+
+	/*
+	 * Figure out the distance between the new head of the log
+	 * and the tail.  We want to write over any blocks beyond the
+	 * head that we may have written just before the crash, but
+	 * we don't want to overwrite the tail of the log.
+	 */
+	if (head_cycle == tail_cycle) {
+		/*
+		 * The tail is behind the head in the physical log,
+		 * so the distance from the head to the tail is the
+		 * distance from the head to the end of the log plus
+		 * the distance from the beginning of the log to the
+		 * tail.
+		 */
+		if (head_block < tail_block || head_block >= log->l_logBBsize)
+			return XFS_ERROR(EFSCORRUPTED);
+		tail_distance = tail_block +
+				(log->l_logBBsize - head_block);
+	} else {
+		/*
+		 * The head is behind the tail in the physical log,
+		 * so the distance from the head to the tail is just
+		 * the tail block minus the head block.
+		 */
+		if (head_block >= tail_block || head_cycle != (tail_cycle + 1))
+			return XFS_ERROR(EFSCORRUPTED);
+		tail_distance = tail_block - head_block;
+	}
+
+	/*
+	 * If the head is right up against the tail, we can't clear
+	 * anything.
+	 */
+	if (tail_distance <= 0) {
+		ASSERT(tail_distance == 0);
+		return 0;
+	}
+
+	max_distance = XLOG_TOTAL_REC_SHIFT(log);
+	/*
+	 * Take the smaller of the maximum amount of outstanding I/O
+	 * we could have and the distance to the tail to clear out.
+	 * We take the smaller so that we don't overwrite the tail and
+	 * we don't waste all day writing from the head to the tail
+	 * for no reason.
+	 */
+	max_distance = MIN(max_distance, tail_distance);
+
+	if ((head_block + max_distance) <= log->l_logBBsize) {
+		/*
+		 * We can stomp all the blocks we need to without
+		 * wrapping around the end of the log.	Just do it
+		 * in a single write.  Use the cycle number of the
+		 * current cycle minus one so that the log will look like:
+		 *     n ... | n - 1 ...
+		 */
+		error = xlog_write_log_records(log, (head_cycle - 1),
+				head_block, max_distance, tail_cycle,
+				tail_block);
+		if (error)
+			return error;
+	} else {
+		/*
+		 * We need to wrap around the end of the physical log in
+		 * order to clear all the blocks.  Do it in two separate
+		 * I/Os.  The first write should be from the head to the
+		 * end of the physical log, and it should use the current
+		 * cycle number minus one just like above.
+		 */
+		distance = log->l_logBBsize - head_block;
+		error = xlog_write_log_records(log, (head_cycle - 1),
+				head_block, distance, tail_cycle,
+				tail_block);
+
+		if (error)
+			return error;
+
+		/*
+		 * Now write the blocks at the start of the physical log.
+		 * This writes the remainder of the blocks we want to clear.
+		 * It uses the current cycle number since we're now on the
+		 * same cycle as the head so that we get:
+		 *    n ... n ... | n - 1 ...
+		 *    ^^^^^ blocks we're writing
+		 */
+		distance = max_distance - (log->l_logBBsize - head_block);
+		error = xlog_write_log_records(log, head_cycle, 0, distance,
+				tail_cycle, tail_block);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
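+
+/*
+ * Editorial example of the wrap case above: in a 100-block log with
+ * the head at block 90 on cycle n and max_distance == 20, the first
+ * write stamps blocks 90..99 with cycle n - 1 and the second stamps
+ * blocks 0..9 with cycle n, producing the
+ *	n ... n ... | n - 1 ...
+ * picture from the comment in the second branch.
+ */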
+
+/******************************************************************************
+ *
+ *		Log recover routines
+ *
+ ******************************************************************************
+ */
+
+STATIC xlog_recover_t *
+xlog_recover_find_tid(xlog_recover_t *q,
+		      xlog_tid_t     tid)
+{
+	xlog_recover_t *p = q;
+
+	while (p != NULL) {
+		if (p->r_log_tid == tid)
+		    break;
+		p = p->r_next;
+	}
+	return p;
+}	/* xlog_recover_find_tid */
+
+
+STATIC void
+xlog_recover_put_hashq(xlog_recover_t **q,
+		       xlog_recover_t *trans)
+{
+	trans->r_next = *q;
+	*q = trans;
+}	/* xlog_recover_put_hashq */
+
+
+STATIC void
+xlog_recover_add_item(xlog_recover_item_t **itemq)
+{
+	xlog_recover_item_t *item;
+
+	item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
+	xlog_recover_insert_item_backq(itemq, item);
+}	/* xlog_recover_add_item */
+
+
+STATIC int
+xlog_recover_add_to_cont_trans(xlog_recover_t	*trans,
+			       xfs_caddr_t		dp,
+			       int		len)
+{
+	xlog_recover_item_t	*item;
+	xfs_caddr_t			ptr, old_ptr;
+	int			old_len;
+
+	item = trans->r_itemq;
+	if (item == 0) {
+		/* finish copying rest of trans header */
+		xlog_recover_add_item(&trans->r_itemq);
+		ptr = (xfs_caddr_t)&trans->r_theader+sizeof(xfs_trans_header_t)-len;
+		bcopy(dp, ptr, len); /* s, d, l */
+		return 0;
+	}
+	item = item->ri_prev;
+
+	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
+	old_len = item->ri_buf[item->ri_cnt-1].i_len;
+
+	ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0);
+	bcopy(dp , &ptr[old_len], len); /* s, d, l */
+	item->ri_buf[item->ri_cnt-1].i_len += len;
+	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+	return 0;
+}	/* xlog_recover_add_to_cont_trans */
+
+
+/* The next region to add is the start of a new region.	 It could be
+ * a whole region or it could be the first part of a new region.  Because
+ * of this, the assumption here is that the type and size fields of all
+ * format structures fit into the first 32 bits of the structure.
+ *
+ * This works because all regions must be 32 bit aligned.  Therefore, we
+ * either have both fields or we have neither field.  In the case we have
+ * neither field, the data part of the region is zero length.  We only have
+ * a log_op_header and can throw away the header since a new one will appear
+ * later.  If we have at least 4 bytes, then we can determine how many regions
+ * will appear in the current log item.
+ */
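+/*
+ * Editorial sketch of that layout assumption, using the inode case
+ * (the other *_log_format_t structures are assumed to start the same
+ * way):
+ *
+ *	typedef struct xfs_inode_log_format {
+ *		unsigned short	ilf_type;	-- log item type
+ *		unsigned short	ilf_size;	-- regions in this item
+ *		...
+ *	} xfs_inode_log_format_t;
+ *
+ * so the first 4 bytes of the first region are enough to tell how many
+ * regions (in_f->ilf_size below) the item will carry.
+ */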
+STATIC int
+xlog_recover_add_to_trans(xlog_recover_t	*trans,
+			  xfs_caddr_t		dp,
+			  int			len)
+{
+	xfs_inode_log_format_t	 *in_f;			/* any will do */
+	xlog_recover_item_t	 *item;
+	xfs_caddr_t		 ptr;
+
+	if (!len)
+		return 0;
+	ptr = kmem_zalloc(len, 0);
+	bcopy(dp, ptr, len);
+
+	in_f = (xfs_inode_log_format_t *)ptr;
+	item = trans->r_itemq;
+	if (item == 0) {
+		ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
+		if (len == sizeof(xfs_trans_header_t))
+			xlog_recover_add_item(&trans->r_itemq);
+		bcopy(dp, &trans->r_theader, len); /* s, d, l */
+		return 0;
+	}
+	if (item->ri_prev->ri_total != 0 &&
+	     item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
+		xlog_recover_add_item(&trans->r_itemq);
+	}
+	item = trans->r_itemq;
+	item = item->ri_prev;
+
+	if (item->ri_total == 0) {		/* first region to be added */
+		item->ri_total	= in_f->ilf_size;
+		ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
+		item->ri_buf = kmem_zalloc((item->ri_total *
+					    sizeof(xfs_log_iovec_t)), 0);
+	}
+	ASSERT(item->ri_total > item->ri_cnt);
+	/* Description region is ri_buf[0] */
+	item->ri_buf[item->ri_cnt].i_addr = ptr;
+	item->ri_buf[item->ri_cnt].i_len  = len;
+	item->ri_cnt++;
+	return 0;
+}	/* xlog_recover_add_to_trans */
+
+
+STATIC void
+xlog_recover_new_tid(xlog_recover_t	**q,
+		     xlog_tid_t		tid,
+		     xfs_lsn_t		lsn)
+{
+	xlog_recover_t	*trans;
+
+	trans = kmem_zalloc(sizeof(xlog_recover_t), 0);
+	trans->r_log_tid   = tid;
+	trans->r_lsn	   = lsn;
+	xlog_recover_put_hashq(q, trans);
+}	/* xlog_recover_new_tid */
+
+
+STATIC int
+xlog_recover_unlink_tid(xlog_recover_t	**q,
+			xlog_recover_t	*trans)
+{
+	xlog_recover_t	*tp;
+	int		found = 0;
+
+	ASSERT(trans != 0);
+	if (trans == *q) {
+		*q = (*q)->r_next;
+	} else {
+		tp = *q;
+		while (tp != 0) {
+			if (tp->r_next == trans) {
+				found = 1;
+				break;
+			}
+			tp = tp->r_next;
+		}
+		if (!found) {
+			xlog_warn(
+			     "XFS: xlog_recover_unlink_tid: trans not found");
+			ASSERT(0);
+			return XFS_ERROR(EIO);
+		}
+		tp->r_next = tp->r_next->r_next;
+	}
+	return 0;
+}	/* xlog_recover_unlink_tid */
+
+STATIC void
+xlog_recover_insert_item_backq(xlog_recover_item_t **q,
+			       xlog_recover_item_t *item)
+{
+	if (*q == 0) {
+		item->ri_prev = item->ri_next = item;
+		*q = item;
+	} else {
+		item->ri_next		= *q;
+		item->ri_prev		= (*q)->ri_prev;
+		(*q)->ri_prev		= item;
+		item->ri_prev->ri_next	= item;
+	}
+}	/* xlog_recover_insert_item_backq */
+
+STATIC void
+xlog_recover_insert_item_frontq(xlog_recover_item_t **q,
+				xlog_recover_item_t *item)
+{
+	xlog_recover_insert_item_backq(q, item);
+	*q = item;
+}	/* xlog_recover_insert_item_frontq */
+
+STATIC int
+xlog_recover_reorder_trans(xlog_t	  *log,
+			   xlog_recover_t *trans)
+{
+    xlog_recover_item_t *first_item, *itemq, *itemq_next;
+
+    first_item = itemq = trans->r_itemq;
+    trans->r_itemq = NULL;
+    do {
+	itemq_next = itemq->ri_next;
+	switch (ITEM_TYPE(itemq)) {
+	    case XFS_LI_BUF:
+	    case XFS_LI_6_1_BUF:
+	    case XFS_LI_5_3_BUF: {
+		xlog_recover_insert_item_frontq(&trans->r_itemq, itemq);
+		break;
+	    }
+	    case XFS_LI_INODE:
+	    case XFS_LI_6_1_INODE:
+	    case XFS_LI_5_3_INODE:
+	    case XFS_LI_DQUOT:
+	    case XFS_LI_QUOTAOFF:
+	    case XFS_LI_EFD:
+	    case XFS_LI_EFI: {
+		xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
+		break;
+	    }
+	    default: {
+		xlog_warn(
+	"XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
+		ASSERT(0);
+		return XFS_ERROR(EIO);
+	    }
+	}
+	itemq = itemq_next;
+    } while (first_item != itemq);
+    return 0;
+}	/* xlog_recover_reorder_trans */
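+
+/*
+ * Editorial example: an item queue arriving in the order
+ *	INODE, BUF, EFI
+ * leaves the reordering above as
+ *	BUF, INODE, EFI
+ * since buffer items are pushed to the front and everything else is
+ * appended in arrival order, presumably so that buffer contents are
+ * replayed before the inode and EFI records that may depend on them.
+ */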
+
+
+/*
+ * Build up the table of buf cancel records so that we don't replay
+ * cancelled data in the second pass.  For buffer records that are
+ * not cancel records, there is nothing to do here so we just return.
+ *
+ * If we get a cancel record which is already in the table, this indicates
+ * that the buffer was cancelled multiple times.  In order to ensure
+ * that during pass 2 we keep the record in the table until we reach its
+ * last occurrence in the log, we keep a reference count in the cancel
+ * record in the table to tell us how many times we expect to see this
+ * record during the second pass.
+ */
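+/*
+ * Editorial example of the refcounting described above: if the log
+ * contains, in order,
+ *	cancel(blk 100), cancel(blk 100), buf(blk 100)
+ * pass 1 leaves one table entry for block 100 with bc_refcount == 2.
+ * In pass 2 each cancel record decrements the count, the entry is
+ * freed at the second one, and the trailing buf record (logged after
+ * the last cancellation) is replayed normally.
+ */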
+STATIC void
+xlog_recover_do_buffer_pass1(xlog_t			*log,
+			     xfs_buf_log_format_t	*buf_f)
+{
+	xfs_buf_cancel_t	*bcp;
+	xfs_buf_cancel_t	*nextp;
+	xfs_buf_cancel_t	*prevp;
+	xfs_buf_cancel_t	**bucket;
+	xfs_buf_log_format_v1_t *obuf_f;
+	xfs_daddr_t		blkno=0;
+	uint			len=0;
+	ushort			flags=0;
+
+	switch (buf_f->blf_type) {
+	case XFS_LI_BUF:
+		blkno = buf_f->blf_blkno;
+		len = buf_f->blf_len;
+		flags = buf_f->blf_flags;
+		break;
+	case XFS_LI_6_1_BUF:
+	case XFS_LI_5_3_BUF:
+		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
+		blkno = (xfs_daddr_t) obuf_f->blf_blkno;
+		len = obuf_f->blf_len;
+		flags = obuf_f->blf_flags;
+		break;
+	}
+
+	/*
+	 * If this isn't a cancel buffer item, then just return.
+	 */
+	if (!(flags & XFS_BLI_CANCEL)) {
+		return;
+	}
+
+	/*
+	 * Insert an xfs_buf_cancel record into the hash table of
+	 * them.  If there is already an identical record, bump
+	 * its reference count.
+	 */
+	bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
+					  XLOG_BC_TABLE_SIZE];
+	/*
+	 * If the hash bucket is empty then just insert a new record into
+	 * the bucket.
+	 */
+	if (*bucket == NULL) {
+		bcp = (xfs_buf_cancel_t*)kmem_alloc(sizeof(xfs_buf_cancel_t),
+						    KM_SLEEP);
+		bcp->bc_blkno = blkno;
+		bcp->bc_len = len;
+		bcp->bc_refcount = 1;
+		bcp->bc_next = NULL;
+		*bucket = bcp;
+		return;
+	}
+
+	/*
+	 * The hash bucket is not empty, so search for duplicates of our
+	 * record.  If we find one, just bump its refcount.  If not,
+	 * add a new record at the end of the list.
+	 */
+	prevp = NULL;
+	nextp = *bucket;
+	while (nextp != NULL) {
+		if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
+			nextp->bc_refcount++;
+			return;
+		}
+		prevp = nextp;
+		nextp = nextp->bc_next;
+	}
+	ASSERT(prevp != NULL);
+	bcp = (xfs_buf_cancel_t*)kmem_alloc(sizeof(xfs_buf_cancel_t),
+					    KM_SLEEP);
+	bcp->bc_blkno = blkno;
+	bcp->bc_len = len;
+	bcp->bc_refcount = 1;
+	bcp->bc_next = NULL;
+	prevp->bc_next = bcp;
+}
+
+/*
+ * Check to see whether the buffer being recovered has a corresponding
+ * entry in the buffer cancel record table.  If it does then return 1
+ * so that it will be cancelled, otherwise return 0.  If the buffer is
+ * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
+ * the refcount on the entry in the table and remove it from the table
+ * if this is the last reference.
+ *
+ * We remove the cancel record from the table when we encounter its
+ * last occurrence in the log so that if the same buffer is re-used
+ * again after its last cancellation we actually replay the changes
+ * made at that point.
+ */
+STATIC int
+xlog_recover_do_buffer_pass2(xlog_t			*log,
+			     xfs_buf_log_format_t	*buf_f)
+{
+	xfs_buf_cancel_t	*bcp;
+	xfs_buf_cancel_t	*prevp;
+	xfs_buf_cancel_t	**bucket;
+	xfs_buf_log_format_v1_t *obuf_f;
+	xfs_daddr_t		blkno=0;
+	ushort			flags=0;
+	uint			len=0;
+
+
+	switch (buf_f->blf_type) {
+	case XFS_LI_BUF:
+		blkno = buf_f->blf_blkno;
+		flags = buf_f->blf_flags;
+		len = buf_f->blf_len;
+		break;
+	case XFS_LI_6_1_BUF:
+	case XFS_LI_5_3_BUF:
+		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
+		blkno = (xfs_daddr_t) obuf_f->blf_blkno;
+		flags = obuf_f->blf_flags;
+		len = obuf_f->blf_len;
+		break;
+	}
+	if (log->l_buf_cancel_table == NULL) {
+		/*
+		 * There is nothing in the table built in pass one,
+		 * so this buffer must not be cancelled.
+		 */
+		ASSERT(!(flags & XFS_BLI_CANCEL));
+		return 0;
+	}
+
+	bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
+					  XLOG_BC_TABLE_SIZE];
+	bcp = *bucket;
+	if (bcp == NULL) {
+		/*
+		 * There is no corresponding entry in the table built
+		 * in pass one, so this buffer has not been cancelled.
+		 */
+		ASSERT(!(flags & XFS_BLI_CANCEL));
+		return 0;
+	}
+
+	/*
+	 * Search for an entry in the buffer cancel table that
+	 * matches our buffer.
+	 */
+	prevp = NULL;
+	while (bcp != NULL) {
+		if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
+			/*
+			 * We've got a match, so return 1 so that the
+			 * recovery of this buffer is cancelled.
+			 * If this buffer is actually a buffer cancel
+			 * log item, then decrement the refcount on the
+			 * one in the table and remove it if this is the
+			 * last reference.
+			 */
+			if (flags & XFS_BLI_CANCEL) {
+				bcp->bc_refcount--;
+				if (bcp->bc_refcount == 0) {
+					if (prevp == NULL) {
+						*bucket = bcp->bc_next;
+					} else {
+						prevp->bc_next = bcp->bc_next;
+					}
+					kmem_free(bcp,
+						  sizeof(xfs_buf_cancel_t));
+				}
+			}
+			return 1;
+		}
+		prevp = bcp;
+		bcp = bcp->bc_next;
+	}
+	/*
+	 * We didn't find a corresponding entry in the table, so
+	 * return 0 so that the buffer is NOT cancelled.
+	 */
+	ASSERT(!(flags & XFS_BLI_CANCEL));
+	return 0;
+}
+
+
+/*
+ * Perform recovery for a buffer full of inodes.  In these buffers,
+ * the only data which should be recovered is that which corresponds
+ * to the di_next_unlinked pointers in the on disk inode structures.
+ * The rest of the data for the inodes is always logged through the
+ * inodes themselves rather than the inode buffer and is recovered
+ * in xlog_recover_do_inode_trans().
+ *
+ * The only time when buffers full of inodes are fully recovered is
+ * when the buffer is full of newly allocated inodes.  In this case
+ * the buffer will not be marked as an inode buffer and so will be
+ * sent to xlog_recover_do_reg_buffer() below during recovery.
+ */
+STATIC int
+xlog_recover_do_inode_buffer(xfs_mount_t		*mp,
+			     xlog_recover_item_t	*item,
+			     xfs_buf_t			*bp,
+			     xfs_buf_log_format_t	*buf_f)
+{
+	int			i;
+	int			item_index;
+	int			bit;
+	int			nbits;
+	int			reg_buf_offset;
+	int			reg_buf_bytes;
+	int			next_unlinked_offset;
+	int			inodes_per_buf;
+	xfs_agino_t		*logged_nextp;
+	xfs_agino_t		*buffer_nextp;
+	xfs_buf_log_format_v1_t *obuf_f;
+	unsigned int		*data_map=NULL;
+	unsigned int		map_size=0;
+
+	switch (buf_f->blf_type) {
+	case XFS_LI_BUF:
+		data_map = buf_f->blf_data_map;
+		map_size = buf_f->blf_map_size;
+		break;
+	case XFS_LI_6_1_BUF:
+	case XFS_LI_5_3_BUF:
+		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
+		data_map = obuf_f->blf_data_map;
+		map_size = obuf_f->blf_map_size;
+		break;
+	}
+	/*
+	 * Set the variables corresponding to the current region to
+	 * 0 so that we'll initialize them on the first pass through
+	 * the loop.
+	 */
+	reg_buf_offset = 0;
+	reg_buf_bytes = 0;
+	bit = 0;
+	nbits = 0;
+	item_index = 0;
+	inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
+	for (i = 0; i < inodes_per_buf; i++) {
+		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
+			offsetof(xfs_dinode_t, di_next_unlinked);
+
+		while (next_unlinked_offset >=
+		       (reg_buf_offset + reg_buf_bytes)) {
+			/*
+			 * The next di_next_unlinked field is beyond
+			 * the current logged region.  Find the next
+			 * logged region that contains or is beyond
+			 * the current di_next_unlinked field.
+			 */
+			bit += nbits;
+			bit = xfs_next_bit(data_map, map_size, bit);
+
+			/*
+			 * If there are no more logged regions in the
+			 * buffer, then we're done.
+			 */
+			if (bit == -1) {
+				return 0;
+			}
+
+			nbits = xfs_contig_bits(data_map, map_size,
+							 bit);
+			reg_buf_offset = bit << XFS_BLI_SHIFT;
+			reg_buf_bytes = nbits << XFS_BLI_SHIFT;
+			item_index++;
+		}
+
+		/*
+		 * If the current logged region starts after the current
+		 * di_next_unlinked field, then move on to the next
+		 * di_next_unlinked field.
+		 */
+		if (next_unlinked_offset < reg_buf_offset) {
+			continue;
+		}
+
+		ASSERT(item->ri_buf[item_index].i_addr != NULL);
+		ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
+		ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
+
+		/*
+		 * The current logged region contains a copy of the
+		 * current di_next_unlinked field.  Extract its value
+		 * and copy it to the buffer copy.
+		 */
+		logged_nextp = (xfs_agino_t *)
+			       ((char *)(item->ri_buf[item_index].i_addr) +
+				(next_unlinked_offset - reg_buf_offset));
+		if (*logged_nextp == 0)	 {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"bad inode buffer log record (ptr = 0x%p, bp = 0x%p).  XFS trying to replay bad (0) inode di_next_unlinked field",
+				item, bp);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+
+		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
+					      next_unlinked_offset);
+		INT_SET(*buffer_nextp, ARCH_CONVERT, *logged_nextp);
+	}
+
+	return 0;
+}	/* xlog_recover_do_inode_buffer */
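+
+/*
+ * Editorial example with hypothetical geometry: for 8192-byte inode
+ * buffers and 256-byte inodes (sb_inodelog == 8), inodes_per_buf is
+ * 8192 >> 8 == 32 and the di_next_unlinked copy for inode i sits at
+ * byte offset
+ *	i * 256 + offsetof(xfs_dinode_t, di_next_unlinked)
+ * which is exactly the offset the loop above chases through the
+ * logged regions.
+ */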
+
+/*
+ * Perform a 'normal' buffer recovery.	Each logged region of the
+ * buffer should be copied over the corresponding region in the
+ * given buffer.  The bitmap in the buf log format structure indicates
+ * where to place the logged data.
+ */
+/*ARGSUSED*/
+STATIC void
+xlog_recover_do_reg_buffer(xfs_mount_t		*mp,
+			   xlog_recover_item_t	*item,
+			   xfs_buf_t		*bp,
+			   xfs_buf_log_format_t *buf_f)
+{
+	int			i;
+	int			bit;
+	int			nbits;
+	xfs_buf_log_format_v1_t *obuf_f;
+	unsigned int		*data_map=NULL;
+	unsigned int		map_size=0;
+	int			error;
+
+	switch (buf_f->blf_type) {
+	case XFS_LI_BUF:
+		data_map = buf_f->blf_data_map;
+		map_size = buf_f->blf_map_size;
+		break;
+	case XFS_LI_6_1_BUF:
+	case XFS_LI_5_3_BUF:
+		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
+		data_map = obuf_f->blf_data_map;
+		map_size = obuf_f->blf_map_size;
+		break;
+	}
+	bit = 0;
+	i = 1;	/* 0 is the buf format structure */
+	while (1) {
+		bit = xfs_next_bit(data_map, map_size, bit);
+		if (bit == -1)
+			break;
+		nbits = xfs_contig_bits(data_map, map_size, bit);
+		ASSERT(item->ri_buf[i].i_addr != 0);
+		ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
+		ASSERT(XFS_BUF_COUNT(bp) >=
+		       ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
+
+		/*
+		 * Do a sanity check if this is a dquot buffer. Just checking
+		 * the first dquot in the buffer should do. XXX: this is
+		 * probably a good thing to do for other buf types also.
+		 */
+		error = 0;
+		if (buf_f->blf_flags & (XFS_BLI_UDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+			/* OK, if this returns nopkg() */
+			error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
+					       item->ri_buf[i].i_addr,
+					       -1, 0, XFS_QMOPT_DOWARN,
+					       "dquot_buf_recover");
+		}
+		if (!error)
+		    bcopy(item->ri_buf[i].i_addr,		   /* source */
+		      xfs_buf_offset(bp, (uint)bit << XFS_BLI_SHIFT), /* dest */
+		      nbits<<XFS_BLI_SHIFT);			   /* length */
+		i++;
+		bit += nbits;
+	}
+
+	/* Shouldn't be any more regions */
+	ASSERT(i == item->ri_total);
+}	/* xlog_recover_do_reg_buffer */
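+
+/*
+ * Editorial example: each bit in the data map covers one chunk of
+ * (1 << XFS_BLI_SHIFT) bytes of the buffer, so with a 128-byte chunk a
+ * run starting at bit 2 with nbits == 3 copies 384 bytes of logged
+ * data to byte offset 256 of the buffer.
+ */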
+
+
+/*
+ * Perform a dquot buffer recovery.
+ * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
+ * (ie. USR or GRP), then just toss this buffer away; don't recover it.
+ * Else, treat it as a regular buffer and do recovery.
+ */
+STATIC void
+xlog_recover_do_dquot_buffer(
+	xfs_mount_t		*mp,
+	xlog_t			*log,
+	xlog_recover_item_t	*item,
+	xfs_buf_t		*bp,
+	xfs_buf_log_format_t	*buf_f)
+{
+	uint type;
+
+	/*
+	 * Non-root filesystems are required to send in quota flags
+	 * at mount time. However, we may also get QUOTA_MAYBE flag set,
+	 * indicating that quota should stay on (and stay consistent),
+	 * if it already is. (so, we have to replay dquot log records
+	 * when the MAYBE flag is set).
+	 */
+	if (mp->m_qflags == 0 && mp->m_dev != rootdev) {
+		return;
+	}
+
+	type = 0;
+	if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
+		type |= XFS_DQ_USER;
+	if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
+		type |= XFS_DQ_GROUP;
+	/*
+	 * This type of quota was turned off, so ignore this buffer.
+	 */
+	if (log->l_quotaoffs_flag & type)
+		return;
+
+	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+}
+
+/*
+ * This routine replays a modification made to a buffer at runtime.
+ * There are actually two types of buffer, regular and inode, which
+ * are handled differently.  Inode buffers are handled differently
+ * in that we only recover a specific set of data from them, namely
+ * the inode di_next_unlinked fields.  This is because all other inode
+ * data is actually logged via inode records and any data we replay
+ * here which overlaps that may be stale.
+ *
+ * When meta-data buffers are freed at run time we log a buffer item
+ * with the XFS_BLI_CANCEL bit set to indicate that previous copies
+ * of the buffer in the log should not be replayed at recovery time.
+ * This is so that if the blocks covered by the buffer are reused for
+ * file data before we crash we don't end up replaying old, freed
+ * meta-data into a user's file.
+ *
+ * To handle the cancellation of buffer log items, we make two passes
+ * over the log during recovery.  During the first we build a table of
+ * those buffers which have been cancelled, and during the second we
+ * only replay those buffers which do not have corresponding cancel
+ * records in the table.  See xlog_recover_do_buffer_pass[1,2] above
+ * for more details on the implementation of the table of cancel records.
+ */
+STATIC int
+xlog_recover_do_buffer_trans(xlog_t		 *log,
+			     xlog_recover_item_t *item,
+			     int		 pass)
+{
+	xfs_buf_log_format_t	*buf_f;
+	xfs_buf_log_format_v1_t *obuf_f;
+	xfs_mount_t		*mp;
+	xfs_buf_t		*bp;
+	int			error;
+	int			cancel;
+	xfs_daddr_t		blkno;
+	int			len;
+	ushort			flags;
+
+	buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
+
+	if (pass == XLOG_RECOVER_PASS1) {
+		/*
+		 * In this pass we're only looking for buf items
+		 * with the XFS_BLI_CANCEL bit set.
+		 */
+		xlog_recover_do_buffer_pass1(log, buf_f);
+		return 0;
+	} else {
+		/*
+		 * In this pass we want to recover all the buffers
+		 * which have not been cancelled and are not
+		 * cancellation buffers themselves.  The routine
+		 * we call here will tell us whether or not to
+		 * continue with the replay of this buffer.
+		 */
+		cancel = xlog_recover_do_buffer_pass2(log, buf_f);
+		if (cancel) {
+			return 0;
+		}
+	}
+	switch (buf_f->blf_type) {
+	      case XFS_LI_BUF:
+		blkno = buf_f->blf_blkno;
+		len = buf_f->blf_len;
+		flags = buf_f->blf_flags;
+		break;
+	      case XFS_LI_6_1_BUF:
+	      case XFS_LI_5_3_BUF:
+		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
+		blkno = obuf_f->blf_blkno;
+		len = obuf_f->blf_len;
+		flags = obuf_f->blf_flags;
+		break;
+	      default:
+		xfs_fs_cmn_err(CE_ALERT, log->l_mp,
+			"xfs_log_recover: unknown buffer type 0x%x, dev 0x%x",
+			buf_f->blf_type, log->l_dev);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	mp = log->l_mp;
+	if (flags & XFS_BLI_INODE_BUF) {
+		bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
+								XFS_BUF_LOCK);
+	} else {
+		bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
+	}
+	if (XFS_BUF_ISERROR(bp)) {
+		xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
+				  bp, blkno);
+		error = XFS_BUF_GETERROR(bp);
+		xfs_buf_relse(bp);
+		return error;
+	}
+
+	error = 0;
+	if (flags & XFS_BLI_INODE_BUF) {
+		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+	} else if (flags & (XFS_BLI_UDQUOT_BUF | XFS_BLI_GDQUOT_BUF)) {
+		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+	} else {
+		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+	}
+	if (error)
+		return XFS_ERROR(error);
+
+	/*
+	 * Perform delayed write on the buffer.	 Asynchronous writes will be
+	 * slower when taking into account all the buffers to be flushed.
+	 *
+	 * Also make sure that only inode buffers with good sizes stay in
+	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
+	 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger.  The inode
+	 * buffers in the log can be a different size if the log was generated
+	 * by an older kernel using unclustered inode buffers or a newer kernel
+	 * running with a different inode cluster size.	 Regardless, if
+	 * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
+	 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
+	 * the buffer out of the buffer cache so that the buffer won't
+	 * overlap with future reads of those inodes.
+	 */
+	error = 0;
+
+	if ((INT_GET(*((__uint16_t *)(xfs_buf_offset(bp, 0))),
+		     ARCH_CONVERT) == XFS_DINODE_MAGIC) &&
+	    (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
+			(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
+		XFS_BUF_STALE(bp);
+		error = xfs_bwrite(mp, bp);
+	} else {
+		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
+		       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
+		XFS_BUF_SET_FSPRIVATE(bp, mp);
+		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+		xfs_bdwrite(mp, bp);
+	}
+
+	return (error);
+}	/* xlog_recover_do_buffer_trans */
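+
+/*
+ * Worked example of the staleness check above (sizes assumed, not taken
+ * from this patch): with a 4k filesystem block size and an 8k
+ * XFS_INODE_CLUSTER_SIZE, inode buffers are expected to be
+ * MAX(4096, 8192) == 8192 bytes.  A 4k inode buffer replayed from a log
+ * written by an unclustered kernel fails the XFS_BUF_COUNT() test, so
+ * it is marked stale and written synchronously rather than left in the
+ * cache where it could overlap later 8k reads of those inodes.
+ */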
+
+STATIC int
+xlog_recover_do_inode_trans(xlog_t		*log,
+			    xlog_recover_item_t *item,
+			    int			pass)
+{
+	xfs_inode_log_format_t	*in_f;
+	xfs_mount_t		*mp;
+	xfs_buf_t		*bp;
+	xfs_imap_t		imap;
+	xfs_dinode_t		*dip;
+	xfs_ino_t		ino;
+	int			len;
+	xfs_caddr_t		src;
+	xfs_caddr_t		dest;
+	int			error;
+	int			attr_index;
+	uint			fields;
+	xfs_dinode_core_t	*dicp;
+
+	if (pass == XLOG_RECOVER_PASS1) {
+		return 0;
+	}
+
+	in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
+	ino = in_f->ilf_ino;
+	mp = log->l_mp;
+	if (ITEM_TYPE(item) == XFS_LI_INODE) {
+		imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
+		imap.im_len = in_f->ilf_len;
+		imap.im_boffset = in_f->ilf_boffset;
+	} else {
+		/*
+		 * It's an old inode format record.  We don't know where
+		 * its cluster is located on disk, and we can't allow
+		 * xfs_imap() to figure it out because the inode btrees
+		 * are not ready to be used.  Therefore do not pass the
+		 * XFS_IMAP_LOOKUP flag to xfs_imap().	This will give
+		 * us only the single block in which the inode lives
+		 * rather than its cluster, so we must make sure to
+		 * invalidate the buffer when we write it out below.
+		 */
+		imap.im_blkno = 0;
+		xfs_imap(log->l_mp, 0, ino, &imap, 0);
+	}
+	bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
+								XFS_BUF_LOCK);
+	if (XFS_BUF_ISERROR(bp)) {
+		xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
+				  bp, imap.im_blkno);
+		error = XFS_BUF_GETERROR(bp);
+		xfs_buf_relse(bp);
+		return error;
+	}
+	error = 0;
+	xfs_inobp_check(mp, bp);
+	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
+	dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+
+	/*
+	 * Make sure the place we're flushing out to really looks
+	 * like an inode!
+	 */
+	if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
+		xfs_buf_relse(bp);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
+			dip, bp, ino);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr);
+	if (dicp->di_magic != XFS_DINODE_MAGIC) {
+		xfs_buf_relse(bp);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
+			item, ino);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	if ((dicp->di_mode & IFMT) == IFREG) {
+		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
+			xfs_buf_relse(bp);
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+				item, dip, bp, ino);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+	} else if ((dicp->di_mode & IFMT) == IFDIR) {
+		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
+		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
+			xfs_buf_relse(bp);
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+				item, dip, bp, ino);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+	}
+	if (dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks) {
+		xfs_buf_relse(bp);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
+			item, dip, bp, ino,
+			dicp->di_nextents + dicp->di_anextents,
+			dicp->di_nblocks);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	if (dicp->di_forkoff > mp->m_sb.sb_inodesize) {
+		xfs_buf_relse(bp);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
+			item, dip, bp, ino, dicp->di_forkoff);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) {
+		xfs_buf_relse(bp);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
+			item->ri_buf[1].i_len, item);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	/* The core is in in-core format */
+	xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
+			      (xfs_dinode_core_t*)item->ri_buf[1].i_addr,
+			      -1, ARCH_CONVERT);
+	/* the rest is in on-disk format */
+	if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) {
+		bcopy(item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t),
+		      (xfs_caddr_t) dip		 + sizeof(xfs_dinode_core_t),
+		      item->ri_buf[1].i_len  - sizeof(xfs_dinode_core_t));
+	}
+
+	fields = in_f->ilf_fields;
+	switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
+	case XFS_ILOG_DEV:
+		INT_SET(dip->di_u.di_dev, ARCH_CONVERT, in_f->ilf_u.ilfu_rdev);
+
+		break;
+	case XFS_ILOG_UUID:
+		dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid;
+		break;
+	}
+
+	if (in_f->ilf_size == 2)
+		goto write_inode_buffer;
+	len = item->ri_buf[2].i_len;
+	src = item->ri_buf[2].i_addr;
+	ASSERT(in_f->ilf_size <= 4);
+	ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
+	ASSERT(!(fields & XFS_ILOG_DFORK) ||
+	       (len == in_f->ilf_dsize));
+
+	switch (fields & XFS_ILOG_DFORK) {
+	case XFS_ILOG_DDATA:
+	case XFS_ILOG_DEXT:
+		bcopy(src, &dip->di_u, len);
+		break;
+
+	case XFS_ILOG_DBROOT:
+		xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
+				 &(dip->di_u.di_bmbt),
+				 XFS_DFORK_DSIZE(dip, mp));
+		break;
+
+	default:
+		/*
+		 * There are no data fork flags set.
+		 */
+		ASSERT((fields & XFS_ILOG_DFORK) == 0);
+		break;
+	}
+
+	/*
+	 * If we logged any attribute data, recover it.	 There may or
+	 * may not have been any other non-core data logged in this
+	 * transaction.
+	 */
+	if (in_f->ilf_fields & XFS_ILOG_AFORK) {
+		if (in_f->ilf_fields & XFS_ILOG_DFORK) {
+			attr_index = 3;
+		} else {
+			attr_index = 2;
+		}
+		len = item->ri_buf[attr_index].i_len;
+		src = item->ri_buf[attr_index].i_addr;
+		ASSERT(len == in_f->ilf_asize);
+
+		switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
+		case XFS_ILOG_ADATA:
+		case XFS_ILOG_AEXT:
+			dest = XFS_DFORK_APTR(dip);
+			ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
+			bcopy(src, dest, len);
+			break;
+
+		case XFS_ILOG_ABROOT:
+			dest = XFS_DFORK_APTR(dip);
+			xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
+					 (xfs_bmdr_block_t*)dest,
+					 XFS_DFORK_ASIZE(dip, mp));
+			break;
+
+		default:
+			xlog_warn("XFS: xlog_recover_do_inode_trans: Illegal flag");
+			ASSERT(0);
+			xfs_buf_relse(bp);
+			return XFS_ERROR(EIO);
+		}
+	}
+
+write_inode_buffer:
+#if 0
+	/*
+	 * Can't do this if the transaction didn't log the current
+	 * contents, e.g. rmdir.
+	 */
+	XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip);
+#endif
+	xfs_inobp_check(mp, bp);
+	if (ITEM_TYPE(item) == XFS_LI_INODE) {
+		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
+		       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
+		XFS_BUF_SET_FSPRIVATE(bp, mp);
+		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+		xfs_bdwrite(mp, bp);
+	} else {
+		XFS_BUF_STALE(bp);
+		error = xfs_bwrite(mp, bp);
+	}
+
+	return (error);
+}	/* xlog_recover_do_inode_trans */
+
+
+/*
+ * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
+ * structure, so that we know not to do any dquot item or dquot buffer
+ * recovery of that type.
+ */
+STATIC int
+xlog_recover_do_quotaoff_trans(xlog_t			*log,
+			       xlog_recover_item_t	*item,
+			       int			pass)
+{
+	xfs_qoff_logformat_t *qoff_f;
+
+	if (pass == XLOG_RECOVER_PASS2) {
+		return (0);
+	}
+
+	qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
+	ASSERT(qoff_f);
+
+	/*
+	 * The logitem format's flag tells us if this was user quotaoff,
+	 * group quotaoff or both.
+	 */
+	if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
+		log->l_quotaoffs_flag |= XFS_DQ_USER;
+	if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
+		log->l_quotaoffs_flag |= XFS_DQ_GROUP;
+
+	return (0);
+}
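+
+/*
+ * Sketch of the two-pass interplay: quotaoff records are noted during
+ * pass 1 only, so the mask is complete before pass 2 replays any dquot
+ * items or dquot buffers:
+ *
+ *	pass 1:	log->l_quotaoffs_flag |= XFS_DQ_USER;	-- user quotaoff seen
+ *	pass 2:	if (log->l_quotaoffs_flag & type)	-- type == XFS_DQ_USER
+ *			return (0);			-- skip the dquot
+ */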
+
+
+/*
+ * Recover a dquot record
+ */
+STATIC int
+xlog_recover_do_dquot_trans(xlog_t		*log,
+			    xlog_recover_item_t *item,
+			    int			pass)
+{
+	xfs_mount_t		*mp;
+	xfs_buf_t		*bp;
+	struct xfs_disk_dquot	*ddq, *recddq;
+	int			error;
+	xfs_dq_logformat_t	*dq_f;
+	uint			type;
+
+	if (pass == XLOG_RECOVER_PASS1) {
+		return 0;
+	}
+	mp = log->l_mp;
+
+	/*
+	 * Non-root filesystems are required to send in quota flags
+	 * at mount time.  However, we may also get the QUOTA_MAYBE flag
+	 * set, indicating that quota should stay on (and stay
+	 * consistent) if it already is; so we have to replay dquot log
+	 * records whenever the MAYBE flag is set.
+	 */
+	if (mp->m_qflags == 0 && mp->m_dev != rootdev) {
+		return (0);
+	}
+
+	recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
+	ASSERT(recddq);
+	/*
+	 * This type of quota was turned off, so ignore this record.
+	 */
+	type = INT_GET(recddq->d_flags, ARCH_CONVERT)&(XFS_DQ_USER|XFS_DQ_GROUP);
+	ASSERT(type);
+	if (log->l_quotaoffs_flag & type)
+		return (0);
+
+	/*
+	 * At this point we know that if we are recovering a root filesystem
+	 * then quota was _not_ turned off. Since there is no other flag
+	 * to indicate otherwise, this must mean that quota is on,
+	 * and the dquot needs to be replayed. Remember that we may not have
+	 * fully recovered the superblock yet, so we can't do the usual trick
+	 * of looking at the SB quota bits.
+	 *
+	 * The other possibility, of course, is that the quota subsystem was
+	 * removed since the last mount - nopkg().
+	 */
+	dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
+	ASSERT(dq_f);
+	if ((error = xfs_qm_dqcheck(recddq,
+			   dq_f->qlf_id,
+			   0, XFS_QMOPT_DOWARN,
+			   "xlog_recover_do_dquot_trans (log copy)"))) {
+		if (error == nopkg())
+			return (0);
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dq_f->qlf_len == 1);
+
+	error = xfs_read_buf(mp, mp->m_ddev_targp,
+			     dq_f->qlf_blkno,
+			     XFS_FSB_TO_BB(mp, dq_f->qlf_len),
+			     0, &bp);
+	if (error) {
+		xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
+				  bp, dq_f->qlf_blkno);
+		return error;
+	}
+	ASSERT(bp);
+	ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
+
+	/*
+	 * At least the magic num portion should be on disk because this
+	 * was among a chunk of dquots created earlier, and we did some
+	 * minimal initialization then.
+	 */
+	if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+			   "xlog_recover_do_dquot_trans")) {
+		xfs_buf_relse(bp);
+		return XFS_ERROR(EIO);
+	}
+
+	bcopy(recddq, ddq, item->ri_buf[1].i_len);
+
+	ASSERT(dq_f->qlf_size == 2);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
+	       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
+	XFS_BUF_SET_FSPRIVATE(bp, mp);
+	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+	xfs_bdwrite(mp, bp);
+
+	return (0);
+}	/* xlog_recover_do_dquot_trans */
+
+/*
+ * This routine is called to create an in-core extent free intent
+ * item from the efi format structure which was logged on disk.
+ * It allocates an in-core efi, copies the extents from the format
+ * structure into it, and adds the efi to the AIL with the given
+ * LSN.
+ */
+STATIC void
+xlog_recover_do_efi_trans(xlog_t		*log,
+			  xlog_recover_item_t	*item,
+			  xfs_lsn_t		lsn,
+			  int			pass)
+{
+	xfs_mount_t		*mp;
+	xfs_efi_log_item_t	*efip;
+	xfs_efi_log_format_t	*efi_formatp;
+	SPLDECL(s);
+
+	if (pass == XLOG_RECOVER_PASS1) {
+		return;
+	}
+
+	efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
+	ASSERT(item->ri_buf[0].i_len ==
+	       (sizeof(xfs_efi_log_format_t) +
+		((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t))));
+
+	mp = log->l_mp;
+	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
+	bcopy((char *)efi_formatp, (char *)&(efip->efi_format),
+	      sizeof(xfs_efi_log_format_t) +
+	      ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t)));
+	efip->efi_next_extent = efi_formatp->efi_nextents;
+	efip->efi_flags |= XFS_EFI_COMMITTED;
+
+	AIL_LOCK(mp,s);
+	/*
+	 * xfs_trans_update_ail() drops the AIL lock.
+	 */
+	xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s);
+}	/* xlog_recover_do_efi_trans */
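+
+/*
+ * Sizing note for the copy above: the format structure carries one
+ * xfs_extent_t in-line, so an EFI with n extents occupies exactly
+ *
+ *	sizeof(xfs_efi_log_format_t) + (n - 1) * sizeof(xfs_extent_t)
+ *
+ * bytes, which is what both the ASSERT and the bcopy use; for n == 3
+ * that is the base structure plus two additional extents.
+ */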
+
+
+/*
+ * This routine is called when an efd format structure is found in
+ * a committed transaction in the log.	Its purpose is to cancel
+ * the corresponding efi if it was still in the log.  To do this
+ * it searches the AIL for the efi with an id equal to that in the
+ * efd format structure.  If we find it, we remove the efi from the
+ * AIL and free it.
+ */
+STATIC void
+xlog_recover_do_efd_trans(xlog_t		*log,
+			  xlog_recover_item_t	*item,
+			  int			pass)
+{
+	xfs_mount_t		*mp;
+	xfs_efd_log_format_t	*efd_formatp;
+	xfs_efi_log_item_t	*efip=NULL;
+	xfs_log_item_t		*lip;
+	int			gen;
+	int			nexts;
+	__uint64_t		efi_id;
+	SPLDECL(s);
+
+	if (pass == XLOG_RECOVER_PASS1) {
+		return;
+	}
+
+	efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
+	ASSERT(item->ri_buf[0].i_len ==
+	       (sizeof(xfs_efd_log_format_t) +
+		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_t))));
+	efi_id = efd_formatp->efd_efi_id;
+
+	/*
+	 * Search for the efi with the id in the efd format structure
+	 * in the AIL.
+	 */
+	mp = log->l_mp;
+	AIL_LOCK(mp,s);
+	lip = xfs_trans_first_ail(mp, &gen);
+	while (lip != NULL) {
+		if (lip->li_type == XFS_LI_EFI) {
+			efip = (xfs_efi_log_item_t *)lip;
+			if (efip->efi_format.efi_id == efi_id) {
+				/*
+				 * xfs_trans_delete_ail() drops the
+				 * AIL lock.
+				 */
+				xfs_trans_delete_ail(mp, lip, s);
+				break;
+			}
+		}
+		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+	}
+	if (lip == NULL) {
+		AIL_UNLOCK(mp, s);
+	}
+
+	/*
+	 * If we found it, then free it up.  If it wasn't there, it
+	 * must have been overwritten in the log.  Oh well.
+	 */
+	if (lip != NULL) {
+		nexts = efip->efi_format.efi_nextents;
+		if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+			kmem_free(lip, sizeof(xfs_efi_log_item_t) +
+				  ((nexts - 1) * sizeof(xfs_extent_t)));
+		} else {
+			kmem_zone_free(xfs_efi_zone, efip);
+		}
+	}
+}	/* xlog_recover_do_efd_trans */
+
+/*
+ * Perform the transaction
+ *
+ * If the transaction modifies a buffer or inode, do it now.  Otherwise,
+ * EFIs and EFDs get queued up by adding entries into the AIL for them.
+ */
+STATIC int
+xlog_recover_do_trans(xlog_t	     *log,
+		      xlog_recover_t *trans,
+		      int	     pass)
+{
+	int		    error = 0;
+	xlog_recover_item_t *item, *first_item;
+
+	if ((error = xlog_recover_reorder_trans(log, trans)))
+		return error;
+	first_item = item = trans->r_itemq;
+	do {
+		/*
+		 * We don't need to worry about the block number being
+		 * truncated in > 1 TB buffers because, in user-land,
+		 * we're now n32 or 64-bit, so xfs_daddr_t is 64 bits and
+		 * the blknos will get through the user-mode buffer
+		 * cache properly.  The only bad case is o32 kernels,
+		 * where xfs_daddr_t is 32 bits; but mount will warn us
+		 * off a > 1 TB filesystem before we get here.
+		 */
+		if ((ITEM_TYPE(item) == XFS_LI_BUF) ||
+		    (ITEM_TYPE(item) == XFS_LI_6_1_BUF) ||
+		    (ITEM_TYPE(item) == XFS_LI_5_3_BUF)) {
+			if  ((error = xlog_recover_do_buffer_trans(log, item,
+								 pass)))
+				break;
+		} else if ((ITEM_TYPE(item) == XFS_LI_INODE) ||
+			   (ITEM_TYPE(item) == XFS_LI_6_1_INODE) ||
+			   (ITEM_TYPE(item) == XFS_LI_5_3_INODE)) {
+			if ((error = xlog_recover_do_inode_trans(log, item,
+								pass)))
+				break;
+		} else if (ITEM_TYPE(item) == XFS_LI_EFI) {
+			xlog_recover_do_efi_trans(log, item, trans->r_lsn,
+						  pass);
+		} else if (ITEM_TYPE(item) == XFS_LI_EFD) {
+			xlog_recover_do_efd_trans(log, item, pass);
+		} else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
+			if ((error = xlog_recover_do_dquot_trans(log, item,
+								   pass)))
+					break;
+		} else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
+			if ((error = xlog_recover_do_quotaoff_trans(log, item,
+								   pass)))
+					break;
+		} else {
+			xlog_warn("XFS: xlog_recover_do_trans");
+			ASSERT(0);
+			error = XFS_ERROR(EIO);
+			break;
+		}
+		item = item->ri_next;
+	} while (first_item != item);
+
+	return error;
+}	/* xlog_recover_do_trans */
+
+
+/*
+ * Free up any resources allocated by the transaction
+ *
+ * Remember that EFIs, EFDs, and IUNLINKs are handled later.
+ */
+STATIC void
+xlog_recover_free_trans(xlog_recover_t	    *trans)
+{
+	xlog_recover_item_t *first_item, *item, *free_item;
+	int i;
+
+	item = first_item = trans->r_itemq;
+	do {
+		free_item = item;
+		item = item->ri_next;
+		 /* Free the regions in the item. */
+		for (i = 0; i < free_item->ri_cnt; i++) {
+			kmem_free(free_item->ri_buf[i].i_addr,
+				  free_item->ri_buf[i].i_len);
+		}
+		/* Free the item itself */
+		kmem_free(free_item->ri_buf,
+			  (free_item->ri_total * sizeof(xfs_log_iovec_t)));
+		kmem_free(free_item, sizeof(xlog_recover_item_t));
+	} while (first_item != item);
+	/* Free the transaction recover structure */
+	kmem_free(trans, sizeof(xlog_recover_t));
+}	/* xlog_recover_free_trans */
+
+
+STATIC int
+xlog_recover_commit_trans(xlog_t	 *log,
+			  xlog_recover_t **q,
+			  xlog_recover_t *trans,
+			  int		 pass)
+{
+	int error;
+
+	if ((error = xlog_recover_unlink_tid(q, trans)))
+		return error;
+	if ((error = xlog_recover_do_trans(log, trans, pass)))
+		return error;
+	xlog_recover_free_trans(trans);			/* no error */
+	return 0;
+}	/* xlog_recover_commit_trans */
+
+
+/*ARGSUSED*/
+STATIC int
+xlog_recover_unmount_trans(xlog_recover_t *trans)
+{
+	/* Do nothing now */
+	xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
+	return( 0 );
+}	/* xlog_recover_unmount_trans */
+
+
+/*
+ * There are two valid states of the r_state field.  0 indicates that the
+ * transaction structure is in a normal state.	We have either seen the
+ * start of the transaction or the last operation we added was not a partial
+ * operation.  If the last operation we added to the transaction was a
+ * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
+ *
+ * NOTE: skip LRs with 0 data length.
+ */
+STATIC int
+xlog_recover_process_data(xlog_t	    *log,
+			  xlog_recover_t    *rhash[],
+			  xlog_rec_header_t *rhead,
+			  xfs_caddr_t	    dp,
+			  int		    pass)
+{
+    xfs_caddr_t		lp	   = dp+INT_GET(rhead->h_len, ARCH_CONVERT);
+    int			num_logops = INT_GET(rhead->h_num_logops, ARCH_CONVERT);
+    xlog_op_header_t	*ohead;
+    xlog_recover_t	*trans;
+    xlog_tid_t		tid;
+    int			error;
+    unsigned long	hash;
+    uint		flags;
+
+    /* check the log format matches our own - else we can't recover */
+    if (xlog_header_check_recover(log->l_mp, rhead))
+	    return (XFS_ERROR(EIO));
+
+    while ((dp < lp) && num_logops) {
+	ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
+	ohead = (xlog_op_header_t *)dp;
+	dp += sizeof(xlog_op_header_t);
+	if (ohead->oh_clientid != XFS_TRANSACTION &&
+	    ohead->oh_clientid != XFS_LOG) {
+	    xlog_warn("XFS: xlog_recover_process_data: bad clientid");
+	    ASSERT(0);
+	    return (XFS_ERROR(EIO));
+	}
+	tid = INT_GET(ohead->oh_tid, ARCH_CONVERT);
+	hash = XLOG_RHASH(tid);
+	trans = xlog_recover_find_tid(rhash[hash], tid);
+	if (trans == NULL) {			   /* not found; add new tid */
+	    if (ohead->oh_flags & XLOG_START_TRANS)
+		xlog_recover_new_tid(&rhash[hash], tid, INT_GET(rhead->h_lsn, ARCH_CONVERT));
+	} else {
+	    ASSERT(dp+INT_GET(ohead->oh_len, ARCH_CONVERT) <= lp);
+	    flags = ohead->oh_flags & ~XLOG_END_TRANS;
+	    if (flags & XLOG_WAS_CONT_TRANS)
+		flags &= ~XLOG_CONTINUE_TRANS;
+	    switch (flags) {
+		case XLOG_COMMIT_TRANS: {
+		    error = xlog_recover_commit_trans(log, &rhash[hash],
+						      trans, pass);
+		    break;
+		}
+		case XLOG_UNMOUNT_TRANS: {
+		    error = xlog_recover_unmount_trans(trans);
+		    break;
+		}
+		case XLOG_WAS_CONT_TRANS: {
+		    error = xlog_recover_add_to_cont_trans(trans, dp,
+				  INT_GET(ohead->oh_len, ARCH_CONVERT));
+		    break;
+		}
+		case XLOG_START_TRANS : {
+		    xlog_warn("XFS: xlog_recover_process_data: bad transaction");
+		    ASSERT(0);
+		    error = XFS_ERROR(EIO);
+		    break;
+		}
+		case 0:
+		case XLOG_CONTINUE_TRANS: {
+		    error = xlog_recover_add_to_trans(trans, dp,
+				   INT_GET(ohead->oh_len, ARCH_CONVERT));
+		    break;
+		}
+		default: {
+		    xlog_warn("XFS: xlog_recover_process_data: bad flag");
+		    ASSERT(0);
+		    error = XFS_ERROR(EIO);
+		    break;
+		}
+	    } /* switch */
+	    if (error)
+		return error;
+	} /* if */
+	dp += INT_GET(ohead->oh_len, ARCH_CONVERT);
+	num_logops--;
+    }
+    return( 0 );
+}	/* xlog_recover_process_data */
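+
+/*
+ * Condensed model of the walk above: a log record body is a sequence of
+ * operation headers, each followed by oh_len bytes of payload, so the
+ * cursor advances as
+ *
+ *	ohead = (xlog_op_header_t *)dp;
+ *	dp += sizeof(xlog_op_header_t);
+ *	... dispatch on ohead->oh_flags ...
+ *	dp += INT_GET(ohead->oh_len, ARCH_CONVERT);
+ *
+ * until either lp is reached or h_num_logops operations have been seen.
+ */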
+
+
+/*
+ * Process an extent free intent item that was recovered from
+ * the log.  We need to free the extents that it describes.
+ */
+STATIC void
+xlog_recover_process_efi(xfs_mount_t		*mp,
+			 xfs_efi_log_item_t	*efip)
+{
+	xfs_efd_log_item_t	*efdp;
+	xfs_trans_t		*tp;
+	int			i;
+	xfs_extent_t		*extp;
+	xfs_fsblock_t		startblock_fsb;
+
+	ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
+
+	/*
+	 * First check the validity of the extents described by the
+	 * EFI.	 If any are bad, then assume that all are bad and
+	 * just toss the EFI.
+	 */
+	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+		extp = &(efip->efi_format.efi_extents[i]);
+		startblock_fsb = XFS_BB_TO_FSB(mp,
+				   XFS_FSB_TO_DADDR(mp, extp->ext_start));
+		if ((startblock_fsb == 0) ||
+		    (extp->ext_len == 0) ||
+		    (startblock_fsb >= mp->m_sb.sb_dblocks) ||
+		    (extp->ext_len >= mp->m_sb.sb_agblocks)) {
+			/*
+			 * This will pull the EFI from the AIL and
+			 * free the memory associated with it.
+			 */
+			xfs_efi_release(efip, efip->efi_format.efi_nextents);
+			return;
+		}
+	}
+
+	tp = xfs_trans_alloc(mp, 0);
+	xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
+
+	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+		extp = &(efip->efi_format.efi_extents[i]);
+		xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+		xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
+					 extp->ext_len);
+	}
+
+	efip->efi_flags |= XFS_EFI_RECOVERED;
+	xfs_trans_commit(tp, 0, NULL);
+}	/* xlog_recover_process_efi */
+
+
+/*
+ * Verify that, once we've encountered something other than an EFI
+ * in the AIL, there are no more EFIs in the AIL.
+ */
+#if defined(DEBUG)
+STATIC void
+xlog_recover_check_ail(xfs_mount_t	*mp,
+		       xfs_log_item_t	*lip,
+		       int		gen)
+{
+	int	orig_gen;
+
+	orig_gen = gen;
+	do {
+		ASSERT(lip->li_type != XFS_LI_EFI);
+		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+		/*
+		 * The check will be bogus if we restart from the
+		 * beginning of the AIL, so ASSERT that we don't.
+		 * We never should since we're holding the AIL lock
+		 * the entire time.
+		 */
+		ASSERT(gen == orig_gen);
+	} while (lip != NULL);
+}
+#endif	/* DEBUG */
+
+
+/*
+ * When this is called, all of the EFIs which did not have
+ * corresponding EFDs should be in the AIL.  What we do now
+ * is free the extents associated with each one.
+ *
+ * Since we process the EFIs in normal transactions, they
+ * will be removed at some point after the commit.  This prevents
+ * us from just walking down the list processing each one.
+ * We'll use a flag in the EFI to skip those that we've already
+ * processed and use the AIL iteration mechanism's generation
+ * count to try to speed this up at least a bit.
+ *
+ * When we start, we know that the EFIs are the only things in
+ * the AIL.  As we process them, however, other items are added
+ * to the AIL.	Since everything added to the AIL must come after
+ * everything already in the AIL, we stop processing as soon as
+ * we see something other than an EFI in the AIL.
+ */
+STATIC void
+xlog_recover_process_efis(xlog_t	*log)
+{
+	xfs_log_item_t		*lip;
+	xfs_efi_log_item_t	*efip;
+	int			gen;
+	xfs_mount_t		*mp;
+	SPLDECL(s);
+
+	mp = log->l_mp;
+	AIL_LOCK(mp,s);
+
+	lip = xfs_trans_first_ail(mp, &gen);
+	while (lip != NULL) {
+		/*
+		 * We're done when we see something other than an EFI.
+		 */
+		if (lip->li_type != XFS_LI_EFI) {
+			xlog_recover_check_ail(mp, lip, gen);
+			break;
+		}
+
+		/*
+		 * Skip EFIs that we've already processed.
+		 */
+		efip = (xfs_efi_log_item_t *)lip;
+		if (efip->efi_flags & XFS_EFI_RECOVERED) {
+			lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+			continue;
+		}
+
+		AIL_UNLOCK(mp, s);
+		xlog_recover_process_efi(mp, efip);
+		AIL_LOCK(mp,s);
+		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+	}
+	AIL_UNLOCK(mp, s);
+}	/* xlog_recover_process_efis */
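+
+/*
+ * Note on the locking pattern above: xlog_recover_process_efi() commits
+ * a transaction and so cannot run under the AIL lock.  The loop
+ * therefore drops and retakes the lock around each unprocessed EFI,
+ * relying on the XFS_EFI_RECOVERED flag (checked while holding the
+ * lock) to avoid handling an entry twice, and on the generation count
+ * to notice if the AIL changed underneath it.
+ */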
+
+
+/*
+ * This routine performs a transaction to null out a bad inode pointer
+ * in an agi unlinked inode hash bucket.
+ */
+STATIC void
+xlog_recover_clear_agi_bucket(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno,
+	int		bucket)
+{
+	xfs_trans_t	*tp;
+	xfs_agi_t	*agi;
+	xfs_daddr_t	agidaddr;
+	xfs_buf_t	*agibp;
+	int		offset;
+	int		error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
+	xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
+
+	agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agidaddr,
+				   1, 0, &agibp);
+	if (error) {
+		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+		return;
+	}
+
+	agi = XFS_BUF_TO_AGI(agibp);
+	if (INT_GET(agi->agi_magicnum, ARCH_CONVERT) != XFS_AGI_MAGIC) {
+		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+		return;
+	}
+	ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+
+	INT_SET(agi->agi_unlinked[bucket], ARCH_CONVERT, NULLAGINO);
+	offset = offsetof(xfs_agi_t, agi_unlinked) +
+		 (sizeof(xfs_agino_t) * bucket);
+	xfs_trans_log_buf(tp, agibp, offset,
+			  (offset + sizeof(xfs_agino_t) - 1));
+
+	(void) xfs_trans_commit(tp, 0, NULL);
+}	/* xlog_recover_clear_agi_bucket */
+
+
+/*
+ * xlog_iunlink_recover
+ *
+ * This is called during recovery to process any inodes which
+ * we unlinked but had not yet freed when the system crashed.  These
+ * inodes will be on the lists in the AGI blocks.  What we do
+ * here is scan all the AGIs and fully truncate and free any
+ * inodes found on the lists.  Each inode is removed from the
+ * lists when it has been fully truncated and is freed.	 The
+ * freeing of the inode and its removal from the list must be
+ * atomic.
+ */
+void
+xlog_recover_process_iunlinks(xlog_t	*log)
+{
+	xfs_mount_t	*mp;
+	xfs_agnumber_t	agno;
+	xfs_agi_t	*agi;
+	xfs_daddr_t	agidaddr;
+	xfs_buf_t	*agibp;
+	xfs_buf_t	*ibp;
+	xfs_dinode_t	*dip;
+	xfs_inode_t	*ip;
+	xfs_agino_t	agino;
+	xfs_ino_t	ino;
+	int		bucket;
+	int		error;
+	uint		mp_dmevmask;
+
+	mp = log->l_mp;
+
+	/*
+	 * Prevent any DMAPI event from being sent while in this function.
+	 * Not a problem for xfs since the file system isn't mounted
+	 * yet.	 It is a problem for cxfs recovery.
+	 */
+	mp_dmevmask = mp->m_dmevmask;
+	mp->m_dmevmask = 0;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		/*
+		 * Find the agi for this ag.
+		 */
+		agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+		agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr, 1, 0);
+		if (XFS_BUF_ISERROR(agibp)) {
+			xfs_ioerror_alert("xlog_recover_process_iunlinks(agi#1)",
+					  log->l_mp, agibp, agidaddr);
+		}
+		agi = XFS_BUF_TO_AGI(agibp);
+		ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+
+		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
+
+			agino = INT_GET(agi->agi_unlinked[bucket], ARCH_CONVERT);
+			while (agino != NULLAGINO) {
+
+				/*
+				 * Release the agi buffer so that it can
+				 * be acquired in the normal course of the
+				 * transaction to truncate and free the inode.
+				 */
+				xfs_buf_relse(agibp);
+
+				ino = XFS_AGINO_TO_INO(mp, agno, agino);
+				error = xfs_iget(mp, NULL, ino, 0, &ip, 0);
+				ASSERT(error || (ip != NULL));
+
+				if (!error) {
+					/*
+					 * Get the on disk inode to find the
+					 * next inode in the bucket.
+					 */
+					error = xfs_itobp(mp, NULL, ip, &dip,
+							&ibp, 0);
+					ASSERT(error || (dip != NULL));
+				}
+
+				if (!error) {
+					ASSERT(ip->i_d.di_nlink == 0);
+					ASSERT(ip->i_d.di_mode != 0);
+
+					/* setup for the next pass */
+					agino = INT_GET(dip->di_next_unlinked,
+							ARCH_CONVERT);
+					xfs_buf_relse(ibp);
+					/*
+					 * Prevent any DMAPI event from
+					 * being sent when the
+					 * reference on the inode is
+					 * dropped.  Not a problem for
+					 * xfs since the file system
+					 * isn't mounted yet.  It is a
+					 * problem for cxfs recovery.
+					 */
+					ip->i_d.di_dmevmask = 0;
+
+					/*
+					 * Drop our reference to the
+					 * inode.  If there are no
+					 * other references, this will
+					 * send the inode to
+					 * xfs_inactive() which will
+					 * truncate the file and free
+					 * the inode.
+					 */
+					VN_RELE(XFS_ITOV(ip));
+				} else {
+					/*
+					 * We can't read in the inode
+					 * this bucket points to, or
+					 * this inode is messed up.  Just
+					 * ditch this bucket of inodes.	 We
+					 * will lose some inodes and space,
+					 * but at least we won't hang.	Call
+					 * xlog_recover_clear_agi_bucket()
+					 * to perform a transaction to clear
+					 * the inode pointer in the bucket.
+					 */
+					xlog_recover_clear_agi_bucket(mp, agno,
+							bucket);
+
+					agino = NULLAGINO;
+				}
+
+				/*
+				 * Reacquire the AGI buffer and continue around
+				 * the loop.
+				 */
+				agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+				agibp = xfs_buf_read(mp->m_ddev_targp,
+						 agidaddr, 1, 0);
+				if (XFS_BUF_ISERROR(agibp)) {
+					xfs_ioerror_alert("xlog_recover_process_iunlinks(agi#2)",
+							  log->l_mp, agibp, agidaddr);
+				}
+				agi = XFS_BUF_TO_AGI(agibp);
+				ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+			}
+		}
+
+		/*
+		 * Release the buffer for the current agi so we can
+		 * go on to the next one.
+		 */
+		xfs_buf_relse(agibp);
+	}
+
+	mp->m_dmevmask = mp_dmevmask;
+
+}	/* xlog_recover_process_iunlinks */
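+
+/*
+ * Shape of the traversal above, reduced to pseudocode for clarity:
+ *
+ *	for each AG
+ *	    for each of the XFS_AGI_UNLINKED_BUCKETS hash buckets
+ *		agino = bucket head
+ *		while (agino != NULLAGINO)
+ *		    read the inode, agino = di_next_unlinked, VN_RELE() it
+ *
+ * The AGI buffer is released around each inode so that the truncate and
+ * free transactions can acquire it in the normal way.
+ */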
+
+
+/*
+ * Stamp cycle number in every block
+ *
+ * This routine is also called in xfs_log.c
+ */
+/*ARGSUSED*/
+void
+xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog)
+{
+	int	i, j, k;
+	int	size = iclog->ic_offset + iclog->ic_roundoff;
+	xfs_caddr_t dp;
+	union ich {
+		xlog_rec_ext_header_t	hic_xheader;
+		char			hic_sector[XLOG_HEADER_SIZE];
+	} *xhdr;
+	uint	cycle_lsn;
+
+#ifdef DEBUG
+	uint	*up;
+	uint	chksum = 0;
+
+	up = (uint *)iclog->ic_datap;
+	/* divide length by 4 to get # words */
+	for (i=0; i<size >> 2; i++) {
+		chksum ^= INT_GET(*up, ARCH_CONVERT);
+		up++;
+	}
+	INT_SET(iclog->ic_header.h_chksum, ARCH_CONVERT, chksum);
+#endif /* DEBUG */
+
+	cycle_lsn = CYCLE_LSN_NOCONV(iclog->ic_header.h_lsn, ARCH_CONVERT);
+
+	dp = iclog->ic_datap;
+	for (i = 0; i < BTOBB(size) &&
+		i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+		iclog->ic_header.h_cycle_data[i] = *(uint *)dp;
+		*(uint *)dp = cycle_lsn;
+		dp += BBSIZE;
+	}
+
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+		xhdr = (union ich*)&iclog->ic_header;
+		for ( ; i < BTOBB(size); i++) {
+			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			xhdr[j].hic_xheader.xh_cycle_data[k] = *(uint *)dp;
+			*(uint *)dp = cycle_lsn;
+			dp += BBSIZE;
+		}
+
+		for (i = 1; i < log->l_iclog_heads; i++) {
+			xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
+		}
+	}
+
+}	/* xlog_pack_data */
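+
+/*
+ * Worked example of the stamping above (cycle number assumed): if the
+ * record's LSN is in cycle 5, the first word of every 512-byte block of
+ * the payload is saved into h_cycle_data[] (or into the extended
+ * headers for v2 logs) and overwritten with the cycle:
+ *
+ *	iclog->ic_header.h_cycle_data[i] = *(uint *)dp;	-- save word
+ *	*(uint *)dp = cycle_lsn;			-- stamp cycle 5
+ *
+ * Recovery can then tell how far a partially-written record got by
+ * checking which blocks still carry the expected cycle number.
+ */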
+
+
+/*ARGSUSED*/
+STATIC void
+xlog_unpack_data(xlog_rec_header_t *rhead,
+		 xfs_caddr_t	   dp,
+		 xlog_t		   *log)
+{
+	int i, j, k;
+	union ich {
+		xlog_rec_header_t	hic_header;
+		xlog_rec_ext_header_t	hic_xheader;
+		char			hic_sector[XLOG_HEADER_SIZE];
+	} *xhdr;
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+	uint *up = (uint *)dp;
+	uint chksum = 0;
+#endif
+
+	for (i=0; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) &&
+		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+		*(uint *)dp = *(uint *)&rhead->h_cycle_data[i];
+		dp += BBSIZE;
+	}
+
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+		xhdr = (union ich*)rhead;
+		for ( ; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); i++) {
+			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			*(uint *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
+			dp += BBSIZE;
+		}
+	}
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+	/* divide length by 4 to get # words */
+	for (i=0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) {
+		chksum ^= INT_GET(*up, ARCH_CONVERT);
+		up++;
+	}
+	if (chksum != INT_GET(rhead->h_chksum, ARCH_CONVERT)) {
+	    if (!INT_ISZERO(rhead->h_chksum, ARCH_CONVERT) ||
+		((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
+		    cmn_err(CE_DEBUG,
+			"XFS: LogR chksum mismatch: was (0x%x) is (0x%x)",
+			    INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum);
+		    cmn_err(CE_DEBUG,
+"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
+		    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+			    cmn_err(CE_DEBUG,
+				"XFS: LogR this is a LogV2 filesystem\n");
+		    }
+		    log->l_flags |= XLOG_CHKSUM_MISMATCH;
+	    }
+	}
+#endif /* DEBUG && XFS_LOUD_RECOVERY */
+}	/* xlog_unpack_data */
+
+
+/*
+ * Read the log from tail to head and process the log records found.
+ * Handle the two cases where the tail and head are in the same cycle
+ * and where the active portion of the log wraps around the end of
+ * the physical log separately.	 The pass parameter is passed through
+ * to the routines called to process the data and is not looked at
+ * here.
+ */
+STATIC int
+xlog_do_recovery_pass(xlog_t	*log,
+		      xfs_daddr_t	head_blk,
+		      xfs_daddr_t	tail_blk,
+		      int	pass)
+{
+    xlog_rec_header_t	*rhead;
+    xfs_daddr_t		blk_no;
+    xfs_caddr_t		bufaddr;
+    xfs_buf_t		*hbp, *dbp;
+    int			error, h_size;
+    int			bblks, split_bblks;
+    int			hblks, split_hblks, wrapped_hblks;
+    xlog_recover_t	*rhash[XLOG_RHASH_SIZE];
+
+    error = 0;
+
+
+    /*
+     * Read the header of the tail block and get the iclog buffer size from
+     * h_size.	Use this to tell how many sectors make up the log header.
+     */
+    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+	/*
+	 * When using variable length iclogs, read first sector of iclog
+	 * header and extract the header size from it.	Get a new hbp that
+	 * is the correct size.
+	 */
+	hbp = xlog_get_bp(1, log->l_mp);
+	if (!hbp)
+	    return ENOMEM;
+	if ((error = xlog_bread(log, tail_blk, 1, hbp)))
+	    goto bread_err1;
+	rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
+	ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) ==
+						XLOG_HEADER_MAGIC_NUM);
+	if ((INT_GET(rhead->h_version, ARCH_CONVERT) & (~XLOG_VERSION_OKBITS)) != 0) {
+	    xlog_warn("XFS: xlog_do_recovery_pass: unrecognised log version number.");
+	    error = XFS_ERROR(EIO);
+	    goto bread_err1;
+	}
+	h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
+
+	if ((INT_GET(rhead->h_version, ARCH_CONVERT) & XLOG_VERSION_2) &&
+	    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+	    hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+	    if (h_size % XLOG_HEADER_CYCLE_SIZE)
+		hblks++;
+	    xlog_put_bp(hbp);
+	    hbp = xlog_get_bp(hblks, log->l_mp);
+	} else {
+	    hblks=1;
+	}
+    } else {
+	hblks=1;
+	hbp = xlog_get_bp(1, log->l_mp);
+	h_size = XLOG_BIG_RECORD_BSIZE;
+    }
+
+    if (!hbp)
+	return ENOMEM;
+    dbp = xlog_get_bp(BTOBB(h_size), log->l_mp);
+    if (!dbp) {
+	xlog_put_bp(hbp);
+	return ENOMEM;
+    }
+
+    bzero(rhash, sizeof(rhash));
+    if (tail_blk <= head_blk) {
+	for (blk_no = tail_blk; blk_no < head_blk; ) {
+	    if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+		goto bread_err2;
+	    rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
+	    ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
+	    ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) <= INT_MAX);
+	    bblks = (int) BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));	/* blocks in data section */
+
+	    if ((INT_GET(rhead->h_magicno, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM) ||
+		(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) > INT_MAX) ||
+		(bblks <= 0) ||
+		(blk_no > log->l_logBBsize)) {
+		    error = EFSCORRUPTED;
+		    goto bread_err2;
+	    }
+
+	    if ((INT_GET(rhead->h_version, ARCH_CONVERT) & (~XLOG_VERSION_OKBITS)) != 0) {
+		xlog_warn("XFS: xlog_do_recovery_pass: unrecognised log version number.");
+		error = XFS_ERROR(EIO);
+		goto bread_err2;
+	    }
+	    if (bblks > 0) {
+		if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
+		    goto bread_err2;
+		xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
+		if ((error = xlog_recover_process_data(log, rhash,
+						      rhead, XFS_BUF_PTR(dbp),
+						      pass)))
+			goto bread_err2;
+	    }
+	    blk_no += (bblks+hblks);
+	}
+    } else {
+	/*
+	 * Perform recovery around the end of the physical log.	 When the head
+	 * is not on the same cycle number as the tail, we can't do a sequential
+	 * recovery as above.
+	 */
+	blk_no = tail_blk;
+	while (blk_no < log->l_logBBsize) {
+	    /*
+	     * Check for header wrapping around physical end-of-log
+	     */
+	    wrapped_hblks = 0;
+	    if (blk_no+hblks <= log->l_logBBsize) {
+		/* Read header in one read */
+		if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+		    goto bread_err2;
+	    } else {
+		/* This log record is split across physical end of log */
+		split_hblks = 0;
+		if (blk_no != log->l_logBBsize) {
+		    /* some data is before physical end of log */
+		    ASSERT(blk_no <= INT_MAX);
+		    split_hblks = log->l_logBBsize - (int)blk_no;
+		    ASSERT(split_hblks > 0);
+		    if ((error = xlog_bread(log, blk_no, split_hblks, hbp)))
+			goto bread_err2;
+		}
+		bufaddr = XFS_BUF_PTR(hbp);
+		XFS_BUF_SET_PTR(hbp, bufaddr + BBTOB(split_hblks),
+			BBTOB(hblks - split_hblks));
+		wrapped_hblks = hblks - split_hblks;
+		if ((error = xlog_bread(log, 0, wrapped_hblks, hbp)))
+		    goto bread_err2;
+		XFS_BUF_SET_PTR(hbp, bufaddr, hblks);
+	    }
+	    rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
+	    ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
+	    ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) <= INT_MAX);
+	    bblks = (int) BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
+
+	    /* LR body must have data or it wouldn't have been written */
+	    ASSERT(bblks > 0);
+	    blk_no += hblks;			/* successfully read header */
+
+	    if ((INT_GET(rhead->h_magicno, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM) ||
+		(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) > INT_MAX) ||
+		(bblks <= 0)) {
+		    error = EFSCORRUPTED;
+		    goto bread_err2;
+	    }
+
+	    /* Read in data for log record */
+	    if (blk_no+bblks <= log->l_logBBsize) {
+		if ((error = xlog_bread(log, blk_no, bblks, dbp)))
+		    goto bread_err2;
+	    } else {
+		/* This log record is split across physical end of log */
+		split_bblks = 0;
+		if (blk_no != log->l_logBBsize) {
+
+		    /* some data is before physical end of log */
+		    ASSERT(blk_no <= INT_MAX);
+		    split_bblks = log->l_logBBsize - (int)blk_no;
+		    ASSERT(split_bblks > 0);
+		    if ((error = xlog_bread(log, blk_no, split_bblks, dbp)))
+			goto bread_err2;
+		}
+		bufaddr = XFS_BUF_PTR(dbp);
+		XFS_BUF_SET_PTR(dbp, bufaddr + BBTOB(split_bblks),
+			BBTOB(bblks - split_bblks));
+		if ((error = xlog_bread(log, wrapped_hblks,
+					bblks - split_bblks, dbp)))
+		    goto bread_err2;
+		XFS_BUF_SET_PTR(dbp, bufaddr, XLOG_BIG_RECORD_BSIZE);
+	    }
+	    xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
+	    if ((error = xlog_recover_process_data(log, rhash,
+						  rhead, XFS_BUF_PTR(dbp),
+						  pass)))
+		goto bread_err2;
+	    blk_no += bblks;
+	}
+
+	ASSERT(blk_no >= log->l_logBBsize);
+	blk_no -= log->l_logBBsize;
+
+	/* read first part of physical log */
+	while (blk_no < head_blk) {
+	    if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+		goto bread_err2;
+	    rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
+	    ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
+	    ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) <= INT_MAX);
+	    bblks = (int) BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
+	    ASSERT(bblks > 0);
+	    if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
+		goto bread_err2;
+	    xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
+	    if ((error = xlog_recover_process_data(log, rhash,
+						  rhead, XFS_BUF_PTR(dbp),
+						  pass)))
+		goto bread_err2;
+	    blk_no += (bblks+hblks);
+	}
+    }
+
+bread_err2:
+    xlog_put_bp(dbp);
+bread_err1:
+    xlog_put_bp(hbp);
+
+    return error;
+}
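+
+/*
+ * Worked example of the wrap-around handling above (sizes assumed): in
+ * a 1000-block log, a 4-block record body starting at block 998 is read
+ * in two pieces -- split_bblks = 1000 - 998 = 2 blocks from the end of
+ * the log, then, after advancing the buffer pointer by BBTOB(2) bytes,
+ * the remaining 2 blocks from the physical start of the log (offset by
+ * wrapped_hblks when the header wrapped as well) -- yielding one
+ * contiguous record in memory for xlog_unpack_data().
+ */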
+
+/*
+ * Do the recovery of the log.	We actually do this in two phases.
+ * The two passes are necessary in order to implement the function
+ * of cancelling a record written into the log.	 The first pass
+ * determines those things which have been cancelled, and the
+ * second pass replays log items normally except for those which
+ * have been cancelled.	 The handling of the replay and cancellations
+ * takes place in the log item type specific routines.
+ *
+ * The table of items which have cancel records in the log is allocated
+ * and freed at this level, since only here do we know when all of
+ * the log recovery has been completed.
+ */
+STATIC int
+xlog_do_log_recovery(xlog_t	*log,
+		     xfs_daddr_t	head_blk,
+		     xfs_daddr_t	tail_blk)
+{
+	int		error;
+#ifdef DEBUG
+	int		i;
+#endif
+
+	/*
+	 * First do a pass to find all of the cancelled buf log items.
+	 * Store them in the buf_cancel_table for use in the second pass.
+	 */
+	log->l_buf_cancel_table =
+		(xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
+						 sizeof(xfs_buf_cancel_t*),
+						 KM_SLEEP);
+	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
+				      XLOG_RECOVER_PASS1);
+	if (error != 0) {
+		kmem_free(log->l_buf_cancel_table,
+			  XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
+		log->l_buf_cancel_table = NULL;
+		return error;
+	}
+	/*
+	 * Then do a second pass to actually recover the items in the log.
+	 * When it is complete free the table of buf cancel items.
+	 */
+	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
+				      XLOG_RECOVER_PASS2);
+#ifdef	DEBUG
+	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) {
+		ASSERT(log->l_buf_cancel_table[i] == NULL);
+	}
+#endif	/* DEBUG */
+	kmem_free(log->l_buf_cancel_table,
+		  XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
+	log->l_buf_cancel_table = NULL;
+
+	return error;
+}
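+
+/*
+ * Lifecycle of the cancel table used above: XLOG_BC_TABLE_SIZE list
+ * heads of xfs_buf_cancel_t entries are allocated before pass 1, filled
+ * by xlog_recover_do_buffer_pass1(), consulted by pass 2 to skip
+ * cancelled buffers, and expected to be empty again -- every cancel
+ * record matched -- before the table is freed, which is exactly what
+ * the DEBUG loop asserts.
+ */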
+
+/*
+ * Do the actual recovery
+ */
+STATIC int
+xlog_do_recover(xlog_t	*log,
+		xfs_daddr_t head_blk,
+		xfs_daddr_t tail_blk)
+{
+	int		error;
+	xfs_buf_t	*bp;
+	xfs_sb_t	*sbp;
+
+	/*
+	 * First replay the images in the log.
+	 */
+	error = xlog_do_log_recovery(log, head_blk, tail_blk);
+	if (error) {
+		return error;
+	}
+
+	XFS_bflush(log->l_mp->m_ddev_targp);
+
+	/*
+	 * If IO errors happened during recovery, bail out.
+	 */
+	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+		return (EIO);
+	}
+
+	/*
+	 * We now update the tail_lsn since much of the recovery has completed
+	 * and there may be space available to use.  If there were no extent
+	 * frees or iunlinks, we can free up the entire log and set the
+	 * tail_lsn to be the last_sync_lsn.  This was set in xlog_find_tail
+	 * to be the lsn of the last known good LR on disk.  If there are
+	 * extent frees or iunlinks they will have some entries in the AIL;
+	 * so we look at the AIL to determine how to set the tail_lsn.
+	 */
+	xlog_assign_tail_lsn(log->l_mp, NULL);
+
+	/*
+	 * Now that we've finished replaying all buffer and inode
+	 * updates, re-read in the superblock.
+	 */
+	bp = xfs_getsb(log->l_mp, 0);
+	XFS_BUF_UNDONE(bp);
+	XFS_BUF_READ(bp);
+	xfsbdstrat(log->l_mp, bp);
+	if ((error = xfs_iowait(bp))) {
+		xfs_ioerror_alert("xlog_do_recover",
+				  log->l_mp, bp, XFS_BUF_ADDR(bp));
+		ASSERT(0);
+		xfs_buf_relse(bp);
+		return error;
+	}
+
+	/* convert superblock from on-disk format */
+
+	sbp=&log->l_mp->m_sb;
+	xfs_xlatesb(XFS_BUF_TO_SBP(bp), sbp, 1, ARCH_CONVERT, XFS_SB_ALL_BITS);
+	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
+	ASSERT(XFS_SB_GOOD_VERSION(sbp));
+	xfs_buf_relse(bp);
+
+	xlog_recover_check_summary(log);
+
+	/* Normal transactions can now occur */
+	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+	return 0;
+}	/* xlog_do_recover */
+
+/*
+ * Perform recovery and re-initialize some log variables in xlog_find_tail.
+ *
+ * Return error or zero.
+ */
+int
+xlog_recover(xlog_t *log, int readonly)
+{
+	xfs_daddr_t head_blk, tail_blk;
+	int	error;
+
+	/* find the tail of the log */
+
+	if ((error = xlog_find_tail(log, &head_blk, &tail_blk, readonly)))
+		return error;
+
+	if (tail_blk != head_blk) {
+#ifndef __KERNEL__
+		extern xfs_daddr_t HEAD_BLK, TAIL_BLK;
+		head_blk = HEAD_BLK;
+		tail_blk = TAIL_BLK;
+#endif
+		/* There used to be a comment here:
+		 *
+		 * disallow recovery on read-only mounts.  note -- mount
+		 * checks for ENOSPC and turns it into an intelligent
+		 * error message.
+		 * ...but this is no longer true.  Now, unless you specify
+		 * NORECOVERY (in which case this function would never be
+		 * called), it enables read-write access long enough to do
+		 * recovery.
+		 */
+		if (readonly) {
+#ifdef __KERNEL__
+			if ((error = xfs_recover_read_only(log)))
+				return error;
+#else
+			return ENOSPC;
+#endif
+		}
+
+#ifdef __KERNEL__
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+		cmn_err(CE_NOTE,
+			"Starting XFS recovery on filesystem: %s (dev: %d/%d)",
+			log->l_mp->m_fsname, MAJOR(log->l_dev),
+			MINOR(log->l_dev));
+#else
+		cmn_err(CE_NOTE,
+			"!Starting XFS recovery on filesystem: %s (dev: %d/%d)",
+			log->l_mp->m_fsname, MAJOR(log->l_dev),
+			MINOR(log->l_dev));
+#endif
+#endif
+		error = xlog_do_recover(log, head_blk, tail_blk);
+		log->l_flags |= XLOG_RECOVERY_NEEDED;
+		if (readonly)
+			XFS_MTOVFS(log->l_mp)->vfs_flag |= VFS_RDONLY;
+	}
+	return error;
+}	/* xlog_recover */
+
+
+/*
+ * In the first part of recovery we replay inodes and buffers and build
+ * up the list of extent free items which need to be processed.	 Here
+ * we process the extent free items and clean up the on disk unlinked
+ * inode lists.	 This is separated from the first part of recovery so
+ * that the root and real-time bitmap inodes can be read in from disk in
+ * between the two stages.  This is necessary so that we can free space
+ * in the real-time portion of the file system.
+ */
+int
+xlog_recover_finish(xlog_t *log, int mfsi_flags)
+{
+	/*
+	 * Now we're ready to do the transactions needed for the
+	 * rest of recovery.  Start with completing all the extent
+	 * free intent records and then process the unlinked inode
+	 * lists.  At this point, we essentially run in normal mode
+	 * except that we're still performing recovery actions
+	 * rather than accepting new requests.
+	 */
+	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
+		xlog_recover_process_efis(log);
+		/*
+		 * Sync the log to get all the EFIs out of the AIL.
+		 * This isn't absolutely necessary, but it helps in
+		 * case the unlink transactions would have problems
+		 * pushing the EFIs out of the way.
+		 */
+		xfs_log_force(log->l_mp, (xfs_lsn_t)0,
+			      (XFS_LOG_FORCE | XFS_LOG_SYNC));
+
+		if ((mfsi_flags & XFS_MFSI_NOUNLINK) == 0) {
+			xlog_recover_process_iunlinks(log);
+		}
+
+		xlog_recover_check_summary(log);
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+		cmn_err(CE_NOTE,
+			"Ending XFS recovery on filesystem: %s (dev: %d/%d)",
+			log->l_mp->m_fsname, MAJOR(log->l_dev),
+			MINOR(log->l_dev));
+#else
+		cmn_err(CE_NOTE,
+			"!Ending XFS recovery on filesystem: %s (dev: %d/%d)",
+			log->l_mp->m_fsname, MAJOR(log->l_dev),
+			MINOR(log->l_dev));
+#endif
+		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
+	} else {
+		cmn_err(CE_DEBUG,
+			"!Ending clean XFS mount for filesystem: %s\n",
+			log->l_mp->m_fsname);
+	}
+	return 0;
+}	/* xlog_recover_finish */
+
+
+#if defined(DEBUG)
+/*
+ * Read all of the agf and agi counters and check that they
+ * are consistent with the superblock counters.
+ */
+void
+xlog_recover_check_summary(xlog_t	*log)
+{
+	xfs_mount_t	*mp;
+	xfs_agf_t	*agfp;
+	xfs_agi_t	*agip;
+	xfs_buf_t	*agfbp;
+	xfs_buf_t	*agibp;
+	xfs_daddr_t	agfdaddr;
+	xfs_daddr_t	agidaddr;
+	xfs_buf_t	*sbbp;
+#ifdef XFS_LOUD_RECOVERY
+	xfs_sb_t	*sbp;
+#endif
+	xfs_agnumber_t	agno;
+	__uint64_t	freeblks;
+	__uint64_t	itotal;
+	__uint64_t	ifree;
+
+	mp = log->l_mp;
+
+	freeblks = 0LL;
+	itotal = 0LL;
+	ifree = 0LL;
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR);
+		agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr, 1, 0);
+		if (XFS_BUF_ISERROR(agfbp)) {
+			xfs_ioerror_alert("xlog_recover_check_summary(agf)",
+					  log->l_mp, agfbp, agfdaddr);
+		}
+		agfp = XFS_BUF_TO_AGF(agfbp);
+		ASSERT(INT_GET(agfp->agf_magicnum, ARCH_CONVERT) == XFS_AGF_MAGIC);
+		ASSERT(XFS_AGF_GOOD_VERSION(INT_GET(agfp->agf_versionnum, ARCH_CONVERT)));
+		ASSERT(INT_GET(agfp->agf_seqno, ARCH_CONVERT) == agno);
+
+		freeblks += INT_GET(agfp->agf_freeblks, ARCH_CONVERT) +
+			    INT_GET(agfp->agf_flcount, ARCH_CONVERT);
+		xfs_buf_relse(agfbp);
+
+		agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+		agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr, 1, 0);
+		if (XFS_BUF_ISERROR(agibp)) {
+			xfs_ioerror_alert("xlog_recover_check_summary(agi)",
+					  log->l_mp, agibp, agidaddr);
+		}
+		agip = XFS_BUF_TO_AGI(agibp);
+		ASSERT(INT_GET(agip->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+		ASSERT(XFS_AGI_GOOD_VERSION(INT_GET(agip->agi_versionnum, ARCH_CONVERT)));
+		ASSERT(INT_GET(agip->agi_seqno, ARCH_CONVERT) == agno);
+
+		itotal += INT_GET(agip->agi_count, ARCH_CONVERT);
+		ifree += INT_GET(agip->agi_freecount, ARCH_CONVERT);
+		xfs_buf_relse(agibp);
+	}
+
+	sbbp = xfs_getsb(mp, 0);
+#ifdef XFS_LOUD_RECOVERY
+	sbp = XFS_BUF_TO_SBP(sbbp);
+	cmn_err(CE_NOTE,
+		"xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
+		sbp->sb_icount, itotal);
+	cmn_err(CE_NOTE,
+		"xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
+		sbp->sb_ifree, ifree);
+	cmn_err(CE_NOTE,
+		"xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
+		sbp->sb_fdblocks, freeblks);
+#if 0
+	/*
+	 * This is turned off until I account for the allocation
+	 * btree blocks which live in free space.
+	 */
+	ASSERT(sbp->sb_icount == itotal);
+	ASSERT(sbp->sb_ifree == ifree);
+	ASSERT(sbp->sb_fdblocks == freeblks);
+#endif
+#endif
+	xfs_buf_relse(sbbp);
+}
+#endif /* DEBUG */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_log_recover.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_log_recover.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_log_recover.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_log_recover.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_LOG_RECOVER_H__
+#define __XFS_LOG_RECOVER_H__
+
+/*
+ * Macros, structures, prototypes for internal log manager use.
+ */
+
+#define XLOG_RHASH_BITS	 4
+#define XLOG_RHASH_SIZE 16
+#define XLOG_RHASH_SHIFT 2
+#define XLOG_RHASH(tid) \
+	((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
+
+#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1)
+
+
+/*
+ * item headers are in ri_buf[0].  Additional buffers follow.
+ */
+typedef struct xlog_recover_item {
+	struct xlog_recover_item *ri_next;
+	struct xlog_recover_item *ri_prev;
+	int			 ri_type;
+	int			 ri_cnt;	/* count of regions found */
+	int			 ri_total;	/* total regions */
+	xfs_log_iovec_t		 *ri_buf;	/* ptr to regions buffer */
+} xlog_recover_item_t;
+
+struct xlog_tid;
+typedef struct xlog_recover {
+	struct xlog_recover *r_next;
+	xlog_tid_t	    r_log_tid;		/* log's transaction id */
+	xfs_trans_header_t  r_theader;		/* trans header for partial */
+	int		    r_state;		/* not needed */
+	xfs_lsn_t	    r_lsn;		/* xact lsn */
+	xlog_recover_item_t *r_itemq;		/* q for items */
+} xlog_recover_t;
+
+#define ITEM_TYPE(i)	(*(ushort *)(i)->ri_buf[0].i_addr)
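+
+/*
+ * ITEM_TYPE() works because every log item format structure begins with
+ * a 16-bit type field (blf_type, ilf_type, and so on) and region 0 of a
+ * recover item always holds that format structure, e.g.:
+ *
+ *	if (ITEM_TYPE(item) == XFS_LI_INODE)
+ *		... treat ri_buf[0] as xfs_inode_log_format_t ...
+ */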
+
+/*
+ * This is the number of entries in the l_buf_cancel_table used during
+ * recovery.
+ */
+#define XLOG_BC_TABLE_SIZE	64
+
+#define XLOG_RECOVER_PASS1	1
+#define XLOG_RECOVER_PASS2	2
+
+#endif	/* __XFS_LOG_RECOVER_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_mac.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_mac.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_mac.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_mac.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+static xfs_mac_label_t *mac_low_high_lp;
+static xfs_mac_label_t *mac_high_low_lp;
+static xfs_mac_label_t *mac_admin_high_lp;
+static xfs_mac_label_t *mac_equal_equal_lp;
+
+/*
+ * Test for the existence of a MAC label as efficiently as possible.
+ */
+int
+xfs_mac_vhaslabel(
+	vnode_t		*vp)
+{
+	int		error;
+	int		len = sizeof(xfs_mac_label_t);
+	int		flags = ATTR_KERNOVAL|ATTR_ROOT;
+
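+	/* ATTR_KERNOVAL asks only whether the attribute exists; no
+	 * value is copied back, which keeps this test cheap. */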
+	VOP_ATTR_GET(vp, SGI_MAC_FILE, NULL, &len, flags, sys_cred, error);
+	return (error == 0);
+}
+
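+/*
+ * MAC access check for an inode: use the label stored in its
+ * SGI_MAC_FILE attribute when one is present (and can be cached),
+ * otherwise fall back to the default high-low label.
+ */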
+int
+xfs_mac_iaccess(xfs_inode_t *ip, mode_t mode, struct cred *cr)
+{
+	xfs_mac_label_t mac;
+	xfs_mac_label_t *mp = mac_high_low_lp;
+
+	if (cr == NULL || sys_cred == NULL) {
+		return EACCES;
+	}
+
+	if (xfs_attr_fetch(ip, SGI_MAC_FILE, (char *)&mac, sizeof(mac)) == 0) {
+		if ((mp = mac_add_label(&mac)) == NULL) {
+			return mac_access(mac_high_low_lp, cr, mode);
+		}
+	}
+
+	return mac_access(mp, cr, mode);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_mac.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_mac.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_mac.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_mac.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_MAC_H__
+#define __XFS_MAC_H__
+
+/*
+ * Mandatory Access Control
+ *
+ * Layout of a composite MAC label:
+ * ml_list contains the list of categories (MSEN) followed by the list of
+ * divisions (MINT).  The declaration below is really a header: a label
+ * in use fills the first ml_catcount + ml_divcount entries of ml_list.
+ *
+ *	-------------------------------
+ *	| ml_msen_type | ml_mint_type |
+ *	-------------------------------
+ *	| ml_level     | ml_grade     |
+ *	-------------------------------
+ *	| ml_catcount		      |
+ *	-------------------------------
+ *	| ml_divcount		      |
+ *	-------------------------------
+ *	| category 1		      |
+ *	| . . .			      |
+ *	| category N		      | (where N = ml_catcount)
+ *	-------------------------------
+ *	| division 1		      |
+ *	| . . .			      |
+ *	| division M		      | (where M = ml_divcount)
+ *	-------------------------------
+ */
+#define XFS_MAC_MAX_SETS	250
+typedef struct xfs_mac_label {
+	__uint8_t	ml_msen_type;	/* MSEN label type */
+	__uint8_t	ml_mint_type;	/* MINT label type */
+	__uint8_t	ml_level;	/* Hierarchical level */
+	__uint8_t	ml_grade;	/* Hierarchical grade */
+	__uint16_t	ml_catcount;	/* Category count */
+	__uint16_t	ml_divcount;	/* Division count */
+					/* Category set, then Division set */
+	__uint16_t	ml_list[XFS_MAC_MAX_SETS];
+} xfs_mac_label_t;
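+
+/*
+ * Example (illustrative): a label with ml_catcount == 3 and
+ * ml_divcount == 2 keeps its categories in ml_list[0..2] and its
+ * divisions in ml_list[3..4]; the remaining slots are unused.
+ */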
+
+/* MSEN label type names. Choose an upper case ASCII character. */
+#define XFS_MSEN_ADMIN_LABEL	'A'	/* Admin: low<admin != tcsec<high */
+#define XFS_MSEN_EQUAL_LABEL	'E'	/* Wildcard - always equal */
+#define XFS_MSEN_HIGH_LABEL	'H'	/* System High - always dominates */
+#define XFS_MSEN_MLD_HIGH_LABEL 'I'	/* System High, multi-level dir */
+#define XFS_MSEN_LOW_LABEL	'L'	/* System Low - always dominated */
+#define XFS_MSEN_MLD_LABEL	'M'	/* TCSEC label on a multi-level dir */
+#define XFS_MSEN_MLD_LOW_LABEL	'N'	/* System Low, multi-level dir */
+#define XFS_MSEN_TCSEC_LABEL	'T'	/* TCSEC label */
+#define XFS_MSEN_UNKNOWN_LABEL	'U'	/* unknown label */
+
+/* MINT label type names. Choose a lower case ASCII character. */
+#define XFS_MINT_BIBA_LABEL	'b'	/* Dual of a TCSEC label */
+#define XFS_MINT_EQUAL_LABEL	'e'	/* Wildcard - always equal */
+#define XFS_MINT_HIGH_LABEL	'h'	/* High Grade - always dominates */
+#define XFS_MINT_LOW_LABEL	'l'	/* Low Grade - always dominated */
+
+/* On-disk XFS extended attribute names */
+#define SGI_MAC_FILE	"SGI_MAC_FILE"
+#define SGI_MAC_FILE_SIZE	(sizeof(SGI_MAC_FILE)-1)
+
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_FS_POSIX_MAC
+
+/* NOT YET IMPLEMENTED */
+
+#define MACEXEC		00100
+#define MACWRITE	00200
+#define MACREAD		00400
+
+struct xfs_inode;
+extern int  xfs_mac_iaccess(struct xfs_inode *, mode_t, cred_t *);
+
+#define _MAC_XFS_IACCESS(i,m,c) (xfs_mac_iaccess(i,m,c))
+#define _MAC_VACCESS(v,c,m)	(xfs_mac_vaccess(v,c,m))
+#define _MAC_EXISTS		xfs_mac_vhaslabel
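+
+/* Typical use (sketch; error and credp are whatever the caller has):
+ * testing the result lets the whole check compile away when
+ * CONFIG_FS_POSIX_MAC is off, e.g.
+ *
+ *	if ((error = _MAC_XFS_IACCESS(ip, MACWRITE, credp)))
+ *		return error;
+ */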
+
+#else
+#define _MAC_XFS_IACCESS(i,m,c) (0)
+#define _MAC_VACCESS(v,c,m)	(0)
+#define _MAC_EXISTS		(NULL)
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_MAC_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_macros.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_macros.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_macros.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_macros.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,2228 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#define XFS_MACRO_C
+
+#include <xfs.h>
+
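+/*
+ * Every function in this file is an out-of-line wrapper around the
+ * macro of the same upper-case name.  Building with XFS_WANT_FUNCS_C
+ * emits them all (useful for debugging); XFS_WANT_SPACE_C emits only
+ * those whose XFSSO_* symbol is selected, trading speed for space.
+ */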
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_ISNULLDSTARTBLOCK)
+int
+isnulldstartblock(xfs_dfsbno_t x)
+{
+	return ISNULLDSTARTBLOCK(x);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_ISNULLSTARTBLOCK)
+int
+isnullstartblock(xfs_fsblock_t x)
+{
+	return ISNULLSTARTBLOCK(x);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_NULLSTARTBLOCK)
+xfs_fsblock_t
+nullstartblock(int k)
+{
+	return NULLSTARTBLOCK(k);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_STARTBLOCKVAL)
+xfs_filblks_t
+startblockval(xfs_fsblock_t x)
+{
+	return STARTBLOCKVAL(x);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_CHECK_DADDR)
+void
+xfs_ag_check_daddr(xfs_mount_t *mp, xfs_daddr_t d, xfs_extlen_t len)
+{
+	XFS_AG_CHECK_DADDR(mp, d, len);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_DADDR)
+xfs_daddr_t
+xfs_ag_daddr(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_daddr_t d)
+{
+	return XFS_AG_DADDR(mp, agno, d);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_BEST_BLOCKS)
+xfs_extlen_t
+xfs_ag_best_blocks(int bl, xfs_drfsbno_t blks)
+{
+	return XFS_AG_BEST_BLOCKS(bl, blks);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_MAX_BLOCKS)
+xfs_extlen_t
+xfs_ag_max_blocks(int bl)
+{
+	return XFS_AG_MAX_BLOCKS(bl);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_MAXLEVELS)
+int
+xfs_ag_maxlevels(xfs_mount_t *mp)
+{
+	return XFS_AG_MAXLEVELS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_MIN_BLOCKS)
+xfs_extlen_t
+xfs_ag_min_blocks(int bl)
+{
+	return XFS_AG_MIN_BLOCKS(bl);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGB_TO_DADDR)
+xfs_daddr_t
+xfs_agb_to_daddr(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agblock_t agbno)
+{
+	return XFS_AGB_TO_DADDR(mp, agno, agbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGB_TO_FSB)
+xfs_fsblock_t
+xfs_agb_to_fsb(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agblock_t agbno)
+{
+	return XFS_AGB_TO_FSB(mp, agno, agbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGBLOCK_MAX)
+xfs_agblock_t
+xfs_agblock_max(xfs_agblock_t a, xfs_agblock_t b)
+{
+	return XFS_AGBLOCK_MAX(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGBLOCK_MIN)
+xfs_agblock_t
+xfs_agblock_min(xfs_agblock_t a, xfs_agblock_t b)
+{
+	return XFS_AGBLOCK_MIN(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGF_BLOCK)
+xfs_agblock_t
+xfs_agf_block(xfs_mount_t *mp)
+{
+	return XFS_AGF_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGF_GOOD_VERSION)
+int
+xfs_agf_good_version(unsigned v)
+{
+	return XFS_AGF_GOOD_VERSION(v);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGFL_BLOCK)
+xfs_agblock_t
+xfs_agfl_block(xfs_mount_t *mp)
+{
+	return XFS_AGFL_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGI_BLOCK)
+xfs_agblock_t
+xfs_agi_block(xfs_mount_t *mp)
+{
+	return XFS_AGI_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGI_GOOD_VERSION)
+int
+xfs_agi_good_version(unsigned v)
+{
+	return XFS_AGI_GOOD_VERSION(v);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_AGBNO)
+xfs_agblock_t
+xfs_agino_to_agbno(xfs_mount_t *mp, xfs_agino_t i)
+{
+	return XFS_AGINO_TO_AGBNO(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_INO)
+xfs_ino_t
+xfs_agino_to_ino(xfs_mount_t *mp, xfs_agnumber_t a, xfs_agino_t i)
+{
+	return XFS_AGINO_TO_INO(mp, a, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_OFFSET)
+int
+xfs_agino_to_offset(xfs_mount_t *mp, xfs_agino_t i)
+{
+	return XFS_AGINO_TO_OFFSET(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_MAXRECS)
+int
+xfs_alloc_block_maxrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_BLOCK_MAXRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_MINRECS)
+int
+xfs_alloc_block_minrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_BLOCK_MINRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_SIZE)
+/*ARGSUSED1*/
+int
+xfs_alloc_block_size(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_BLOCK_SIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_KEY_ADDR)
+/*ARGSUSED3*/
+xfs_alloc_key_t *
+xfs_alloc_key_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_KEY_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_PTR_ADDR)
+xfs_alloc_ptr_t *
+xfs_alloc_ptr_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_PTR_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_REC_ADDR)
+/*ARGSUSED3*/
+xfs_alloc_rec_t *
+xfs_alloc_rec_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_REC_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL)
+int
+xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+	return XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen, vlen);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX)
+int
+xfs_attr_leaf_entsize_local_max(int bsize)
+{
+	return XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_REMOTE)
+int
+xfs_attr_leaf_entsize_remote(int nlen)
+{
+	return XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME)
+char *
+xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+	return XFS_ATTR_LEAF_NAME(leafp, idx);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME_LOCAL)
+xfs_attr_leaf_name_local_t *
+xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+	return XFS_ATTR_LEAF_NAME_LOCAL(leafp, idx);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME_REMOTE)
+xfs_attr_leaf_name_remote_t *
+xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+	return XFS_ATTR_LEAF_NAME_REMOTE(leafp, idx);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_ENTSIZE)
+int
+xfs_attr_sf_entsize(xfs_attr_sf_entry_t *sfep)
+{
+	return XFS_ATTR_SF_ENTSIZE(sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_ENTSIZE_BYNAME)
+int
+xfs_attr_sf_entsize_byname(int nlen, int vlen)
+{
+	return XFS_ATTR_SF_ENTSIZE_BYNAME(nlen, vlen);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_NEXTENTRY)
+xfs_attr_sf_entry_t *
+xfs_attr_sf_nextentry(xfs_attr_sf_entry_t *sfep)
+{
+	return XFS_ATTR_SF_NEXTENTRY(sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_TOTSIZE)
+int
+xfs_attr_sf_totsize(xfs_inode_t *dp)
+{
+	return XFS_ATTR_SF_TOTSIZE(dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BHVTOI)
+xfs_inode_t *
+xfs_bhvtoi(bhv_desc_t *bhvp)
+{
+	return XFS_BHVTOI(bhvp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BHVTOM)
+xfs_mount_t *
+xfs_bhvtom(bhv_desc_t *bdp)
+{
+	return XFS_BHVTOM(bdp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BM_MAXLEVELS)
+int
+xfs_bm_maxlevels(xfs_mount_t *mp, int w)
+{
+	return XFS_BM_MAXLEVELS(mp, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DMAXRECS)
+int
+xfs_bmap_block_dmaxrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_DMAXRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DMINRECS)
+int
+xfs_bmap_block_dminrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_DMINRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DSIZE)
+int
+xfs_bmap_block_dsize(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_DSIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_IMAXRECS)
+int
+xfs_bmap_block_imaxrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_IMAXRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_IMINRECS)
+int
+xfs_bmap_block_iminrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_IMINRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_ISIZE)
+int
+xfs_bmap_block_isize(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_ISIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_KEY_ADDR)
+/*ARGSUSED3*/
+xfs_bmbt_key_t *
+xfs_bmap_broot_key_addr(xfs_bmbt_block_t *bb, int i, int sz)
+{
+	return XFS_BMAP_BROOT_KEY_ADDR(bb, i, sz);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_MAXRECS)
+int
+xfs_bmap_broot_maxrecs(int sz)
+{
+	return XFS_BMAP_BROOT_MAXRECS(sz);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_NUMRECS)
+int
+xfs_bmap_broot_numrecs(xfs_bmdr_block_t *bb)
+{
+	return XFS_BMAP_BROOT_NUMRECS(bb);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_PTR_ADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_broot_ptr_addr(xfs_bmbt_block_t *bb, int i, int sz)
+{
+	return XFS_BMAP_BROOT_PTR_ADDR(bb, i, sz);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_REC_ADDR)
+/*ARGSUSED3*/
+xfs_bmbt_rec_t *
+xfs_bmap_broot_rec_addr(xfs_bmbt_block_t *bb, int i, int sz)
+{
+	return XFS_BMAP_BROOT_REC_ADDR(bb, i, sz);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_SPACE)
+int
+xfs_bmap_broot_space(xfs_bmdr_block_t *bb)
+{
+	return XFS_BMAP_BROOT_SPACE(bb);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_SPACE_CALC)
+int
+xfs_bmap_broot_space_calc(int nrecs)
+{
+	return XFS_BMAP_BROOT_SPACE_CALC(nrecs);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_IBLOCK_SIZE)
+/*ARGSUSED1*/
+int
+xfs_bmap_iblock_size(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_IBLOCK_SIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_INIT)
+void
+xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
+{
+	XFS_BMAP_INIT(flp, fbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_KEY_DADDR)
+/*ARGSUSED3*/
+xfs_bmbt_key_t *
+xfs_bmap_key_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_KEY_DADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_KEY_IADDR)
+/*ARGSUSED3*/
+xfs_bmbt_key_t *
+xfs_bmap_key_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_KEY_IADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_PTR_DADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_ptr_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_PTR_DADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_PTR_IADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_ptr_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_PTR_IADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_RBLOCK_DSIZE)
+/*ARGSUSED1*/
+int
+xfs_bmap_rblock_dsize(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_RBLOCK_DSIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_RBLOCK_ISIZE)
+/*ARGSUSED1*/
+int
+xfs_bmap_rblock_isize(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_RBLOCK_ISIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_REC_DADDR)
+/*ARGSUSED3*/
+xfs_bmbt_rec_t *
+xfs_bmap_rec_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_REC_DADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_REC_IADDR)
+/*ARGSUSED3*/
+xfs_bmbt_rec_t *
+xfs_bmap_rec_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_REC_IADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_SANITY_CHECK)
+int
+xfs_bmap_sanity_check(xfs_mount_t *mp, xfs_bmbt_block_t *bb, int level)
+{
+	return XFS_BMAP_SANITY_CHECK(mp, bb, level);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAPI_AFLAG)
+int
+xfs_bmapi_aflag(int w)
+{
+	return XFS_BMAPI_AFLAG(w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMDR_SPACE_CALC)
+int
+xfs_bmdr_space_calc(int nrecs)
+{
+	return XFS_BMDR_SPACE_CALC(nrecs);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BNO_BLOCK)
+xfs_agblock_t
+xfs_bno_block(xfs_mount_t *mp)
+{
+	return XFS_BNO_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BTREE_LONG_PTRS)
+int
+xfs_btree_long_ptrs(xfs_btnum_t btnum)
+{
+	return XFS_BTREE_LONG_PTRS(btnum);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGF)
+xfs_agf_t *
+xfs_buf_to_agf(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_AGF(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGFL)
+xfs_agfl_t *
+xfs_buf_to_agfl(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_AGFL(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGI)
+xfs_agi_t *
+xfs_buf_to_agi(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_AGI(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_ALLOC_BLOCK)
+xfs_alloc_block_t *
+xfs_buf_to_alloc_block(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_ALLOC_BLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_BLOCK)
+xfs_btree_block_t *
+xfs_buf_to_block(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_BLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_BMBT_BLOCK)
+xfs_bmbt_block_t *
+xfs_buf_to_bmbt_block(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_BMBT_BLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_DINODE)
+xfs_dinode_t *
+xfs_buf_to_dinode(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_DINODE(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_INOBT_BLOCK)
+xfs_inobt_block_t *
+xfs_buf_to_inobt_block(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_INOBT_BLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_LBLOCK)
+xfs_btree_lblock_t *
+xfs_buf_to_lblock(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_LBLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_SBLOCK)
+xfs_btree_sblock_t *
+xfs_buf_to_sblock(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_SBLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_SBP)
+xfs_sb_t *
+xfs_buf_to_sbp(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_SBP(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_ASIZE)
+int
+xfs_cfork_asize_arch(xfs_dinode_core_t *dcp, xfs_mount_t *mp, xfs_arch_t arch)
+{
+	return XFS_CFORK_ASIZE_ARCH(dcp, mp, arch);
+}
+int
+xfs_cfork_asize(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
+{
+	return XFS_CFORK_ASIZE(dcp, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_BOFF)
+int
+xfs_cfork_boff_arch(xfs_dinode_core_t *dcp, xfs_arch_t arch)
+{
+	return XFS_CFORK_BOFF_ARCH(dcp, arch);
+}
+int
+xfs_cfork_boff(xfs_dinode_core_t *dcp)
+{
+	return XFS_CFORK_BOFF(dcp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_DSIZE)
+int
+xfs_cfork_dsize_arch(xfs_dinode_core_t *dcp, xfs_mount_t *mp, xfs_arch_t arch)
+{
+	return XFS_CFORK_DSIZE_ARCH(dcp, mp, arch);
+}
+int
+xfs_cfork_dsize(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
+{
+	return XFS_CFORK_DSIZE(dcp, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_FMT_SET)
+void
+xfs_cfork_fmt_set_arch(xfs_dinode_core_t *dcp, int w, int n, xfs_arch_t arch)
+{
+	XFS_CFORK_FMT_SET_ARCH(dcp, w, n, arch);
+}
+void
+xfs_cfork_fmt_set(xfs_dinode_core_t *dcp, int w, int n)
+{
+	XFS_CFORK_FMT_SET(dcp, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_FORMAT)
+int
+xfs_cfork_format_arch(xfs_dinode_core_t *dcp, int w, xfs_arch_t arch)
+{
+	return XFS_CFORK_FORMAT_ARCH(dcp, w, arch);
+}
+int
+xfs_cfork_format(xfs_dinode_core_t *dcp, int w)
+{
+	return XFS_CFORK_FORMAT(dcp, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_NEXT_SET)
+void
+xfs_cfork_next_set_arch(xfs_dinode_core_t *dcp, int w, int n, xfs_arch_t arch)
+{
+	XFS_CFORK_NEXT_SET_ARCH(dcp, w, n, arch);
+}
+void
+xfs_cfork_next_set(xfs_dinode_core_t *dcp, int w, int n)
+{
+	XFS_CFORK_NEXT_SET(dcp, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_NEXTENTS)
+int
+xfs_cfork_nextents_arch(xfs_dinode_core_t *dcp, int w, xfs_arch_t arch)
+{
+	return XFS_CFORK_NEXTENTS_ARCH(dcp, w, arch);
+}
+int
+xfs_cfork_nextents(xfs_dinode_core_t *dcp, int w)
+{
+	return XFS_CFORK_NEXTENTS(dcp, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_Q)
+int
+xfs_cfork_q_arch(xfs_dinode_core_t *dcp, xfs_arch_t arch)
+{
+	return XFS_CFORK_Q_ARCH(dcp, arch);
+}
+int
+xfs_cfork_q(xfs_dinode_core_t *dcp)
+{
+	return XFS_CFORK_Q(dcp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_SIZE)
+int
+xfs_cfork_size_arch(xfs_dinode_core_t *dcp, xfs_mount_t *mp, int w, xfs_arch_t arch)
+{
+	return XFS_CFORK_SIZE_ARCH(dcp, mp, w, arch);
+}
+int
+xfs_cfork_size(xfs_dinode_core_t *dcp, xfs_mount_t *mp, int w)
+{
+	return XFS_CFORK_SIZE(dcp, mp, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CNT_BLOCK)
+xfs_agblock_t
+xfs_cnt_block(xfs_mount_t *mp)
+{
+	return XFS_CNT_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_BNO)
+xfs_dablk_t
+xfs_da_cookie_bno(xfs_mount_t *mp, xfs_off_t cookie)
+{
+	return XFS_DA_COOKIE_BNO(mp, cookie);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_ENTRY)
+int
+xfs_da_cookie_entry(xfs_mount_t *mp, xfs_off_t cookie)
+{
+	return XFS_DA_COOKIE_ENTRY(mp, cookie);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_HASH)
+/*ARGSUSED1*/
+xfs_dahash_t
+xfs_da_cookie_hash(xfs_mount_t *mp, xfs_off_t cookie)
+{
+	return XFS_DA_COOKIE_HASH(mp, cookie);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_MAKE_BNOENTRY)
+__uint32_t
+xfs_da_make_bnoentry(xfs_mount_t *mp, xfs_dablk_t bno, int entry)
+{
+	return XFS_DA_MAKE_BNOENTRY(mp, bno, entry);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_MAKE_COOKIE)
+xfs_off_t
+xfs_da_make_cookie(xfs_mount_t *mp, xfs_dablk_t bno, int entry,
+		   xfs_dahash_t hash)
+{
+	return XFS_DA_MAKE_COOKIE(mp, bno, entry, hash);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_NODE_ENTRIES)
+int
+xfs_da_node_entries(xfs_mount_t *mp)
+{
+	return XFS_DA_NODE_ENTRIES(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_AGBNO)
+xfs_agblock_t
+xfs_daddr_to_agbno(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	return XFS_DADDR_TO_AGBNO(mp, d);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_AGNO)
+xfs_agnumber_t
+xfs_daddr_to_agno(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	return XFS_DADDR_TO_AGNO(mp, d);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_FSB)
+xfs_fsblock_t
+xfs_daddr_to_fsb(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	return XFS_DADDR_TO_FSB(mp, d);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_APTR)
+char *
+xfs_dfork_aptr_arch(xfs_dinode_t *dip, xfs_arch_t arch)
+{
+	return XFS_DFORK_APTR_ARCH(dip, arch);
+}
+char *
+xfs_dfork_aptr(xfs_dinode_t *dip)
+{
+	return XFS_DFORK_APTR(dip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_ASIZE)
+int
+xfs_dfork_asize_arch(xfs_dinode_t *dip, xfs_mount_t *mp, xfs_arch_t arch)
+{
+	return XFS_DFORK_ASIZE_ARCH(dip, mp, arch);
+}
+int
+xfs_dfork_asize(xfs_dinode_t *dip, xfs_mount_t *mp)
+{
+	return XFS_DFORK_ASIZE(dip, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_BOFF)
+int
+xfs_dfork_boff_arch(xfs_dinode_t *dip, xfs_arch_t arch)
+{
+	return XFS_DFORK_BOFF_ARCH(dip, arch);
+}
+int
+xfs_dfork_boff(xfs_dinode_t *dip)
+{
+	return XFS_DFORK_BOFF(dip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_DPTR)
+char *
+xfs_dfork_dptr_arch(xfs_dinode_t *dip, xfs_arch_t arch)
+{
+	return XFS_DFORK_DPTR_ARCH(dip, arch);
+}
+char *
+xfs_dfork_dptr(xfs_dinode_t *dip)
+{
+	return XFS_DFORK_DPTR(dip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_DSIZE)
+int
+xfs_dfork_dsize_arch(xfs_dinode_t *dip, xfs_mount_t *mp, xfs_arch_t arch)
+{
+	return XFS_DFORK_DSIZE_ARCH(dip, mp, arch);
+}
+int
+xfs_dfork_dsize(xfs_dinode_t *dip, xfs_mount_t *mp)
+{
+	return XFS_DFORK_DSIZE(dip, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_FMT_SET)
+void
+xfs_dfork_fmt_set_arch(xfs_dinode_t *dip, int w, int n, xfs_arch_t arch)
+{
+	XFS_DFORK_FMT_SET_ARCH(dip, w, n, arch);
+}
+void
+xfs_dfork_fmt_set(xfs_dinode_t *dip, int w, int n)
+{
+	XFS_DFORK_FMT_SET(dip, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_FORMAT)
+int
+xfs_dfork_format_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch)
+{
+	return XFS_DFORK_FORMAT_ARCH(dip, w, arch);
+}
+int
+xfs_dfork_format(xfs_dinode_t *dip, int w)
+{
+	return XFS_DFORK_FORMAT(dip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_NEXT_SET)
+void
+xfs_dfork_next_set_arch(xfs_dinode_t *dip, int w, int n, xfs_arch_t arch)
+{
+	XFS_DFORK_NEXT_SET_ARCH(dip, w, n, arch);
+}
+void
+xfs_dfork_next_set(xfs_dinode_t *dip, int w, int n)
+{
+	XFS_DFORK_NEXT_SET(dip, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_NEXTENTS)
+int
+xfs_dfork_nextents_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch)
+{
+	return XFS_DFORK_NEXTENTS_ARCH(dip, w, arch);
+}
+int
+xfs_dfork_nextents(xfs_dinode_t *dip, int w)
+{
+	return XFS_DFORK_NEXTENTS(dip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_PTR)
+char *
+xfs_dfork_ptr_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch)
+{
+	return XFS_DFORK_PTR_ARCH(dip, w, arch);
+}
+char *
+xfs_dfork_ptr(xfs_dinode_t *dip, int w)
+{
+	return XFS_DFORK_PTR(dip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_Q)
+int
+xfs_dfork_q_arch(xfs_dinode_t *dip, xfs_arch_t arch)
+{
+	return XFS_DFORK_Q_ARCH(dip, arch);
+}
+int
+xfs_dfork_q(xfs_dinode_t *dip)
+{
+	return XFS_DFORK_Q(dip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_SIZE)
+int
+xfs_dfork_size_arch(xfs_dinode_t *dip, xfs_mount_t *mp, int w, xfs_arch_t arch)
+{
+	return XFS_DFORK_SIZE_ARCH(dip, mp, w, arch);
+}
+int
+xfs_dfork_size(xfs_dinode_t *dip, xfs_mount_t *mp, int w)
+{
+	return XFS_DFORK_SIZE(dip, mp, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DINODE_GOOD_VERSION)
+int
+xfs_dinode_good_version(int v)
+{
+	return XFS_DINODE_GOOD_VERSION(v);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYENTRY)
+int
+xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry)
+{
+	return XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYNAME)
+int
+xfs_dir_leaf_entsize_byname(int len)
+{
+	return XFS_DIR_LEAF_ENTSIZE_BYNAME(len);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_NAMESTRUCT)
+xfs_dir_leaf_name_t *
+xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset)
+{
+	return XFS_DIR_LEAF_NAMESTRUCT(leafp, offset);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ALLFIT)
+int
+xfs_dir_sf_allfit(int count, int totallen)
+{
+	return XFS_DIR_SF_ALLFIT(count, totallen);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ENTSIZE_BYENTRY)
+int
+xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep)
+{
+	return XFS_DIR_SF_ENTSIZE_BYENTRY(sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ENTSIZE_BYNAME)
+int
+xfs_dir_sf_entsize_byname(int len)
+{
+	return XFS_DIR_SF_ENTSIZE_BYNAME(len);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_GET_DIRINO)
+void
+xfs_dir_sf_get_dirino_arch(xfs_dir_ino_t *from, xfs_ino_t *to, xfs_arch_t arch)
+{
+	XFS_DIR_SF_GET_DIRINO_ARCH(from, to, arch);
+}
+void
+xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to)
+{
+	XFS_DIR_SF_GET_DIRINO(from, to);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_NEXTENTRY)
+xfs_dir_sf_entry_t *
+xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep)
+{
+	return XFS_DIR_SF_NEXTENTRY(sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_PUT_DIRINO)
+void
+xfs_dir_sf_put_dirino_arch(xfs_ino_t *from, xfs_dir_ino_t *to, xfs_arch_t arch)
+{
+	XFS_DIR_SF_PUT_DIRINO_ARCH(from, to, arch);
+}
+void
+xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to)
+{
+	XFS_DIR_SF_PUT_DIRINO(from, to);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BLOCK_LEAF_P)
+xfs_dir2_leaf_entry_t *
+xfs_dir2_block_leaf_p_arch(xfs_dir2_block_tail_t *btp, xfs_arch_t arch)
+{
+	return XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BLOCK_TAIL_P)
+xfs_dir2_block_tail_t *
+xfs_dir2_block_tail_p(xfs_mount_t *mp, xfs_dir2_block_t *block)
+{
+	return XFS_DIR2_BLOCK_TAIL_P(mp, block);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DA)
+xfs_dablk_t
+xfs_dir2_byte_to_da(xfs_mount_t *mp, xfs_dir2_off_t by)
+{
+	return XFS_DIR2_BYTE_TO_DA(mp, by);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DATAPTR)
+/* ARGSUSED */
+xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_mount_t *mp, xfs_dir2_off_t by)
+{
+	return XFS_DIR2_BYTE_TO_DATAPTR(mp, by);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DB)
+xfs_dir2_db_t
+xfs_dir2_byte_to_db(xfs_mount_t *mp, xfs_dir2_off_t by)
+{
+	return XFS_DIR2_BYTE_TO_DB(mp, by);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_OFF)
+xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(xfs_mount_t *mp, xfs_dir2_off_t by)
+{
+	return XFS_DIR2_BYTE_TO_OFF(mp, by);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DA_TO_BYTE)
+xfs_dir2_off_t
+xfs_dir2_da_to_byte(xfs_mount_t *mp, xfs_dablk_t da)
+{
+	return XFS_DIR2_DA_TO_BYTE(mp, da);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DA_TO_DB)
+xfs_dir2_db_t
+xfs_dir2_da_to_db(xfs_mount_t *mp, xfs_dablk_t da)
+{
+	return XFS_DIR2_DA_TO_DB(mp, da);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_ENTRY_TAG_P)
+xfs_dir2_data_off_t *
+xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep)
+{
+	return XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_ENTSIZE)
+int
+xfs_dir2_data_entsize(int n)
+{
+	return XFS_DIR2_DATA_ENTSIZE(n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_UNUSED_TAG_P)
+xfs_dir2_data_off_t *
+xfs_dir2_data_unused_tag_p_arch(xfs_dir2_data_unused_t *dup, xfs_arch_t arch)
+{
+	return XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_BYTE)
+/* ARGSUSED */
+xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
+{
+	return XFS_DIR2_DATAPTR_TO_BYTE(mp, dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_DB)
+xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
+{
+	return XFS_DIR2_DATAPTR_TO_DB(mp, dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_OFF)
+xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
+{
+	return XFS_DIR2_DATAPTR_TO_OFF(mp, dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_OFF_TO_BYTE)
+xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(xfs_mount_t *mp, xfs_dir2_db_t db,
+			xfs_dir2_data_aoff_t o)
+{
+	return XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_OFF_TO_DATAPTR)
+xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(xfs_mount_t *mp, xfs_dir2_db_t db,
+			   xfs_dir2_data_aoff_t o)
+{
+	return XFS_DIR2_DB_OFF_TO_DATAPTR(mp, db, o);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_DA)
+xfs_dablk_t
+xfs_dir2_db_to_da(xfs_mount_t *mp, xfs_dir2_db_t db)
+{
+	return XFS_DIR2_DB_TO_DA(mp, db);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_FDB)
+xfs_dir2_db_t
+xfs_dir2_db_to_fdb(xfs_mount_t *mp, xfs_dir2_db_t db)
+{
+	return XFS_DIR2_DB_TO_FDB(mp, db);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_FDINDEX)
+int
+xfs_dir2_db_to_fdindex(xfs_mount_t *mp, xfs_dir2_db_t db)
+{
+	return XFS_DIR2_DB_TO_FDINDEX(mp, db);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_LEAF_BESTS_P)
+xfs_dir2_data_off_t *
+xfs_dir2_leaf_bests_p_arch(xfs_dir2_leaf_tail_t *ltp, xfs_arch_t arch)
+{
+	return XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_LEAF_TAIL_P)
+xfs_dir2_leaf_tail_t *
+xfs_dir2_leaf_tail_p(xfs_mount_t *mp, xfs_dir2_leaf_t *lp)
+{
+	return XFS_DIR2_LEAF_TAIL_P(mp, lp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_MAX_LEAF_ENTS)
+int
+xfs_dir2_max_leaf_ents(xfs_mount_t *mp)
+{
+	return XFS_DIR2_MAX_LEAF_ENTS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_ENTSIZE_BYENTRY)
+int
+xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
+{
+	return XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp, sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_FIRSTENTRY)
+xfs_dir2_sf_entry_t *
+xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp)
+{
+	return XFS_DIR2_SF_FIRSTENTRY(sfp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_ENTSIZE_BYNAME)
+int
+xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len)
+{
+	return XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, len);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_GET_INUMBER)
+xfs_intino_t
+xfs_dir2_sf_get_inumber_arch(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from, xfs_arch_t arch)
+{
+	return XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, from, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_GET_OFFSET)
+xfs_dir2_data_aoff_t
+xfs_dir2_sf_get_offset_arch(xfs_dir2_sf_entry_t *sfep, xfs_arch_t arch)
+{
+	return XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_HDR_SIZE)
+int
+xfs_dir2_sf_hdr_size(int i8count)
+{
+	return XFS_DIR2_SF_HDR_SIZE(i8count);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_INUMBERP)
+xfs_dir2_inou_t *
+xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep)
+{
+	return XFS_DIR2_SF_INUMBERP(sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_NEXTENTRY)
+xfs_dir2_sf_entry_t *
+xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
+{
+	return XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_PUT_INUMBER)
+void
+xfs_dir2_sf_put_inumber_arch(xfs_dir2_sf_t *sfp, xfs_ino_t *from, xfs_dir2_inou_t *to, xfs_arch_t arch)
+{
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, from, to, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_PUT_OFFSET)
+void
+xfs_dir2_sf_put_offset_arch(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off, xfs_arch_t arch)
+{
+	XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep, off, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTFMT_INODE)
+xfs_exntfmt_t
+xfs_extfmt_inode(struct xfs_inode *ip)
+{
+	return XFS_EXTFMT_INODE(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTLEN_MAX)
+xfs_extlen_t
+xfs_extlen_max(xfs_extlen_t a, xfs_extlen_t b)
+{
+	return XFS_EXTLEN_MAX(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTLEN_MIN)
+xfs_extlen_t
+xfs_extlen_min(xfs_extlen_t a, xfs_extlen_t b)
+{
+	return XFS_EXTLEN_MIN(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILBLKS_MAX)
+xfs_filblks_t
+xfs_filblks_max(xfs_filblks_t a, xfs_filblks_t b)
+{
+	return XFS_FILBLKS_MAX(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILBLKS_MIN)
+xfs_filblks_t
+xfs_filblks_min(xfs_filblks_t a, xfs_filblks_t b)
+{
+	return XFS_FILBLKS_MIN(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILEOFF_MAX)
+xfs_fileoff_t
+xfs_fileoff_max(xfs_fileoff_t a, xfs_fileoff_t b)
+{
+	return XFS_FILEOFF_MAX(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILEOFF_MIN)
+xfs_fileoff_t
+xfs_fileoff_min(xfs_fileoff_t a, xfs_fileoff_t b)
+{
+	return XFS_FILEOFF_MIN(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_SANITY_CHECK)
+int
+xfs_fsb_sanity_check(xfs_mount_t *mp, xfs_fsblock_t fsbno)
+{
+	return XFS_FSB_SANITY_CHECK(mp, fsbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_AGBNO)
+xfs_agblock_t
+xfs_fsb_to_agbno(xfs_mount_t *mp, xfs_fsblock_t fsbno)
+{
+	return XFS_FSB_TO_AGBNO(mp, fsbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_AGNO)
+xfs_agnumber_t
+xfs_fsb_to_agno(xfs_mount_t *mp, xfs_fsblock_t fsbno)
+{
+	return XFS_FSB_TO_AGNO(mp, fsbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_DADDR)
+xfs_daddr_t
+xfs_fsb_to_daddr(xfs_mount_t *mp, xfs_fsblock_t fsbno)
+{
+	return XFS_FSB_TO_DADDR(mp, fsbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_DB)
+xfs_daddr_t
+xfs_fsb_to_db(xfs_inode_t *ip, xfs_fsblock_t fsb)
+{
+	return XFS_FSB_TO_DB(ip, fsb);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_HDR_BLOCK)
+xfs_agblock_t
+xfs_hdr_block(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	return XFS_HDR_BLOCK(mp, d);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_BLOCKS)
+xfs_extlen_t
+xfs_ialloc_blocks(xfs_mount_t *mp)
+{
+	return XFS_IALLOC_BLOCKS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_FIND_FREE)
+int
+xfs_ialloc_find_free(xfs_inofree_t *fp)
+{
+	return XFS_IALLOC_FIND_FREE(fp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_INODES)
+int
+xfs_ialloc_inodes(xfs_mount_t *mp)
+{
+	return XFS_IALLOC_INODES(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IBT_BLOCK)
+xfs_agblock_t
+xfs_ibt_block(xfs_mount_t *mp)
+{
+	return XFS_IBT_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_ASIZE)
+int
+xfs_ifork_asize(xfs_inode_t *ip)
+{
+	return XFS_IFORK_ASIZE(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_DSIZE)
+int
+xfs_ifork_dsize(xfs_inode_t *ip)
+{
+	return XFS_IFORK_DSIZE(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_FMT_SET)
+void
+xfs_ifork_fmt_set(xfs_inode_t *ip, int w, int n)
+{
+	XFS_IFORK_FMT_SET(ip, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_FORMAT)
+int
+xfs_ifork_format(xfs_inode_t *ip, int w)
+{
+	return XFS_IFORK_FORMAT(ip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_NEXT_SET)
+void
+xfs_ifork_next_set(xfs_inode_t *ip, int w, int n)
+{
+	XFS_IFORK_NEXT_SET(ip, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_NEXTENTS)
+int
+xfs_ifork_nextents(xfs_inode_t *ip, int w)
+{
+	return XFS_IFORK_NEXTENTS(ip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_PTR)
+xfs_ifork_t *
+xfs_ifork_ptr(xfs_inode_t *ip, int w)
+{
+	return XFS_IFORK_PTR(ip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_Q)
+int
+xfs_ifork_q(xfs_inode_t *ip)
+{
+	return XFS_IFORK_Q(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_SIZE)
+int
+xfs_ifork_size(xfs_inode_t *ip, int w)
+{
+	return XFS_IFORK_SIZE(ip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FBROOT)
+int
+xfs_ilog_fbroot(int w)
+{
+	return XFS_ILOG_FBROOT(w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FDATA)
+int
+xfs_ilog_fdata(int w)
+{
+	return XFS_ILOG_FDATA(w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FEXT)
+int
+xfs_ilog_fext(int w)
+{
+	return XFS_ILOG_FEXT(w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IN_MAXLEVELS)
+int
+xfs_in_maxlevels(xfs_mount_t *mp)
+{
+	return XFS_IN_MAXLEVELS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGBNO_BITS)
+int
+xfs_ino_agbno_bits(xfs_mount_t *mp)
+{
+	return XFS_INO_AGBNO_BITS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGINO_BITS)
+int
+xfs_ino_agino_bits(xfs_mount_t *mp)
+{
+	return XFS_INO_AGINO_BITS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGNO_BITS)
+int
+xfs_ino_agno_bits(xfs_mount_t *mp)
+{
+	return XFS_INO_AGNO_BITS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_BITS)
+int
+xfs_ino_bits(xfs_mount_t *mp)
+{
+	return XFS_INO_BITS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_MASK)
+__uint32_t
+xfs_ino_mask(int k)
+{
+	return XFS_INO_MASK(k);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_OFFSET_BITS)
+int
+xfs_ino_offset_bits(xfs_mount_t *mp)
+{
+	return XFS_INO_OFFSET_BITS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGBNO)
+xfs_agblock_t
+xfs_ino_to_agbno(xfs_mount_t *mp, xfs_ino_t i)
+{
+	return XFS_INO_TO_AGBNO(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGINO)
+xfs_agino_t
+xfs_ino_to_agino(xfs_mount_t *mp, xfs_ino_t i)
+{
+	return XFS_INO_TO_AGINO(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGNO)
+xfs_agnumber_t
+xfs_ino_to_agno(xfs_mount_t *mp, xfs_ino_t i)
+{
+	return XFS_INO_TO_AGNO(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_FSB)
+xfs_fsblock_t
+xfs_ino_to_fsb(xfs_mount_t *mp, xfs_ino_t i)
+{
+	return XFS_INO_TO_FSB(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_OFFSET)
+int
+xfs_ino_to_offset(xfs_mount_t *mp, xfs_ino_t i)
+{
+	return XFS_INO_TO_OFFSET(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_MAXRECS)
+int
+xfs_inobt_block_maxrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_BLOCK_MAXRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_MINRECS)
+int
+xfs_inobt_block_minrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_BLOCK_MINRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_SIZE)
+/*ARGSUSED1*/
+int
+xfs_inobt_block_size(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_BLOCK_SIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_CLR_FREE)
+void
+xfs_inobt_clr_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch)
+{
+	XFS_INOBT_CLR_FREE(rp, i, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_IS_FREE)
+int
+xfs_inobt_is_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch)
+{
+	return XFS_INOBT_IS_FREE(rp, i, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_IS_LAST_REC)
+int
+xfs_inobt_is_last_rec(xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_IS_LAST_REC(cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_KEY_ADDR)
+/*ARGSUSED3*/
+xfs_inobt_key_t *
+xfs_inobt_key_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_KEY_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_MASK)
+xfs_inofree_t
+xfs_inobt_mask(int i)
+{
+	return XFS_INOBT_MASK(i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_MASKN)
+xfs_inofree_t
+xfs_inobt_maskn(int i, int n)
+{
+	return XFS_INOBT_MASKN(i, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_PTR_ADDR)
+xfs_inobt_ptr_t *
+xfs_inobt_ptr_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_PTR_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_REC_ADDR)
+/*ARGSUSED3*/
+xfs_inobt_rec_t *
+xfs_inobt_rec_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_REC_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_SET_FREE)
+void
+xfs_inobt_set_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch)
+{
+	XFS_INOBT_SET_FREE(rp, i, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ITOBHV)
+bhv_desc_t *
+xfs_itobhv(xfs_inode_t *ip)
+{
+	return XFS_ITOBHV(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ITOV)
+vnode_t *
+xfs_itov(xfs_inode_t *ip)
+{
+	return XFS_ITOV(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LBLOG)
+int
+xfs_lblog(xfs_mount_t *mp)
+{
+	return XFS_LBLOG(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LBSIZE)
+int
+xfs_lbsize(xfs_mount_t *mp)
+{
+	return XFS_LBSIZE(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ALL_FREE)
+void
+xfs_lic_all_free(xfs_log_item_chunk_t *cp)
+{
+	XFS_LIC_ALL_FREE(cp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ARE_ALL_FREE)
+int
+xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
+{
+	return XFS_LIC_ARE_ALL_FREE(cp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_CLAIM)
+void
+xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
+{
+	XFS_LIC_CLAIM(cp, slot);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_DESC_TO_CHUNK)
+xfs_log_item_chunk_t *
+xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
+{
+	return XFS_LIC_DESC_TO_CHUNK(dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_DESC_TO_SLOT)
+int
+xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
+{
+	return XFS_LIC_DESC_TO_SLOT(dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_INIT)
+void
+xfs_lic_init(xfs_log_item_chunk_t *cp)
+{
+	XFS_LIC_INIT(cp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_INIT_SLOT)
+void
+xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
+{
+	XFS_LIC_INIT_SLOT(cp, slot);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ISFREE)
+int
+xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
+{
+	return XFS_LIC_ISFREE(cp, slot);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_RELSE)
+void
+xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
+{
+	XFS_LIC_RELSE(cp, slot);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_SLOT)
+xfs_log_item_desc_t *
+xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
+{
+	return XFS_LIC_SLOT(cp, slot);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_VACANCY)
+int
+xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
+{
+	return XFS_LIC_VACANCY(cp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LITINO)
+int
+xfs_litino(xfs_mount_t *mp)
+{
+	return XFS_LITINO(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MAKE_IPTR)
+xfs_dinode_t *
+xfs_make_iptr(xfs_mount_t *mp, xfs_buf_t *b, int o)
+{
+	return XFS_MAKE_IPTR(mp, b, o);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK32HI)
+__uint32_t
+xfs_mask32hi(int n)
+{
+	return XFS_MASK32HI(n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK32LO)
+__uint32_t
+xfs_mask32lo(int n)
+{
+	return XFS_MASK32LO(n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK64HI)
+__uint64_t
+xfs_mask64hi(int n)
+{
+	return XFS_MASK64HI(n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK64LO)
+__uint64_t
+xfs_mask64lo(int n)
+{
+	return XFS_MASK64LO(n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST)
+int
+xfs_min_freelist(xfs_agf_t *a, xfs_mount_t *mp)
+{
+	return XFS_MIN_FREELIST(a, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST_PAG)
+int
+xfs_min_freelist_pag(xfs_perag_t *pag, xfs_mount_t *mp)
+{
+	return XFS_MIN_FREELIST_PAG(pag, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST_RAW)
+int
+xfs_min_freelist_raw(uint bl, uint cl, xfs_mount_t *mp)
+{
+	return XFS_MIN_FREELIST_RAW(bl, cl, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MTOVFS)
+vfs_t *
+xfs_mtovfs(xfs_mount_t *mp)
+{
+	return XFS_MTOVFS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_OFFBNO_TO_AGINO)
+xfs_agino_t
+xfs_offbno_to_agino(xfs_mount_t *mp, xfs_agblock_t b, int o)
+{
+	return XFS_OFFBNO_TO_AGINO(mp, b, o);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_PREALLOC_BLOCKS)
+xfs_agblock_t
+xfs_prealloc_blocks(xfs_mount_t *mp)
+{
+	return XFS_PREALLOC_BLOCKS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_BLOCK)
+xfs_agblock_t
+xfs_sb_block(xfs_mount_t *mp)
+{
+	return XFS_SB_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_GOOD_VERSION)
+int
+xfs_sb_good_version(xfs_sb_t *sbp)
+{
+	return XFS_SB_GOOD_VERSION(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDATTR)
+void
+xfs_sb_version_addattr(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_ADDATTR(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDDALIGN)
+void
+xfs_sb_version_adddalign(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_ADDDALIGN(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDNLINK)
+void
+xfs_sb_version_addnlink(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_ADDNLINK(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDQUOTA)
+void
+xfs_sb_version_addquota(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_ADDQUOTA(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDSHARED)
+void
+xfs_sb_version_addshared(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_ADDSHARED(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASALIGN)
+int
+xfs_sb_version_hasalign(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASALIGN(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASATTR)
+int
+xfs_sb_version_hasattr(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASATTR(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASDALIGN)
+int
+xfs_sb_version_hasdalign(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASDALIGN(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASDIRV2)
+int
+xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASDIRV2(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASEXTFLGBIT)
+int
+xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASEXTFLGBIT(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASNLINK)
+int
+xfs_sb_version_hasnlink(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASNLINK(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASQUOTA)
+int
+xfs_sb_version_hasquota(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASQUOTA(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASSHARED)
+int
+xfs_sb_version_hasshared(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASSHARED(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_NUM)
+int
+xfs_sb_version_num(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_SUBALIGN)
+void
+xfs_sb_version_subalign(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_SUBALIGN(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_SUBSHARED)
+void
+xfs_sb_version_subshared(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_SUBSHARED(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASLOGV2)
+int
+xfs_sb_version_haslogv2(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASLOGV2(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_TONEW)
+unsigned
+xfs_sb_version_tonew(unsigned v)
+{
+	return XFS_SB_VERSION_TONEW(v);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_TOOLD)
+unsigned
+xfs_sb_version_toold(unsigned v)
+{
+	return XFS_SB_VERSION_TOOLD(v);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XLOG_BTOLRBB)
+int
+xlog_btolrbb(int b)
+{
+	return XLOG_BTOLRBB(b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XLOG_GRANT_ADD_SPACE)
+void
+xlog_grant_add_space(xlog_t *log, int bytes, int type)
+{
+	XLOG_GRANT_ADD_SPACE(log, bytes, type);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XLOG_GRANT_SUB_SPACE)
+void
+xlog_grant_sub_space(xlog_t *log, int bytes, int type)
+{
+	XLOG_GRANT_SUB_SPACE(log, bytes, type);
+}
+#endif
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_macros.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_macros.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_macros.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_macros.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_MACROS_H__
+#define __XFS_MACROS_H__
+
+/*
+ * Set for debug builds and for kernel builds, but not for
+ * standalone.  These replacements save space.
+ * Used in xfs_macros.c.
+ */
+#define XFS_WANT_SPACE_C	\
+	(!defined(_STANDALONE) && \
+	 (defined(DEBUG) || (defined(_KERNEL))))
+
+/*
+ * Set for debug builds (simulation or kernel), but not for standalone.
+ * These replacements do not save space.
+ * Used in xfs_macros.c.
+ */
+#define XFS_WANT_FUNCS_C	\
+	(!defined(_STANDALONE) && defined(DEBUG))
+
+/*
+ * Corresponding names used in .h files.
+ */
+#define XFS_WANT_SPACE	(XFS_WANT_SPACE_C && !defined(XFS_MACRO_C))
+#define XFS_WANT_FUNCS	(XFS_WANT_FUNCS_C && !defined(XFS_MACRO_C))
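+
+/*
+ * Net effect: when XFSSO_<NAME> is set below, the headers see
+ * XFS_WANT_SPACE/XFS_WANT_FUNCS and declare <name>() as a real
+ * function, while xfs_macros.c (which is expected to define
+ * XFS_MACRO_C) sees the _C variants and supplies the function body
+ * by expanding the original macro.
+ */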
+
+/*
+ * These are the macros that get turned into functions to save space.
+ */
+#define XFSSO_NULLSTARTBLOCK 1
+#define XFSSO_XFS_AGB_TO_DADDR 1
+#define XFSSO_XFS_AGB_TO_FSB 1
+#define XFSSO_XFS_AGINO_TO_INO 1
+#define XFSSO_XFS_ALLOC_BLOCK_MINRECS 1
+#define XFSSO_XFS_ATTR_SF_NEXTENTRY 1
+#define XFSSO_XFS_BMAP_BLOCK_DMAXRECS 1
+#define XFSSO_XFS_BMAP_BLOCK_IMAXRECS 1
+#define XFSSO_XFS_BMAP_BLOCK_IMINRECS 1
+#define XFSSO_XFS_BMAP_INIT 1
+#define XFSSO_XFS_BMAP_PTR_IADDR 1
+#define XFSSO_XFS_BMAP_SANITY_CHECK 1
+#define XFSSO_XFS_BMAPI_AFLAG 1
+#define XFSSO_XFS_CFORK_SIZE 1
+#define XFSSO_XFS_DA_COOKIE_BNO 1
+#define XFSSO_XFS_DA_COOKIE_ENTRY 1
+#define XFSSO_XFS_DADDR_TO_AGBNO 1
+#define XFSSO_XFS_DADDR_TO_FSB 1
+#define XFSSO_XFS_DFORK_PTR 1
+#define XFSSO_XFS_DIR_SF_GET_DIRINO 1
+#define XFSSO_XFS_DIR_SF_NEXTENTRY 1
+#define XFSSO_XFS_DIR_SF_PUT_DIRINO 1
+#define XFSSO_XFS_FILBLKS_MIN 1
+#define XFSSO_XFS_FSB_SANITY_CHECK 1
+#define XFSSO_XFS_FSB_TO_DADDR 1
+#define XFSSO_XFS_FSB_TO_DB 1
+#define XFSSO_XFS_IALLOC_INODES 1
+#define XFSSO_XFS_IFORK_ASIZE 1
+#define XFSSO_XFS_IFORK_DSIZE 1
+#define XFSSO_XFS_IFORK_FORMAT 1
+#define XFSSO_XFS_IFORK_NEXT_SET 1
+#define XFSSO_XFS_IFORK_NEXTENTS 1
+#define XFSSO_XFS_IFORK_PTR 1
+#define XFSSO_XFS_ILOG_FBROOT 1
+#define XFSSO_XFS_ILOG_FEXT 1
+#define XFSSO_XFS_INO_MASK 1
+#define XFSSO_XFS_INO_TO_FSB 1
+#define XFSSO_XFS_INODE_CLEAR_READ_AHEAD 1
+#define XFSSO_XFS_MIN_FREELIST 1
+#define XFSSO_XFS_SB_GOOD_VERSION 1
+#define XFSSO_XFS_SB_VERSION_HASNLINK 1
+#define XFSSO_XLOG_GRANT_ADD_SPACE 1
+#define XFSSO_XLOG_GRANT_SUB_SPACE 1
+
+#endif	/* __XFS_MACROS_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_mount.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_mount.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_mount.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_mount.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1736 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+STATIC void	xfs_mount_reset_sbqflags(xfs_mount_t *);
+STATIC void	xfs_mount_log_sbunit(xfs_mount_t *, __int64_t);
+STATIC int	xfs_uuid_mount(xfs_mount_t *);
+
+mutex_t		xfs_uuidtabmon;		/* monitor for uuidtab */
+STATIC int	xfs_uuidtab_size;
+STATIC uuid_t	*xfs_uuidtab;
+
+STATIC void	xfs_uuid_unmount(xfs_mount_t *);
+
+void xfs_xlatesb(void *, xfs_sb_t *, int, xfs_arch_t, __int64_t);
+
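+/*
+ * Byte offset (and type) of each superblock field, indexed by
+ * xfs_sb_field_t.  xfs_xlatesb() computes a field's size as the
+ * difference between consecutive offsets, which is why the table
+ * ends with a sizeof(xfs_sb_t) sentinel entry.
+ */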
+static struct {
+    short offset;
+    short type;	    /* 0 = integer
+		     * 1 = binary / string (no translation)
+		     */
+} xfs_sb_info[] = {
+    { offsetof(xfs_sb_t, sb_magicnum),	 0 },
+    { offsetof(xfs_sb_t, sb_blocksize),	 0 },
+    { offsetof(xfs_sb_t, sb_dblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_rblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_rextents),	 0 },
+    { offsetof(xfs_sb_t, sb_uuid),	 1 },
+    { offsetof(xfs_sb_t, sb_logstart),	 0 },
+    { offsetof(xfs_sb_t, sb_rootino),	 0 },
+    { offsetof(xfs_sb_t, sb_rbmino),	 0 },
+    { offsetof(xfs_sb_t, sb_rsumino),	 0 },
+    { offsetof(xfs_sb_t, sb_rextsize),	 0 },
+    { offsetof(xfs_sb_t, sb_agblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_agcount),	 0 },
+    { offsetof(xfs_sb_t, sb_rbmblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_logblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_versionnum), 0 },
+    { offsetof(xfs_sb_t, sb_sectsize),	 0 },
+    { offsetof(xfs_sb_t, sb_inodesize),	 0 },
+    { offsetof(xfs_sb_t, sb_inopblock),	 0 },
+    { offsetof(xfs_sb_t, sb_fname[0]),	 1 },
+    { offsetof(xfs_sb_t, sb_blocklog),	 0 },
+    { offsetof(xfs_sb_t, sb_sectlog),	 0 },
+    { offsetof(xfs_sb_t, sb_inodelog),	 0 },
+    { offsetof(xfs_sb_t, sb_inopblog),	 0 },
+    { offsetof(xfs_sb_t, sb_agblklog),	 0 },
+    { offsetof(xfs_sb_t, sb_rextslog),	 0 },
+    { offsetof(xfs_sb_t, sb_inprogress), 0 },
+    { offsetof(xfs_sb_t, sb_imax_pct),	 0 },
+    { offsetof(xfs_sb_t, sb_icount),	 0 },
+    { offsetof(xfs_sb_t, sb_ifree),	 0 },
+    { offsetof(xfs_sb_t, sb_fdblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_frextents),	 0 },
+    { offsetof(xfs_sb_t, sb_uquotino),	 0 },
+    { offsetof(xfs_sb_t, sb_gquotino),	 0 },
+    { offsetof(xfs_sb_t, sb_qflags),	 0 },
+    { offsetof(xfs_sb_t, sb_flags),	 0 },
+    { offsetof(xfs_sb_t, sb_shared_vn),	 0 },
+    { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
+    { offsetof(xfs_sb_t, sb_unit),	 0 },
+    { offsetof(xfs_sb_t, sb_width),	 0 },
+    { offsetof(xfs_sb_t, sb_dirblklog),	 0 },
+    { offsetof(xfs_sb_t, sb_dummy),	 1 },
+    { offsetof(xfs_sb_t, sb_logsunit),	 0 },
+    { sizeof(xfs_sb_t),			 0 }
+};
+
+/*
+ * Return a pointer to an initialized xfs_mount structure.
+ */
+xfs_mount_t *
+xfs_mount_init(void)
+{
+	xfs_mount_t *mp;
+
+	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
+
+	AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
+	spinlock_init(&mp->m_sb_lock, "xfs_sb");
+	mutex_init(&mp->m_ilock, MUTEX_DEFAULT, "xfs_ilock");
+	initnsema(&mp->m_growlock, 1, "xfs_grow");
+	/*
+	 * Initialize the AIL.
+	 */
+	xfs_trans_ail_init(mp);
+
+	/* Init freeze sync structures */
+	spinlock_init(&mp->m_freeze_lock, "xfs_freeze");
+	init_sv(&mp->m_wait_unfreeze, SV_DEFAULT, "xfs_freeze", 0);
+	atomic_set(&mp->m_active_trans, 0);
+
+	return mp;
+}	/* xfs_mount_init */
+
+/*
+ * Free up the resources associated with a mount structure.  Assume that
+ * the structure was initially zeroed, so we can tell which fields got
+ * initialized.
+ */
+void
+xfs_mount_free(
+	xfs_mount_t *mp,
+	int	    remove_bhv)
+{
+	if (mp->m_ihash)
+		xfs_ihash_free(mp);
+	if (mp->m_chash)
+		xfs_chash_free(mp);
+
+	if (mp->m_perag) {
+		int	agno;
+
+		for (agno = 0; agno < mp->m_maxagi; agno++)
+			if (mp->m_perag[agno].pagb_list)
+				kmem_free(mp->m_perag[agno].pagb_list,
+				  sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS);
+		kmem_free(mp->m_perag,
+			  sizeof(xfs_perag_t) * mp->m_sb.sb_agcount);
+	}
+
+#if 0
+	/*
+	 * XXXdpd - Doesn't work now for shutdown case.
+	 * Should at least free the memory.
+	 */
+	ASSERT(mp->m_ail.ail_back == (xfs_log_item_t*)&(mp->m_ail));
+	ASSERT(mp->m_ail.ail_forw == (xfs_log_item_t*)&(mp->m_ail));
+#endif
+	AIL_LOCK_DESTROY(&mp->m_ail_lock);
+	spinlock_destroy(&mp->m_sb_lock);
+	mutex_destroy(&mp->m_ilock);
+	freesema(&mp->m_growlock);
+
+	if (mp->m_fsname != NULL) {
+		kmem_free(mp->m_fsname, mp->m_fsname_len);
+	}
+	if (mp->m_quotainfo != NULL) {
+		xfs_qm_unmount_quotadestroy(mp);
+	}
+
+	if (remove_bhv) {
+		VFS_REMOVEBHV(XFS_MTOVFS(mp), &mp->m_bhv);
+	}
+	spinlock_destroy(&mp->m_freeze_lock);
+	sv_destroy(&mp->m_wait_unfreeze);
+	kmem_free(mp, sizeof(xfs_mount_t));
+}
+
+
+/*
+ * Check the validity of the SB found.
+ */
+STATIC int
+xfs_mount_validate_sb(
+	xfs_mount_t	*mp,
+	xfs_sb_t	*sbp)
+{
+	/*
+	 * If the log device and data device have the
+	 * same device number, the log is internal.
+	 * Consequently, the sb_logstart should be non-zero.  If
+	 * we have a zero sb_logstart in this case, we may be trying to mount
+	 * a volume filesystem in a non-volume manner.
+	 */
+	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+		cmn_err(CE_WARN, "XFS: bad magic number");
+		return XFS_ERROR(EWRONGFS);
+	}
+
+	if (!XFS_SB_GOOD_VERSION(sbp)) {
+		cmn_err(CE_WARN, "XFS: bad version");
+		return XFS_ERROR(EWRONGFS);
+	}
+
+	if (sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp) {
+		cmn_err(CE_WARN, "XFS: filesystem is marked as having an external log; specify logdev on the\nmount command line.");
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp) {
+		cmn_err(CE_WARN, "XFS: filesystem is marked as having an internal log; don't specify logdev on\nthe mount command line.");
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	/*
+	 * More sanity checking. These were stolen directly from
+	 * xfs_repair.
+	 */
+	if (sbp->sb_blocksize <= 0					||
+	    sbp->sb_agcount <= 0					||
+	    sbp->sb_sectsize <= 0					||
+	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
+	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
+	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
+	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
+	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
+	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)	||
+	    sbp->sb_imax_pct > 100) {
+		cmn_err(CE_WARN, "XFS: SB sanity check 1 failed");
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	/*
+	 * sanity check ag count, size fields against data size field
+	 */
+	if (sbp->sb_dblocks == 0 ||
+	    sbp->sb_dblocks >
+	     (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
+	    sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
+			      sbp->sb_agblocks + XFS_MIN_AG_BLOCKS) {
+		cmn_err(CE_WARN, "XFS: SB sanity check 2 failed");
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+#if !XFS_BIG_FILESYSTEMS
+	if (sbp->sb_dblocks > INT_MAX || sbp->sb_rblocks > INT_MAX)  {
+		cmn_err(CE_WARN,
+"XFS:  File systems greater than 1TB not supported on this system.");
+		return XFS_ERROR(E2BIG);
+	}
+#endif
+
+	if (sbp->sb_inprogress) {
+		cmn_err(CE_WARN, "XFS: file system busy");
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	/*
+	 * Until this is fixed, only page-sized or smaller data blocks work.
+	 */
+	if (sbp->sb_blocksize > PAGE_SIZE) {
+		cmn_err(CE_WARN,
+		"XFS: Trying to mount file system with blocksize %d bytes",
+			sbp->sb_blocksize);
+		cmn_err(CE_WARN,
+		"XFS: Only page-sized (%d bytes) or less blocksizes currently work.",
+			PAGE_SIZE);
+		return XFS_ERROR(EWRONGFS);
+	}
+	return (0);
+}
+
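+/*
+ * Set up the per-allocation-group structures, deciding which AGs are
+ * allowed to hold inodes.  If inode numbers on this filesystem could
+ * overflow 32 bits and XFS_MOUNT_32BITINODES is set, only the leading
+ * AGs whose inode numbers still fit in 32 bits are marked inode-ok,
+ * with the first max_metadata of them preferred for metadata.
+ */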
+void
+xfs_initialize_perag(xfs_mount_t *mp, int agcount)
+{
+	int		index, max_metadata;
+	xfs_perag_t	*pag;
+	xfs_agino_t	agino;
+	xfs_ino_t	ino;
+	xfs_sb_t	*sbp = &mp->m_sb;
+	xfs_ino_t	max_inum = XFS_MAXINUMBER_32;
+
+	/* Check to see if the filesystem can overflow 32 bit inodes */
+	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+
+	/* Clear the mount flag if no inode can overflow 32 bits
+	 * on this filesystem.
+	 */
+	if (ino <= max_inum) {
+		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
+	}
+
+	/* If we can overflow then setup the ag headers accordingly */
+	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+		/* Calculate how much should be reserved for inodes to
+		 * meet the max inode percentage.
+		 */
+		if (mp->m_maxicount) {
+			__uint64_t	icount;
+
+			icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+			do_div(icount, 100);
+			icount += sbp->sb_agblocks - 1;
+			do_div(icount, mp->m_ialloc_blks);
+			max_metadata = icount;
+		} else {
+			max_metadata = agcount;
+		}
+		for (index = 0; index < agcount; index++) {
+			ino = XFS_AGINO_TO_INO(mp, index, agino);
+			if (ino > max_inum) {
+				index++;
+				break;
+			}
+
+			/* This ag is preferred for inodes */
+			pag = &mp->m_perag[index];
+			pag->pagi_inodeok = 1;
+			if (index < max_metadata)
+				pag->pagf_metadata = 1;
+		}
+	} else {
+		/* Setup default behavior for smaller filesystems */
+		for (index = 0; index < agcount; index++) {
+			pag = &mp->m_perag[index];
+			pag->pagi_inodeok = 1;
+		}
+	}
+	mp->m_maxagi = index;
+}
+
+/*
+ * xfs_xlatesb
+ *
+ *     data	  - on disk version of sb
+ *     sb	  - a superblock
+ *     dir	  - conversion direction: <0 - convert sb to buf
+ *					  >0 - convert buf to sb
+ *     arch	  - architecture to read/write from/to buf
+ *     fields	  - which fields to copy (bitmask)
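+ *
+ * For example, xfs_readsb() below pulls the on-disk superblock
+ * into the in-core copy with:
+ *
+ *	xfs_xlatesb(XFS_BUF_PTR(bp), &mp->m_sb, 1, ARCH_CONVERT,
+ *		    XFS_SB_ALL_BITS);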
+ */
+void
+xfs_xlatesb(
+	void		*data,
+	xfs_sb_t	*sb,
+	int		dir,
+	xfs_arch_t	arch,
+	__int64_t	fields)
+{
+	xfs_caddr_t	buf_ptr;
+	xfs_caddr_t	mem_ptr;
+	xfs_sb_field_t	f;
+	int		first;
+	int		size;
+
+	ASSERT(dir);
+	ASSERT(fields);
+
+	if (!fields)
+		return;
+
+	buf_ptr = (xfs_caddr_t)data;
+	mem_ptr = (xfs_caddr_t)sb;
+
+	while (fields) {
+		f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+		first = xfs_sb_info[f].offset;
+		size = xfs_sb_info[f + 1].offset - first;
+
+		ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
+
+		if (arch == ARCH_NOCONVERT ||
+		    size == 1 ||
+		    xfs_sb_info[f].type == 1) {
+			if (dir > 0) {
+				bcopy(buf_ptr + first, mem_ptr + first, size);
+			} else {
+				bcopy(mem_ptr + first, buf_ptr + first, size);
+			}
+		} else {
+			switch (size) {
+			case 2:
+				INT_XLATE(*(__uint16_t*)(buf_ptr+first),
+					  *(__uint16_t*)(mem_ptr+first),
+					  dir, arch);
+				break;
+			case 4:
+				INT_XLATE(*(__uint32_t*)(buf_ptr+first),
+					  *(__uint32_t*)(mem_ptr+first),
+					  dir, arch);
+				break;
+			case 8:
+				INT_XLATE(*(__uint64_t*)(buf_ptr+first),
+					  *(__uint64_t*)(mem_ptr+first), dir, arch);
+				break;
+			default:
+				ASSERT(0);
+			}
+		}
+
+		fields &= ~(1LL << f);
+	}
+}
+
+/*
+ * xfs_readsb
+ *
+ * Does the initial read of the superblock.
+ */
+int
+xfs_readsb(xfs_mount_t *mp)
+{
+	xfs_buf_t	*bp;
+	xfs_sb_t	*sbp;
+	int		error = 0;
+
+	ASSERT(mp->m_sb_bp == 0);
+
+	/*
+	 * Allocate a (locked) buffer to hold the superblock.
+	 * This will be kept around at all times to optimize
+	 * access to the superblock.
+	 */
+	bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, 1,
+		PBF_LOCK|PBF_READ|PBF_MAPPED|PBF_MAPPABLE|PBF_FS_MANAGED);
+	ASSERT(bp != NULL);
+	ASSERT(XFS_BUF_ISBUSY(bp) && XFS_BUF_VALUSEMA(bp) <= 0);
+
+	/*
+	 * Initialize the mount structure from the superblock.
+	 * But first do some basic consistency checking.
+	 */
+	sbp = XFS_BUF_TO_SBP(bp);
+	xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, ARCH_CONVERT, XFS_SB_ALL_BITS);
+	if ((error = xfs_mount_validate_sb(mp, &(mp->m_sb)))) {
+		cmn_err(CE_WARN, "XFS: SB validate failed");
+		goto err;
+	}
+
+	mp->m_sb_bp = bp;
+	xfs_buf_relse(bp);
+	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
+	return 0;
+
+ err:
+	bp->pb_flags &= ~PBF_FS_MANAGED;
+	xfs_buf_relse(bp);
+	return error;
+}
+
+
+/*
+ * xfs_mount_common
+ *
+ * Mount initialization code establishing various mount
+ * fields from the superblock associated with the given
+ * mount structure
+ */
+void
+xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
+{
+	int	i;
+
+	mp->m_agfrotor = mp->m_agirotor = 0;
+	mp->m_maxagi = mp->m_sb.sb_agcount;
+	mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
+	mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
+	mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+	mp->m_litino = sbp->sb_inodesize -
+		((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
+	mp->m_blockmask = sbp->sb_blocksize - 1;
+	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+	mp->m_blockwmask = mp->m_blockwsize - 1;
+
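+	/*
+	 * For version-2 logs, derive the log stripe mask: the largest
+	 * power of two, in basic blocks, not exceeding sb_logsunit.
+	 */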
+	if (XFS_SB_VERSION_HASLOGV2(sbp)) {
+		if (sbp->sb_logsunit <= 1) {
+			mp->m_lstripemask = 1;
+		} else {
+			mp->m_lstripemask =
+				1 << xfs_highbit32(sbp->sb_logsunit >> BBSHIFT);
+		}
+	}
+
+	/*
+	 * Setup for attributes, in case they get created.
+	 * This value is for inodes getting attributes for the first time,
+	 * the per-inode value is for old attribute values.
+	 */
+	ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
+	switch (sbp->sb_inodesize) {
+	case 256:
+		mp->m_attroffset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(2);
+		break;
+	case 512:
+	case 1024:
+	case 2048:
+		mp->m_attroffset = XFS_BMDR_SPACE_CALC(12);
+		break;
+	default:
+		ASSERT(0);
+	}
+	ASSERT(mp->m_attroffset < XFS_LITINO(mp));
+
+	for (i = 0; i < 2; i++) {
+		mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
+			xfs_alloc, i == 0);
+		mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
+			xfs_alloc, i == 0);
+	}
+	for (i = 0; i < 2; i++) {
+		mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
+			xfs_bmbt, i == 0);
+		mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
+			xfs_bmbt, i == 0);
+	}
+	for (i = 0; i < 2; i++) {
+		mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
+			xfs_inobt, i == 0);
+		mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
+			xfs_inobt, i == 0);
+	}
+
+	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
+	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
+					sbp->sb_inopblock);
+	mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+}
+
+extern void xfs_refcache_sbdirty(struct super_block*);
+
+/*
+ * xfs_mountfs
+ *
+ * This function does the following on an initial mount of a file system:
+ *	- reads the superblock from disk and init the mount struct
+ *	- if we're a 32-bit kernel, do a size check on the superblock
+ *		so we don't mount terabyte filesystems
+ *	- init mount struct realtime fields
+ *	- allocate inode hash table for fs
+ *	- init directory manager
+ *	- perform recovery and init the log manager
+ */
+int
+xfs_mountfs(
+	vfs_t		*vfsp,
+	xfs_mount_t	*mp,
+	dev_t		dev,
+	int		mfsi_flags)
+{
+	xfs_buf_t	*bp;
+	xfs_sb_t	*sbp = &(mp->m_sb);
+	int		error = 0;
+	xfs_inode_t	*rip;
+	vnode_t		*rvp = 0;
+	int		readio_log;
+	int		writeio_log;
+	vmap_t		vmap;
+	xfs_daddr_t	d;
+	extern xfs_ioops_t xfs_iocore_xfs;	/* from xfs_iocore.c */
+	__uint64_t	ret64;
+	uint		quotaflags, quotaondisk, rootqcheck, needquotacheck;
+	boolean_t	needquotamount;
+	__int64_t	update_flags;
+	int		agno, noio;
+	int		uuid_mounted = 0;
+
+	noio = dev == 0 && mp->m_sb_bp != NULL;
+	if (mp->m_sb_bp == NULL) {
+		if ((error = xfs_readsb(mp))) {
+			return (error);
+		}
+	}
+	xfs_mount_common(mp, sbp);
+
+	/*
+	 * Check if sb_agblocks is aligned at stripe boundary
+	 * If sb_agblocks is NOT aligned turn off m_dalign since
+	 * allocator alignment is within an ag, therefore ag has
+	 * to be aligned at stripe boundary.
+	 */
+	update_flags = 0LL;
+	if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) {
+		/*
+		 * If stripe unit and stripe width are not multiples
+		 * of the fs blocksize turn off alignment.
+		 */
+		if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
+		    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+			if (mp->m_flags & XFS_MOUNT_RETERR) {
+				cmn_err(CE_WARN, "XFS: alignment check 1 failed");
+				error = XFS_ERROR(EINVAL);
+				goto error1;
+			}
+			mp->m_dalign = mp->m_swidth = 0;
+		} else {
+			/*
+			 * Convert the stripe unit and width to FSBs.
+			 */
+			mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+			if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
+				if (mp->m_flags & XFS_MOUNT_RETERR) {
+					error = XFS_ERROR(EINVAL);
+					goto error1;
+				}
+				mp->m_dalign = 0;
+				mp->m_swidth = 0;
+			} else if (mp->m_dalign) {
+				mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
+			} else {
+				if (mp->m_flags & XFS_MOUNT_RETERR) {
+					cmn_err(CE_WARN, "XFS: alignment check 3 failed");
+					error = XFS_ERROR(EINVAL);
+					goto error1;
+				}
+				mp->m_swidth = 0;
+			}
+		}
+
+		/*
+		 * Update superblock with new values
+		 * and log changes
+		 */
+		if (XFS_SB_VERSION_HASDALIGN(sbp)) {
+			if (sbp->sb_unit != mp->m_dalign) {
+				sbp->sb_unit = mp->m_dalign;
+				update_flags |= XFS_SB_UNIT;
+			}
+			if (sbp->sb_width != mp->m_swidth) {
+				sbp->sb_width = mp->m_swidth;
+				update_flags |= XFS_SB_WIDTH;
+			}
+		}
+	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
+		   XFS_SB_VERSION_HASDALIGN(&mp->m_sb)) {
+		mp->m_dalign = sbp->sb_unit;
+		mp->m_swidth = sbp->sb_width;
+	}
+
+	xfs_alloc_compute_maxlevels(mp);
+	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
+	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
+	xfs_ialloc_compute_maxlevels(mp);
+
+	if (sbp->sb_imax_pct) {
+		__uint64_t	icount;
+
+		/* Make sure the maximum inode count is a multiple of the
+		 * units we allocate inodes in.
+		 */
+
+		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+		do_div(icount, 100);
+		do_div(icount, mp->m_ialloc_blks);
+		mp->m_maxicount = (icount * mp->m_ialloc_blks)	<<
+				   sbp->sb_inopblog;
+	} else
+		mp->m_maxicount = 0;
+
+	/*
+	 * XFS uses the uuid from the superblock as the unique
+	 * identifier for fsid.  We cannot use the uuid from the volume
+	 * since a single partition filesystem is identical to a single
+	 * partition volume/filesystem.
+	 */
+	if ((mfsi_flags & XFS_MFSI_SECOND) == 0 && (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
+		if (xfs_uuid_mount(mp)) {
+			error = XFS_ERROR(EINVAL);
+			goto error1;
+		}
+		uuid_mounted=1;
+		ret64 = uuid_hash64(&sbp->sb_uuid);
+		bcopy(&ret64, &vfsp->vfs_fsid, sizeof(ret64));
+	}
+
+	/*
+	 * Set the default minimum read and write sizes unless
+	 * already specified in a mount option.
+	 * We use smaller I/O sizes when the file system
+	 * is being used for NFS service (wsync mount option).
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+		if (mp->m_flags & XFS_MOUNT_WSYNC) {
+			readio_log = XFS_WSYNC_READIO_LOG;
+			writeio_log = XFS_WSYNC_WRITEIO_LOG;
+		} else {
+			readio_log = XFS_READIO_LOG_LARGE;
+			writeio_log = XFS_WRITEIO_LOG_LARGE;
+		}
+	} else {
+		readio_log = mp->m_readio_log;
+		writeio_log = mp->m_writeio_log;
+	}
+
+	/*
+	 * Set the number of readahead buffers to use based on
+	 * physical memory size.
+	 */
+	if (xfs_physmem <= 4096)		/* <= 16MB */
+		mp->m_nreadaheads = XFS_RW_NREADAHEAD_16MB;
+	else if (xfs_physmem <= 8192)	/* <= 32MB */
+		mp->m_nreadaheads = XFS_RW_NREADAHEAD_32MB;
+	else
+		mp->m_nreadaheads = XFS_RW_NREADAHEAD_K32;
+	if (sbp->sb_blocklog > readio_log) {
+		mp->m_readio_log = sbp->sb_blocklog;
+	} else {
+		mp->m_readio_log = readio_log;
+	}
+	mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
+	if (sbp->sb_blocklog > writeio_log) {
+		mp->m_writeio_log = sbp->sb_blocklog;
+	} else {
+		mp->m_writeio_log = writeio_log;
+	}
+	mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
+
+	/*
+	 * Set the inode cluster size based on the physical memory
+	 * size.  This may still be overridden by the file system
+	 * block size if it is larger than the chosen cluster size.
+	 */
+	if (xfs_physmem <= btoc(32 * 1024 * 1024)) { /* <= 32 MB */
+		mp->m_inode_cluster_size = XFS_INODE_SMALL_CLUSTER_SIZE;
+	} else {
+		mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
+	}
+	/*
+	 * Set whether we're using inode alignment.
+	 */
+	if (XFS_SB_VERSION_HASALIGN(&mp->m_sb) &&
+	    mp->m_sb.sb_inoalignmt >=
+	    XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
+		mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
+	else
+		mp->m_inoalign_mask = 0;
+	/*
+	 * If we are using stripe alignment, check whether
+	 * the stripe unit is a multiple of the inode alignment
+	 */
+	if (mp->m_dalign && mp->m_inoalign_mask &&
+	    !(mp->m_dalign & mp->m_inoalign_mask))
+		mp->m_sinoalign = mp->m_dalign;
+	else
+		mp->m_sinoalign = 0;
+	/*
+	 * Check that the data (and log if separate) are an ok size.
+	 */
+	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
+		cmn_err(CE_WARN, "XFS: size check 1 failed");
+		error = XFS_ERROR(E2BIG);
+		goto error1;
+	}
+	if (!noio) {
+		error = xfs_read_buf(mp, mp->m_ddev_targp, d - 1, 1, 0, &bp);
+		if (!error) {
+			xfs_buf_relse(bp);
+		} else {
+			cmn_err(CE_WARN, "XFS: size check 2 failed");
+			if (error == ENOSPC) {
+				error = XFS_ERROR(E2BIG);
+			}
+			goto error1;
+		}
+	}
+
+	if (!noio && ((mfsi_flags & XFS_MFSI_CLIENT) == 0) &&
+	    mp->m_logdev_targp != mp->m_ddev_targp) {
+		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
+			cmn_err(CE_WARN, "XFS: size check 3 failed");
+			error = XFS_ERROR(E2BIG);
+			goto error1;
+		}
+		error = xfs_read_buf(mp, mp->m_logdev_targp, d - 1, 1, 0, &bp);
+		if (!error) {
+			xfs_buf_relse(bp);
+		} else {
+			cmn_err(CE_WARN, "XFS: size check 3 failed");
+			if (error == ENOSPC) {
+				error = XFS_ERROR(E2BIG);
+			}
+			goto error1;
+		}
+	}
+
+	/*
+	 * Disallow mount attempts with (IRIX) project quota enabled
+	 */
+	if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
+	    (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT)) {
+		cmn_err(CE_WARN, "XFS: IRIX project quotas are enabled");
+		error = XFS_ERROR(ENOSYS);
+		goto error1;
+	}
+
+	/*
+	 * Initialize realtime fields in the mount structure
+	 */
+	if ((error = xfs_rtmount_init(mp))) {
+		cmn_err(CE_WARN, "XFS: RT mount failed");
+		goto error1;
+	}
+
+	/*
+	 * For client case we are done now
+	 */
+	if (mfsi_flags & XFS_MFSI_CLIENT) {
+		return(0);
+	}
+
+	/*
+	 * Set up timer list structure for nfs refcache
+	 */
+	init_timer(&mp->m_sbdirty_timer);
+	mp->m_sbdirty_timer.function = (void (*)(unsigned long)) xfs_refcache_sbdirty;
+
+	/* Initialize the I/O function vector with XFS functions */
+	mp->m_io_ops = xfs_iocore_xfs;
+
+	/*
+	 *  Copies the low order bits of the timestamp and the randomly
+	 *  set "sequence" number out of a UUID.
+	 */
+	uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
+
+	/*
+	 *  The vfs structure needs to have a file system independent
+	 *  way of checking for the invariant file system ID.  Since it
+	 *  can't look at mount structures it has a pointer to the data
+	 *  in the mount structure.
+	 *
+	 *  File systems that don't support user level file handles (i.e.
+	 *  all of them except for XFS) will leave vfs_altfsid as NULL.
+	 */
+	vfsp->vfs_altfsid = (fsid_t *)mp->m_fixedfsid;
+	mp->m_dmevmask = 0;	/* not persistent; set after each mount */
+
+	/*
+	 * Select the right directory manager.
+	 */
+	mp->m_dirops =
+		XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
+			xfsv2_dirops :
+			xfsv1_dirops;
+
+	/*
+	 * Initialize directory manager's entries.
+	 */
+	XFS_DIR_MOUNT(mp);
+
+	/*
+	 * Initialize the attribute manager's entries.
+	 */
+	mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;
+
+	/*
+	 * Initialize the precomputed transaction reservations values.
+	 */
+	xfs_trans_init(mp);
+	if (noio) {
+		ASSERT((mfsi_flags & XFS_MFSI_CLIENT) == 0);
+		return 0;
+	}
+
+	/*
+	 * Allocate and initialize the inode hash table for this
+	 * file system.
+	 */
+	xfs_ihash_init(mp);
+	xfs_chash_init(mp);
+
+	/*
+	 * Allocate and initialize the per-ag data.
+	 */
+	init_rwsem(&mp->m_peraglock);
+	mp->m_perag =
+		kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP);
+
+	xfs_initialize_perag(mp, sbp->sb_agcount);
+
+	/*
+	 * log's mount-time initialization. Perform 1st part recovery if needed
+	 */
+	if (sbp->sb_logblocks > 0) {		/* check for volume case */
+		error = xfs_log_mount(mp, mp->m_logdev_targp->pbr_dev,
+				      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
+				      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
+		if (error) {
+			cmn_err(CE_WARN, "XFS: log mount failed");
+			goto error2;
+		}
+	} else {	/* No log has been defined */
+		cmn_err(CE_WARN, "XFS: no log defined");
+		error = XFS_ERROR(EFSCORRUPTED);
+		goto error2;
+	}
+
+	/*
+	 * Get and sanity-check the root inode.
+	 * Save the pointer to it in the mount structure.
+	 */
+	error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_ILOCK_EXCL, &rip, 0);
+	if (error) {
+		cmn_err(CE_WARN, "XFS: failed to read root inode");
+		goto error3;
+	}
+
+	ASSERT(rip != NULL);
+	rvp = XFS_ITOV(rip);
+	if ((rip->i_d.di_mode & IFMT) != IFDIR) {
+		cmn_err(CE_WARN, "XFS: corrupted root inode");
+		VMAP(rvp, rip, vmap);
+		prdev("Root inode %llu is not a directory",
+		      mp->m_dev, (unsigned long long)rip->i_ino);
+		rvp->v_flag |= VPURGE;
+		xfs_iunlock(rip, XFS_ILOCK_EXCL);
+		VN_RELE(rvp);
+		vn_purge(rvp, &vmap);
+		error = XFS_ERROR(EFSCORRUPTED);
+		goto error3;
+	}
+	mp->m_rootip = rip;	/* save it */
+
+	xfs_iunlock(rip, XFS_ILOCK_EXCL);
+
+	quotaondisk = XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
+		mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT);
+
+	/*
+	 * If the device itself is read-only, we can't allow
+	 * the user to change the state of quota on the mount -
+	 * this would generate a transaction on the ro device,
+	 * which would lead to an I/O error and shutdown
+	 */
+
+	if (((quotaondisk && !XFS_IS_QUOTA_ON(mp)) ||
+	      (!quotaondisk && XFS_IS_QUOTA_ON(mp))) &&
+	    (is_read_only(mp->m_dev) || is_read_only(mp->m_logdev_targp->pbr_dev))) {
+		cmn_err(CE_WARN,
+			"XFS: device %s is read-only, cannot change "
+			"quota state.  Please mount with%s quota option.",
+			mp->m_fsname, quotaondisk ? "" : "out");
+		rvp->v_flag |= VPURGE;
+		VN_RELE(rvp);
+		vn_remove(rvp);
+		error = XFS_ERROR(EPERM);
+		goto error3;
+	}
+
+	/*
+	 * Initialize realtime inode pointers in the mount structure
+	 */
+	if ((error = xfs_rtmount_inodes(mp))) {
+		/*
+		 * Free up the root inode.
+		 */
+		cmn_err(CE_WARN, "XFS: failed to read RT inodes");
+		rvp->v_flag |= VPURGE;
+		VMAP(rvp, rip, vmap);
+		VN_RELE(rvp);
+		vn_purge(rvp, &vmap);
+		goto error3;
+	}
+
+	/*
+	 * If fs is not mounted readonly, then update the superblock
+	 * unit and width changes.
+	 */
+	if (update_flags && !(vfsp->vfs_flag & VFS_RDONLY))
+		xfs_mount_log_sbunit(mp, update_flags);
+
+	quotaflags = 0;
+	needquotamount = B_FALSE;
+
+	/*
+	 * Figure out if we'll need to do a quotacheck.
+	 * The requirements are a little different depending on whether
+	 * this fs is root or not.
+	 */
+	rootqcheck = (mp->m_dev == rootdev && quotaondisk && 
+		      ((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT &&
+			(mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) ||
+		       (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT &&
+			(mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0)));
+	needquotacheck = rootqcheck || XFS_QM_NEED_QUOTACHECK(mp);
+	if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
+		/*
+		 * Call mount_quotas at this point only if we won't have to do
+		 * a quotacheck.
+		 */
+		if (quotaondisk && !needquotacheck) {
+			/*
+			 * If the xfs quota code isn't installed,
+			 * we have to reset the quotachk'd bit.
+			 * If an error occurred, qm_mount_quotas code
+			 * has already disabled quotas. So, just finish
+			 * mounting, and get on with the boring life
+			 * without disk quotas.
+			 */
+			if (xfs_qm_mount_quotas(mp))
+				xfs_mount_reset_sbqflags(mp);
+		} else {
+			/*
+			 * Clear the quota flags, but remember them. This
+			 * is so that the quota code doesn't get invoked
+			 * before we're ready. This can happen when an
+			 * inode goes inactive and wants to free blocks,
+			 * or via xfs_log_mount_finish.
+			 */
+			quotaflags = mp->m_qflags;
+			mp->m_qflags = 0;
+			needquotamount = B_TRUE;
+		}
+	}
+
+	/*
+	 * Finish recovering the file system.  This part needed to be
+	 * delayed until after the root and real-time bitmap inodes
+	 * were consistently read in.
+	 */
+	error = xfs_log_mount_finish(mp, mfsi_flags);
+	if (error) {
+		cmn_err(CE_WARN, "XFS: log mount finish failed");
+		goto error3;
+	}
+
+	if (needquotamount) {
+		ASSERT(mp->m_qflags == 0);
+		mp->m_qflags = quotaflags;
+		rootqcheck = ((XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) &&
+				mp->m_dev == rootdev && needquotacheck);
+		if (rootqcheck && (error = xfs_quotacheck_read_only(mp)))
+			goto error2;
+		if (xfs_qm_mount_quotas(mp))
+			xfs_mount_reset_sbqflags(mp);
+		if (rootqcheck)
+			XFS_MTOVFS(mp)->vfs_flag |= VFS_RDONLY;
+	}
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+	if (! (XFS_IS_QUOTA_ON(mp)))
+		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
+	else
+		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
+#endif
+
+#ifdef QUOTADEBUG
+	if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp))
+		cmn_err(CE_WARN, "XFS: mount internalqcheck failed");
+#endif
+
+	return (0);
+
+ error3:
+	xfs_log_unmount_dealloc(mp);
+ error2:
+	xfs_ihash_free(mp);
+	xfs_chash_free(mp);
+	for (agno = 0; agno < sbp->sb_agcount; agno++)
+		if (mp->m_perag[agno].pagb_list)
+			kmem_free(mp->m_perag[agno].pagb_list,
+			  sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS);
+	kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t));
+	mp->m_perag = NULL;
+	/* FALLTHROUGH */
+ error1:
+	if (uuid_mounted)
+		xfs_uuid_unmount(mp);
+	xfs_freesb(mp);
+	return error;
+}
+
+/*
+ * xfs_unmountfs
+ *
+ * This flushes out the inodes, dquots, and the superblock, unmounts the
+ * log and makes sure that incore structures are freed.
+ */
+int
+xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
+{
+	int		ndquots;
+#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
+	int64_t		fsid;
+#endif
+
+	xfs_iflush_all(mp, XFS_FLUSH_ALL);
+
+	/*
+	 * Purge the dquot cache.
+	 * None of the dquots should really be busy at this point.
+	 */
+	if (mp->m_quotainfo) {
+		while ((ndquots = xfs_qm_dqpurge_all(mp,
+						  XFS_QMOPT_UQUOTA|
+						  XFS_QMOPT_GQUOTA|
+						  XFS_QMOPT_UMOUNTING))) {
+			delay(ndquots * 10);
+		}
+	}
+
+	/*
+	 * Flush out the log synchronously so that we know for sure
+	 * that nothing is pinned.  This is important because bflush()
+	 * will skip pinned buffers.
+	 */
+	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+
+	xfs_binval(mp->m_ddev_targp);
+	if (mp->m_rtdev_targp) {
+		xfs_binval(mp->m_rtdev_targp);
+	}
+
+	xfs_unmountfs_writesb(mp);
+
+	xfs_log_unmount(mp);			/* Done! No more fs ops. */
+
+	xfs_freesb(mp);
+
+	/*
+	 * All inodes from this mount point should be freed.
+	 */
+	ASSERT(mp->m_inodes == NULL);
+
+	/*
+	 * We may have bufs that are in the process of getting written still.
+	 * We must wait for the I/O completion of those. The sync flag here
+	 * does a two-pass iteration through the bufcache.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		xfs_incore_relse(mp->m_ddev_targp, 0, 1); /* synchronous */
+	}
+
+	xfs_unmountfs_close(mp, cr);
+	if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
+		xfs_uuid_unmount(mp);
+
+#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
+	/*
+	 * clear all error tags on this filesystem
+	 */
+	bcopy(&(XFS_MTOVFS(mp)->vfs_fsid), &fsid, sizeof(int64_t));
+	(void) xfs_errortag_clearall_umount(fsid, mp->m_fsname, 0);
+#endif
+
+	xfs_mount_free(mp, 1);
+	return 0;
+}
+
+void
+xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr)
+{
+	int		have_logdev = (mp->m_logdev_targp != mp->m_ddev_targp);
+
+	if (mp->m_ddev_targp) {
+		pagebuf_lock_disable(mp->m_ddev_targp, 0);
+		mp->m_ddev_targp = NULL;
+	}
+	if (mp->m_rtdev_targp) {
+		pagebuf_lock_disable(mp->m_rtdev_targp, 1);
+		mp->m_rtdev_targp = NULL;
+	}
+	if (mp->m_logdev_targp && have_logdev) {
+		pagebuf_lock_disable(mp->m_logdev_targp, 1);
+		mp->m_logdev_targp = NULL;
+	}
+}
+
+int
+xfs_unmountfs_writesb(xfs_mount_t *mp)
+{
+	xfs_buf_t	*sbp;
+	xfs_sb_t	*sb;
+	int		error = 0;
+
+	/*
+	 * skip superblock write if fs is read-only, or
+	 * if we are doing a forced umount.
+	 */
+	sbp = xfs_getsb(mp, 0);
+	if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
+		XFS_FORCED_SHUTDOWN(mp))) {
+		/*
+		 * mark shared-readonly if desired
+		 */
+		sb = XFS_BUF_TO_SBP(sbp);
+		if (mp->m_mk_sharedro) {
+			if (!(sb->sb_flags & XFS_SBF_READONLY))
+				sb->sb_flags |= XFS_SBF_READONLY;
+			if (!XFS_SB_VERSION_HASSHARED(sb))
+				XFS_SB_VERSION_ADDSHARED(sb);
+			xfs_fs_cmn_err(CE_NOTE, mp,
+				"Unmounting, marking shared read-only");
+		}
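+		/*
+		 * Reset the buffer state for a synchronous write of
+		 * the superblock.
+		 */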
+		XFS_BUF_UNDONE(sbp);
+		XFS_BUF_UNREAD(sbp);
+		XFS_BUF_UNDELAYWRITE(sbp);
+		XFS_BUF_WRITE(sbp);
+		XFS_BUF_UNASYNC(sbp);
+		ASSERT(XFS_BUF_TARGET_DEV(sbp) == mp->m_dev);
+		xfsbdstrat(mp, sbp);
+		/* Wait for the write; any error is reported below. */
+		error = xfs_iowait(sbp);
+		if (error)
+			xfs_ioerror_alert("xfs_unmountfs_writesb",
+					  mp, sbp, XFS_BUF_ADDR(sbp));
+		if (error && mp->m_mk_sharedro)
+			xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting.  Filesystem may not be marked shared readonly");
+	}
+	xfs_buf_relse(sbp);
+	return (error);
+}
+
+/*
+ * xfs_mod_sb() can be used to copy arbitrary changes to the
+ * in-core superblock into the superblock buffer to be logged.
+ * It does not provide the higher level of locking that is
+ * needed to protect the in-core superblock from concurrent
+ * access.
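+ *
+ * Typical use within a transaction (see xfs_mount_log_sbunit()
+ * below): update the fields in mp->m_sb, then call
+ * xfs_mod_sb(tp, fields) to copy and log just those fields.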
+ */
+void
+xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
+{
+	xfs_buf_t		*bp;
+	int		first;
+	int		last;
+	xfs_mount_t	*mp;
+	xfs_sb_t	*sbp;
+	xfs_sb_field_t	f;
+
+	ASSERT(fields);
+	if (!fields)
+		return;
+	mp = tp->t_mountp;
+	bp = xfs_trans_getsb(tp, mp, 0);
+	sbp = XFS_BUF_TO_SBP(bp);
+	first = sizeof(xfs_sb_t);
+	last = 0;
+
+	/* translate/copy */
+
+	xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, ARCH_CONVERT, fields);
+
+	/* find modified range */
+
+	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+	first = xfs_sb_info[f].offset;
+
+	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+	last = xfs_sb_info[f + 1].offset - 1;
+
+	xfs_trans_log_buf(tp, bp, first, last);
+}
+
+/*
+ * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
+ * a delta to a specified field in the in-core superblock.  Simply
+ * switch on the field indicated and apply the delta to that field.
+ * Fields are not allowed to dip below zero, so if the delta would
+ * do this do not apply it and return EINVAL.
+ *
+ * The SB_LOCK must be held when this routine is called.
+ */
+STATIC int
+xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
+			  int delta, int rsvd)
+{
+	int		scounter;	/* short counter for 32 bit fields */
+	long long	lcounter;	/* long counter for 64 bit fields */
+	long long res_used, rem;
+
+	/*
+	 * With the in-core superblock spin lock held, switch
+	 * on the indicated field.  Apply the delta to the
+	 * proper field.  If the fields value would dip below
+	 * 0, then do not apply the delta and return EINVAL.
+	 */
+	switch (field) {
+	case XFS_SBS_ICOUNT:
+		lcounter = (long long)mp->m_sb.sb_icount;
+		lcounter += delta;
+		if (lcounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_icount = lcounter;
+		return (0);
+	case XFS_SBS_IFREE:
+		lcounter = (long long)mp->m_sb.sb_ifree;
+		lcounter += delta;
+		if (lcounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_ifree = lcounter;
+		return (0);
+	case XFS_SBS_FDBLOCKS:
+
+		lcounter = (long long)mp->m_sb.sb_fdblocks;
+		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
+
+		if (delta > 0) {		/* Putting blocks back */
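+			/*
+			 * Refill the reserved block pool first; only
+			 * what is left over goes back to the free count.
+			 */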
+			if (res_used > delta) {
+				mp->m_resblks_avail += delta;
+			} else {
+				rem = delta - res_used;
+				mp->m_resblks_avail = mp->m_resblks;
+				lcounter += rem;
+			}
+		} else {				/* Taking blocks away */
+
+			lcounter += delta;
+
+			/*
+			 * If we're out of blocks, use any available
+			 * reserved blocks if we're allowed to.
+			 */
+
+			if (lcounter < 0) {
+				if (rsvd) {
+					lcounter = (long long)mp->m_resblks_avail + delta;
+					if (lcounter < 0) {
+						return (XFS_ERROR(ENOSPC));
+					}
+					mp->m_resblks_avail = lcounter;
+					return (0);
+				} else {	/* not reserved */
+					return (XFS_ERROR(ENOSPC));
+				}
+			}
+		}
+
+		mp->m_sb.sb_fdblocks = lcounter;
+		return (0);
+	case XFS_SBS_FREXTENTS:
+		lcounter = (long long)mp->m_sb.sb_frextents;
+		lcounter += delta;
+		if (lcounter < 0) {
+			return (XFS_ERROR(ENOSPC));
+		}
+		mp->m_sb.sb_frextents = lcounter;
+		return (0);
+	case XFS_SBS_DBLOCKS:
+		lcounter = (long long)mp->m_sb.sb_dblocks;
+		lcounter += delta;
+		if (lcounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_dblocks = lcounter;
+		return (0);
+	case XFS_SBS_AGCOUNT:
+		scounter = mp->m_sb.sb_agcount;
+		scounter += delta;
+		if (scounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_agcount = scounter;
+		return (0);
+	case XFS_SBS_IMAX_PCT:
+		scounter = mp->m_sb.sb_imax_pct;
+		scounter += delta;
+		if (scounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_imax_pct = scounter;
+		return (0);
+	case XFS_SBS_REXTSIZE:
+		scounter = mp->m_sb.sb_rextsize;
+		scounter += delta;
+		if (scounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_rextsize = scounter;
+		return (0);
+	case XFS_SBS_RBMBLOCKS:
+		scounter = mp->m_sb.sb_rbmblocks;
+		scounter += delta;
+		if (scounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_rbmblocks = scounter;
+		return (0);
+	case XFS_SBS_RBLOCKS:
+		lcounter = (long long)mp->m_sb.sb_rblocks;
+		lcounter += delta;
+		if (lcounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_rblocks = lcounter;
+		return (0);
+	case XFS_SBS_REXTENTS:
+		lcounter = (long long)mp->m_sb.sb_rextents;
+		lcounter += delta;
+		if (lcounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_rextents = lcounter;
+		return (0);
+	case XFS_SBS_REXTSLOG:
+		scounter = mp->m_sb.sb_rextslog;
+		scounter += delta;
+		if (scounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_rextslog = scounter;
+		return (0);
+	default:
+		ASSERT(0);
+		return (XFS_ERROR(EINVAL));
+	}
+}
+
+/*
+ * xfs_mod_incore_sb() is used to change a field in the in-core
+ * superblock structure by the specified delta.	 This modification
+ * is protected by the SB_LOCK.	 Just use the xfs_mod_incore_sb_unlocked()
+ * routine to do the work.
+ */
+int
+xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd)
+{
+	unsigned long	s;
+	int	status;
+
+	s = XFS_SB_LOCK(mp);
+	status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+	XFS_SB_UNLOCK(mp, s);
+	return (status);
+}
+
+/*
+ * xfs_mod_incore_sb_batch() is used to change more than one field
+ * in the in-core superblock structure at a time.  This modification
+ * is protected by a lock internal to this module.  The fields and
+ * changes to those fields are specified in the array of xfs_mod_sb
+ * structures passed in.
+ *
+ * Either all of the specified deltas will be applied or none of
+ * them will.  If any modified field dips below 0, then all modifications
+ * will be backed out and EINVAL will be returned.
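+ *
+ * A caller typically fills in an array of deltas, e.g. (with
+ * hypothetical values):
+ *
+ *	xfs_mod_sb_t	msb[2];
+ *
+ *	msb[0].msb_field = XFS_SBS_ICOUNT; msb[0].msb_delta = 64;
+ *	msb[1].msb_field = XFS_SBS_IFREE;  msb[1].msb_delta = 64;
+ *	error = xfs_mod_incore_sb_batch(mp, msb, 2, 0);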
+ */
+int
+xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
+{
+	unsigned long		s;
+	int		status = 0;
+	xfs_mod_sb_t	*msbp;
+
+	/*
+	 * Loop through the array of mod structures and apply each
+	 * individually.  If any fail, then back out all those
+	 * which have already been applied.  Do all of this within
+	 * the scope of the SB_LOCK so that all of the changes will
+	 * be atomic.
+	 */
+	s = XFS_SB_LOCK(mp);
+	for (msbp = &msb[0]; msbp < (msb + nmsb); msbp++) {
+		/*
+		 * Apply the delta at index n.	If it fails, break
+		 * from the loop so we'll fall into the undo loop
+		 * below.
+		 */
+		status = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
+						    msbp->msb_delta, rsvd);
+		if (status != 0) {
+			break;
+		}
+	}
+
+	/*
+	 * If we didn't complete the loop above, then back out
+	 * any changes made to the superblock.	If you add code
+	 * between the loop above and here, make sure that you
+	 * preserve the value of status. Loop back until
+	 * we step below the beginning of the array.  Make sure
+	 * we don't touch anything back there.
+	 */
+	if (status != 0) {
+		msbp--;
+		while (msbp >= msb) {
+			status = xfs_mod_incore_sb_unlocked(mp,
+				    msbp->msb_field, -(msbp->msb_delta), rsvd);
+			ASSERT(status == 0);
+			msbp--;
+		}
+	}
+	XFS_SB_UNLOCK(mp, s);
+	return (status);
+}
+
+/*
+ * xfs_getsb() is called to obtain the buffer for the superblock.
+ * The buffer is returned locked and read in from disk.
+ * The buffer should be released with a call to xfs_buf_relse().
+ *
+ * If the flags parameter is XFS_BUF_TRYLOCK, then we'll only return
+ * the superblock buffer if it can be locked without sleeping.
+ * If it can't then we'll return NULL.
+ */
+xfs_buf_t *
+xfs_getsb(xfs_mount_t	*mp,
+	  int		flags)
+{
+	xfs_buf_t	*bp;
+	ASSERT(mp->m_sb_bp != NULL);
+	bp = mp->m_sb_bp;
+	if (flags & XFS_BUF_TRYLOCK) {
+		if (!XFS_BUF_CPSEMA(bp)) {
+			return NULL;
+		}
+	} else {
+		XFS_BUF_PSEMA(bp, PRIBIO);
+	}
+	XFS_BUF_HOLD(bp);
+	ASSERT(XFS_BUF_ISDONE(bp));
+	return (bp);
+}
+
+/*
+ * Used to free the superblock along various error paths.
+ */
+void
+xfs_freesb(
+	xfs_mount_t	*mp)
+{
+	xfs_buf_t	*bp;
+
+	/*
+	 * Use xfs_getsb() so that the buffer will be locked
+	 * when we strip off the PBF_FS_MANAGED flag and release
+	 * it with xfs_buf_relse().
+	 */
+	bp = xfs_getsb(mp, 0);
+	bp->pb_flags &= ~PBF_FS_MANAGED;
+	xfs_buf_relse(bp);
+	mp->m_sb_bp = NULL;
+}
+
+/*
+ * See if the uuid is unique among mounted xfs filesystems.
+ * Mount fails if UUID is nil or a FS with the same UUID is already
+ * mounted.
+ */
+STATIC int
+xfs_uuid_mount(xfs_mount_t *mp)
+{
+	int	hole;
+	int	i;
+
+	if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
+		cmn_err(CE_WARN, "XFS: Filesystem %s has nil UUID - can't mount",
+			mp->m_fsname);
+		return -1;
+	}
+
+	mutex_lock(&xfs_uuidtabmon, PVFS);
+	for (i = 0, hole = -1; i < xfs_uuidtab_size; i++) {
+		if (uuid_is_nil(&xfs_uuidtab[i])) {
+			hole = i;
+			continue;
+		}
+		if (uuid_equal(&mp->m_sb.sb_uuid, &xfs_uuidtab[i])) {
+			cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount",
+				mp->m_fsname);
+			mutex_unlock(&xfs_uuidtabmon);
+			return -1;
+		}
+	}
+	if (hole < 0) {
+		xfs_uuidtab = kmem_realloc(xfs_uuidtab,
+			(xfs_uuidtab_size + 1) * sizeof(*xfs_uuidtab),
+			xfs_uuidtab_size  * sizeof(*xfs_uuidtab),
+			KM_SLEEP);
+		hole = xfs_uuidtab_size++;
+	}
+	xfs_uuidtab[hole] = mp->m_sb.sb_uuid;
+	mutex_unlock(&xfs_uuidtabmon);
+
+	return 0;
+}
+
+/*
+ * Remove filesystem from the uuid table.
+ */
+STATIC void
+xfs_uuid_unmount(xfs_mount_t *mp)
+{
+	int	i;
+
+	mutex_lock(&xfs_uuidtabmon, PVFS);
+	for (i = 0; i < xfs_uuidtab_size; i++) {
+		if (uuid_is_nil(&xfs_uuidtab[i]))
+			continue;
+		if (!uuid_equal(&mp->m_sb.sb_uuid, &xfs_uuidtab[i]))
+			continue;
+		uuid_create_nil(&xfs_uuidtab[i]);
+		break;
+	}
+	ASSERT(i < xfs_uuidtab_size);
+	mutex_unlock(&xfs_uuidtabmon);
+}
+
+/*
+ * When xfsquotas isn't installed and the superblock had quotas, we need to
+ * clear the quotaflags from the superblock.
+ */
+STATIC void
+xfs_mount_reset_sbqflags(
+	xfs_mount_t	*mp)
+{
+	xfs_trans_t	*tp;
+	unsigned long		s;
+
+	mp->m_qflags = 0;
+	/*
+	 * It is OK to look at sb_qflags here in mount path,
+	 * without SB_LOCK.
+	 */
+	if (mp->m_sb.sb_qflags == 0)
+		return;
+	s = XFS_SB_LOCK(mp);
+	mp->m_sb.sb_qflags = 0;
+	XFS_SB_UNLOCK(mp, s);
+
+	/*
+	 * if the fs is readonly, let the incore superblock run
+	 * with quotas off but don't flush the update out to disk
+	 */
+	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
+		return;
+#ifdef QUOTADEBUG
+	xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
+#endif
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+	if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+				      XFS_DEFAULT_LOG_COUNT)) {
+		xfs_trans_cancel(tp, 0);
+		return;
+	}
+	xfs_mod_sb(tp, XFS_SB_QFLAGS);
+	(void)xfs_trans_commit(tp, 0, NULL);
+}
+
+/*
+ * Used to log changes to the superblock unit and width fields which could
+ * be altered by the mount options. Only the first superblock is updated.
+ */
+STATIC void
+xfs_mount_log_sbunit(
+	xfs_mount_t *mp,
+	__int64_t fields)
+{
+	xfs_trans_t *tp;
+
+	ASSERT(fields & (XFS_SB_UNIT|XFS_SB_WIDTH|XFS_SB_UUID));
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
+	if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+				XFS_DEFAULT_LOG_COUNT)) {
+		xfs_trans_cancel(tp, 0);
+		return;
+	}
+	xfs_mod_sb(tp, fields);
+	(void)xfs_trans_commit(tp, 0, NULL);
+}
+
+/* Functions to lock access out of the filesystem for forced
+ * shutdown or snapshot.
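+ *
+ * xfs_start_freeze() raises m_frozen to the given level (and, for
+ * XFS_FREEZE_TRANS, waits for active transactions to drain);
+ * xfs_check_frozen() blocks callers while the filesystem is frozen
+ * at or above their level; xfs_finish_freeze() thaws the filesystem
+ * and wakes all waiters.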
+ */
+
+void
+xfs_start_freeze(
+	xfs_mount_t	*mp,
+	int		level)
+{
+	unsigned long	s = mutex_spinlock(&mp->m_freeze_lock);
+
+	mp->m_frozen = level;
+	mutex_spinunlock(&mp->m_freeze_lock, s);
+
+	if (level == XFS_FREEZE_TRANS) {
+		while (atomic_read(&mp->m_active_trans) > 0)
+			delay(100);
+	}
+}
+
+void
+xfs_finish_freeze(
+	xfs_mount_t *mp)
+{
+	unsigned long	s = mutex_spinlock(&mp->m_freeze_lock);
+
+	if (mp->m_frozen) {
+		mp->m_frozen = 0;
+		sv_broadcast(&mp->m_wait_unfreeze);
+	}
+
+	mutex_spinunlock(&mp->m_freeze_lock, s);
+}
+
+void
+xfs_check_frozen(
+	xfs_mount_t *mp,
+	bhv_desc_t *bdp,
+	int	level)
+{
+	SPLDECL(s);
+
+	if (mp->m_frozen) {
+		s = mutex_spinlock(&mp->m_freeze_lock);
+
+		if (mp->m_frozen < level) {
+			mutex_spinunlock(&mp->m_freeze_lock, s);
+		} else {
+			sv_wait(&mp->m_wait_unfreeze, 0, &mp->m_freeze_lock, s);
+		}
+	}
+
+	if (level == XFS_FREEZE_TRANS)
+		atomic_inc(&mp->m_active_trans);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_mount.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_mount.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_mount.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_mount.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_MOUNT_H__
+#define __XFS_MOUNT_H__
+
+
+typedef struct xfs_trans_reservations {
+	uint	tr_write;	/* extent alloc trans */
+	uint	tr_itruncate;	/* truncate trans */
+	uint	tr_rename;	/* rename trans */
+	uint	tr_link;	/* link trans */
+	uint	tr_remove;	/* unlink trans */
+	uint	tr_symlink;	/* symlink trans */
+	uint	tr_create;	/* create trans */
+	uint	tr_mkdir;	/* mkdir trans */
+	uint	tr_ifree;	/* inode free trans */
+	uint	tr_ichange;	/* inode update trans */
+	uint	tr_growdata;	/* fs data section grow trans */
+	uint	tr_swrite;	/* sync write inode trans */
+	uint	tr_addafork;	/* cvt inode to attributed trans */
+	uint	tr_writeid;	/* write setuid/setgid file */
+	uint	tr_attrinval;	/* attr fork buffer invalidation */
+	uint	tr_attrset;	/* set/create an attribute */
+	uint	tr_attrrm;	/* remove an attribute */
+	uint	tr_clearagi;	/* clear bad agi unlinked ino bucket */
+	uint	tr_growrtalloc; /* grow realtime allocations */
+	uint	tr_growrtzero;	/* grow realtime zeroing */
+	uint	tr_growrtfree;	/* grow realtime freeing */
+} xfs_trans_reservations_t;
+
+
+#ifndef __KERNEL__
+/*
+ * Moved here from xfs_ag.h to avoid reordering header files
+ */
+#define XFS_DADDR_TO_AGNO(mp,d) \
+	((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
+#define XFS_DADDR_TO_AGBNO(mp,d) \
+	((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
+#else
+struct cred;
+struct vfs;
+struct vnode;
+struct xfs_mount_args;
+struct xfs_ihash;
+struct xfs_chash;
+struct xfs_inode;
+struct xfs_perag;
+struct xfs_quotainfo;
+struct xfs_iocore;
+struct xfs_dio;
+struct xfs_bmbt_irec;
+struct xfs_bmap_free;
+
+#define SPLDECL(s)		unsigned long s
+#define AIL_LOCK_T		lock_t
+#define AIL_LOCKINIT(x,y)	spinlock_init(x,y)
+#define AIL_LOCK_DESTROY(x)	spinlock_destroy(x)
+#define AIL_LOCK(mp,s)		s=mutex_spinlock(&(mp)->m_ail_lock)
+#define AIL_UNLOCK(mp,s)	mutex_spinunlock(&(mp)->m_ail_lock, s)
+
+
+/* Prototypes and functions for I/O core modularization, a vector
+ * of functions is used to indirect from xfs/cxfs independent code
+ * to the xfs/cxfs dependent code.
+ * The vector is placed in the mount structure so that we can
+ * minimize the number of memory indirections involved.
+ */
+
+typedef int		(*xfs_dio_write_t)(struct xfs_dio *);
+typedef int		(*xfs_dio_read_t)(struct xfs_dio *);
+typedef int		(*xfs_strat_write_t)(struct xfs_iocore *, struct xfs_buf *);
+typedef int		(*xfs_bmapi_t)(struct xfs_trans *, void *,
+				xfs_fileoff_t, xfs_filblks_t, int,
+				xfs_fsblock_t *, xfs_extlen_t,
+				struct xfs_bmbt_irec *, int *,
+				struct xfs_bmap_free *);
+typedef int		(*xfs_bmap_eof_t)(void *, xfs_fileoff_t, int, int *);
+typedef int		(*xfs_rsync_t)(void *, int, xfs_off_t, xfs_off_t);
+typedef uint		(*xfs_lck_map_shared_t)(void *);
+typedef void		(*xfs_lock_t)(void *, uint);
+typedef void		(*xfs_lock_demote_t)(void *, uint);
+typedef int		(*xfs_lock_nowait_t)(void *, uint);
+typedef void		(*xfs_unlk_t)(void *, unsigned int);
+typedef void		(*xfs_chgtime_t)(void *, int);
+typedef xfs_fsize_t	(*xfs_size_t)(void *);
+typedef xfs_fsize_t	(*xfs_setsize_t)(void *, xfs_off_t);
+typedef xfs_fsize_t	(*xfs_lastbyte_t)(void *);
+
+typedef struct xfs_ioops {
+	xfs_bmapi_t		xfs_bmapi_func;
+	xfs_bmap_eof_t		xfs_bmap_eof_func;
+	xfs_lock_t		xfs_ilock;
+	xfs_lock_demote_t	xfs_ilock_demote;
+	xfs_lock_nowait_t	xfs_ilock_nowait;
+	xfs_unlk_t		xfs_unlock;
+	xfs_chgtime_t		xfs_chgtime;
+	xfs_size_t		xfs_size_func;
+	xfs_lastbyte_t		xfs_lastbyte;
+} xfs_ioops_t;
+
+
+#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist)	\
+	(*(mp)->m_io_ops.xfs_bmapi_func) \
+		(trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist)
+
+#define XFS_BMAP_EOF(mp, io, endoff, whichfork, eof) \
+	(*(mp)->m_io_ops.xfs_bmap_eof_func) \
+		((io)->io_obj, endoff, whichfork, eof)
+
+#define XFS_ILOCK(mp, io, mode) \
+	(*(mp)->m_io_ops.xfs_ilock)((io)->io_obj, mode)
+
+#define XFS_IUNLOCK(mp, io, mode) \
+	(*(mp)->m_io_ops.xfs_unlock)((io)->io_obj, mode)
+
+#define XFS_ILOCK_DEMOTE(mp, io, mode) \
+	(*(mp)->m_io_ops.xfs_ilock_demote)((io)->io_obj, mode)
+
+#define XFS_SIZE(mp, io) \
+	(*(mp)->m_io_ops.xfs_size_func)((io)->io_obj)
+
+#define XFS_LASTBYTE(mp, io) \
+	(*(mp)->m_io_ops.xfs_lastbyte)((io)->io_obj)
+
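+/*
+ * For example, XFS_SIZE(mp, io) expands to
+ *	(*(mp)->m_io_ops.xfs_size_func)((io)->io_obj)
+ * so callers never need to know whether the object behind io_obj is
+ * a plain xfs inode or a cxfs client-side structure.
+ */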
+
+typedef struct xfs_mount {
+	bhv_desc_t		m_bhv;		/* vfs xfs behavior */
+	xfs_tid_t		m_tid;		/* next unused tid for fs */
+	AIL_LOCK_T		m_ail_lock;	/* fs AIL mutex */
+	xfs_ail_entry_t		m_ail;		/* fs active log item list */
+	uint			m_ail_gen;	/* fs AIL generation count */
+	xfs_sb_t		m_sb;		/* copy of fs superblock */
+	lock_t			m_sb_lock;	/* sb counter mutex */
+	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
+	char			*m_fsname;	/* filesystem name */
+	int			m_fsname_len;	/* strlen of fs name */
+	int			m_bsize;	/* fs logical block size */
+	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
+	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
+	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
+	int			m_ihsize;	/* size of next field */
+	struct xfs_ihash	*m_ihash;	/* fs private inode hash table*/
+	struct xfs_inode	*m_inodes;	/* active inode list */
+	mutex_t			m_ilock;	/* inode list mutex */
+	uint			m_ireclaims;	/* count of calls to reclaim*/
+	uint			m_readio_log;	/* min read size log bytes */
+	uint			m_readio_blocks; /* min read size blocks */
+	uint			m_writeio_log;	/* min write size log bytes */
+	uint			m_writeio_blocks; /* min write size blocks */
+	void			*m_log;		/* log specific stuff */
+	int			m_logbufs;	/* number of log buffers */
+	int			m_logbsize;	/* size of each log buffer */
+	uint			m_rsumlevels;	/* rt summary levels */
+	uint			m_rsumsize;	/* size of rt summary, bytes */
+	struct xfs_inode	*m_rbmip;	/* pointer to bitmap inode */
+	struct xfs_inode	*m_rsumip;	/* pointer to summary inode */
+	struct xfs_inode	*m_rootip;	/* pointer to root directory */
+	struct xfs_quotainfo	*m_quotainfo;	/* disk quota information */
+	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
+	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
+	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
+#define m_dev		m_ddev_targp->pbr_dev
+	__uint8_t		m_dircook_elog; /* log d-cookie entry bits */
+	__uint8_t		m_blkbit_log;	/* blocklog + NBBY */
+	__uint8_t		m_blkbb_log;	/* blocklog - BBSHIFT */
+	__uint8_t		m_agno_log;	/* log #ag's */
+	__uint8_t		m_agino_log;	/* #bits for agino in inum */
+	__uint8_t		m_nreadaheads;	/* #readahead buffers */
+	__uint16_t		m_inode_cluster_size;/* min inode buf size */
+	uint			m_blockmask;	/* sb_blocksize-1 */
+	uint			m_blockwsize;	/* sb_blocksize in words */
+	uint			m_blockwmask;	/* blockwsize-1 */
+	uint			m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */
+	uint			m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */
+	uint			m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */
+	uint			m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */
+	uint			m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */
+	uint			m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */
+	uint			m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
+	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
+	uint			m_in_maxlevels; /* XFS_IN_MAXLEVELS */
+	struct xfs_perag	*m_perag;	/* per-ag accounting info */
+	struct rw_semaphore	m_peraglock;	/* lock for m_perag (pointer) */
+	sema_t			m_growlock;	/* growfs mutex */
+	int			m_fixedfsid[2]; /* unchanged for life of FS */
+	uint			m_dmevmask;	/* DMI events for this FS */
+	uint			m_flags;	/* global mount flags */
+	uint			m_attroffset;	/* inode attribute offset */
+	int			m_da_node_ents; /* how many entries in danode */
+	int			m_ialloc_inos;	/* inodes in inode allocation */
+	int			m_ialloc_blks;	/* blocks in inode allocation */
+	int			m_litino;	/* size of inode union area */
+	int			m_inoalign_mask;/* mask sb_inoalignmt if used */
+	uint			m_qflags;	/* quota status flags */
+	xfs_trans_reservations_t m_reservations;/* precomputed res values */
+	__uint64_t		m_maxicount;	/* maximum inode count */
+	__uint64_t		m_resblks;	/* total reserved blocks */
+	__uint64_t		m_resblks_avail;/* available reserved blocks */
+#if XFS_BIG_FILESYSTEMS
+	xfs_ino_t		m_inoadd;	/* add value for ino64_offset */
+#endif
+	int			m_dalign;	/* stripe unit */
+	int			m_swidth;	/* stripe width */
+	int			m_lstripemask;	/* log stripe mask */
+	int			m_sinoalign;	/* stripe unit inode alignmnt */
+	int			m_attr_magicpct;/* 37% of the blocksize */
+	int			m_dir_magicpct; /* 37% of the dir blocksize */
+	__uint8_t		m_mk_sharedro;	/* mark shared ro on unmount */
+	__uint8_t		m_inode_quiesce;/* call quiesce on new inodes.
+						   field governed by m_ilock */
+	__uint8_t		m_dirversion;	/* 1 or 2 */
+	xfs_dirops_t		m_dirops;	/* table of dir funcs */
+	int			m_dirblksize;	/* directory block sz--bytes */
+	int			m_dirblkfsbs;	/* directory block sz--fsbs */
+	xfs_dablk_t		m_dirdatablk;	/* blockno of dir data v2 */
+	xfs_dablk_t		m_dirleafblk;	/* blockno of dir non-data v2 */
+	xfs_dablk_t		m_dirfreeblk;	/* blockno of dirfreeindex v2 */
+	int			m_chsize;	/* size of next field */
+	struct xfs_chash	*m_chash;	/* fs private inode per-cluster
+						 * hash table */
+	struct xfs_ioops	m_io_ops;	/* vector of I/O ops */
+	struct xfs_expinfo	*m_expinfo;	/* info to export to other
+						   cells. */
+	uint64_t		m_shadow_pinmask;
+						/* which bits matter in rpc
+						   log item pin masks */
+	uint			m_cxfstype;	/* mounted shared, etc. */
+	lock_t			m_freeze_lock;	/* Lock for m_frozen */
+	uint			m_frozen;	/* FS frozen for shutdown or
+						 * snapshot */
+	sv_t			m_wait_unfreeze;/* waiting to unfreeze */
+	atomic_t		m_active_trans; /* number trans frozen */
+	struct timer_list	m_sbdirty_timer;/* superblock dirty timer
+						 * for nfs refcache */
+} xfs_mount_t;
+
+/*
+ * Flags for m_flags.
+ */
+#define XFS_MOUNT_WSYNC		0x00000001	/* for nfs - all metadata ops
+						   must be synchronous except
+						   for space allocations */
+#if XFS_BIG_FILESYSTEMS
+#define XFS_MOUNT_INO64		0x00000002
+#endif
+#define XFS_MOUNT_ROOTQCHECK	0x00000004
+			     /* 0x00000008	-- currently unused */
+#define XFS_MOUNT_FS_SHUTDOWN	0x00000010	/* atomic stop of all filesystem
+						   operations, typically for
+						   disk errors in metadata */
+#define XFS_MOUNT_NOATIME	0x00000020	/* don't modify inode access
+						   times on reads */
+#define XFS_MOUNT_RETERR	0x00000040	/* return alignment errors to
+						   user */
+#define XFS_MOUNT_NOALIGN	0x00000080	/* turn off stripe alignment
+						   allocations */
+			     /* 0x00000100	-- currently unused */
+#define XFS_MOUNT_REGISTERED	0x00000200	/* registered with cxfs master
+						   cell logic */
+#define XFS_MOUNT_NORECOVERY	0x00000400	/* no recovery - dirty fs */
+#define XFS_MOUNT_SHARED	0x00000800	/* shared mount */
+#define XFS_MOUNT_DFLT_IOSIZE	0x00001000	/* set default i/o size */
+#define XFS_MOUNT_OSYNCISOSYNC	0x00002000	/* o_sync is REALLY o_sync */
+						/* osyncisdsync is now default*/
+#define XFS_MOUNT_NOUUID	0x00004000	/* ignore uuid during mount */
+#define XFS_MOUNT_32BITINODES	0x00008000	/* do not create inodes above
+						 * 32 bits in size */
+#define XFS_MOUNT_IRIXSGID	0x00010000	/* Irix-style sgid inheritance */
+#define XFS_MOUNT_NOLOGFLUSH	0x00020000
+
+/*
+ * Flags for m_cxfstype
+ */
+#define XFS_CXFS_NOT		0x00000001	/* local mount */
+#define XFS_CXFS_SERVER		0x00000002	/* we're the CXFS server */
+#define XFS_CXFS_CLIENT		0x00000004	/* We're a CXFS client */
+#define XFS_CXFS_REC_ENABLED	0x00000008	/* recovery is enabled */
+
+#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
+
+/*
+ * Default minimum read and write sizes.
+ */
+#define XFS_READIO_LOG_LARGE	12
+#define XFS_WRITEIO_LOG_LARGE	12
+/*
+ * Default allocation size
+ */
+#define XFS_WRITE_IO_LOG	16
+
+/*
+ * Max and min values for UIO and mount-option defined I/O sizes;
+ * min value can't be less than a page.	 Currently unused.
+ */
+#define XFS_MAX_IO_LOG		16	/* 64K */
+#define XFS_MIN_IO_LOG		PAGE_SHIFT
+
+/*
+ * Synchronous read and write sizes.  This should be
+ * better for NFSv2 wsync filesystems.
+ */
+#define XFS_WSYNC_READIO_LOG	15	/* 32K */
+#define XFS_WSYNC_WRITEIO_LOG	14	/* 16K */
+
+#define xfs_force_shutdown(m,f)	VFS_FORCE_SHUTDOWN(XFS_MTOVFS(m),f)
+/*
+ * Flags sent to xfs_force_shutdown.
+ */
+#define XFS_METADATA_IO_ERROR	0x1
+#define XFS_LOG_IO_ERROR	0x2
+#define XFS_FORCE_UMOUNT	0x4
+#define XFS_CORRUPT_INCORE	0x8	/* corrupt in-memory data structures */
+#define XFS_SHUTDOWN_REMOTE_REQ 0x10	/* shutdown came from remote cell */
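+
+/*
+ * A failed metadata write, for instance, would typically trigger
+ *	xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+ */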
+
+/*
+ * xflags for xfs_syncsub
+ */
+#define XFS_XSYNC_RELOC		0x01
+
+/*
+ * Flags for xfs_mountfs
+ */
+#define XFS_MFSI_SECOND		0x01	/* Is a cxfs secondary mount -- skip */
+					/* stuff which should only be done */
+					/* once. */
+#define XFS_MFSI_CLIENT		0x02	/* Is a client -- skip lots of stuff */
+#define XFS_MFSI_NOUNLINK	0x08	/* Skip unlinked inode processing in */
+					/* log recovery */
+
+/*
+ * Macros for getting from mount to vfs and back.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MTOVFS)
+struct vfs *xfs_mtovfs(xfs_mount_t *mp);
+#define XFS_MTOVFS(mp)		xfs_mtovfs(mp)
+#else
+#define XFS_MTOVFS(mp)		(bhvtovfs(&(mp)->m_bhv))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BHVTOM)
+xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp);
+#define XFS_BHVTOM(bdp) xfs_bhvtom(bdp)
+#else
+#define XFS_BHVTOM(bdp)		((xfs_mount_t *)BHV_PDATA(bdp))
+#endif
+
+
+/*
+ * Moved here from xfs_ag.h to avoid reordering header files
+ */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_AGNO)
+xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d);
+#define XFS_DADDR_TO_AGNO(mp,d)		xfs_daddr_to_agno(mp,d)
+#else
+
+static inline xfs_agnumber_t XFS_DADDR_TO_AGNO(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	d = XFS_BB_TO_FSBT(mp, d);
+	do_div(d, mp->m_sb.sb_agblocks);
+	return (xfs_agnumber_t) d;
+}
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_AGBNO)
+xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d);
+#define XFS_DADDR_TO_AGBNO(mp,d)	xfs_daddr_to_agbno(mp,d)
+#else
+
+static inline xfs_agblock_t XFS_DADDR_TO_AGBNO(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	d = XFS_BB_TO_FSBT(mp, d);
+	return (xfs_agblock_t) do_div(d, mp->m_sb.sb_agblocks);
+}
+
+#endif
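+
+/*
+ * do_div() is used in the inlines above rather than plain '/' and '%'
+ * because xfs_daddr_t is 64 bits wide and 32-bit platforms lack native
+ * 64-by-32 division; do_div(d, divisor) leaves the quotient in d and
+ * returns the remainder, hence XFS_DADDR_TO_AGNO uses d afterwards
+ * while XFS_DADDR_TO_AGBNO uses the return value.
+ */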
+
+/*
+ * This structure is for use by the xfs_mod_incore_sb_batch() routine.
+ */
+typedef struct xfs_mod_sb {
+	xfs_sb_field_t	msb_field;	/* Field to modify, see below */
+	int		msb_delta;	/* change to make to the specified field */
+} xfs_mod_sb_t;
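+
+/*
+ * A batch update might look like this (sketch; the XFS_SBS_* values
+ * are the xfs_sb_field_t enumerators defined with the superblock):
+ *
+ *	xfs_mod_sb_t msb[2] = {
+ *		{ XFS_SBS_FDBLOCKS, -(int)nblks },
+ *		{ XFS_SBS_IFREE,    -1 },
+ *	};
+ *	error = xfs_mod_incore_sb_batch(mp, msb, 2, 0);
+ */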
+
+#define XFS_MOUNT_ILOCK(mp)	mutex_lock(&((mp)->m_ilock), PINOD)
+#define XFS_MOUNT_IUNLOCK(mp)	mutex_unlock(&((mp)->m_ilock))
+#define XFS_SB_LOCK(mp)		mutex_spinlock(&(mp)->m_sb_lock)
+#define XFS_SB_UNLOCK(mp,s)	mutex_spinunlock(&(mp)->m_sb_lock,(s))
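+
+/*
+ * Typical usage of the superblock lock (see e.g. xfs_qm_mount_quotas):
+ *
+ *	s = XFS_SB_LOCK(mp);
+ *	mp->m_sb.sb_qflags = ...;
+ *	XFS_SB_UNLOCK(mp, s);
+ */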
+
+void		xfs_mod_sb(xfs_trans_t *, __int64_t);
+xfs_mount_t	*xfs_mount_init(void);
+void		xfs_mount_free(xfs_mount_t *mp, int remove_bhv);
+int		xfs_mountfs(struct vfs *, xfs_mount_t *mp, dev_t, int);
+
+int		xfs_unmountfs(xfs_mount_t *, struct cred *);
+void		xfs_unmountfs_close(xfs_mount_t *, struct cred *);
+int		xfs_unmountfs_writesb(xfs_mount_t *);
+int		xfs_unmount_flush(xfs_mount_t *, int);
+int		xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int, int);
+int		xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int);
+int		xfs_readsb(xfs_mount_t *mp);
+struct xfs_buf	*xfs_getsb(xfs_mount_t *, int);
+void		xfs_freesb(xfs_mount_t *);
+void		xfs_do_force_shutdown(bhv_desc_t *, int, char *, int);
+int		xfs_syncsub(xfs_mount_t *, int, int, int *);
+void		xfs_initialize_perag(xfs_mount_t *, int);
+void		xfs_xlatesb(void *, struct xfs_sb *, int, xfs_arch_t, __int64_t);
+
+/*
+ * Flags for freeze operations.
+ */
+#define XFS_FREEZE_WRITE	1
+#define XFS_FREEZE_TRANS	2
+
+void		xfs_start_freeze(xfs_mount_t *, int);
+void		xfs_finish_freeze(xfs_mount_t *);
+void		xfs_check_frozen(xfs_mount_t *, bhv_desc_t *, int);
+
+extern	struct vfsops xfs_vfsops;
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_MOUNT_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_qm.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_qm.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_qm.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_qm.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,2922 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_quota_priv.h>
+
+
+kmem_zone_t	*qm_dqzone;
+kmem_zone_t	*qm_dqtrxzone;
+
+STATIC void	xfs_qm_list_init(xfs_dqlist_t *, char *, int);
+STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
+STATIC int	xfs_qm_quotacheck(xfs_mount_t *);
+
+STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
+STATIC void	xfs_qm_shake(void);
+
+#ifdef DEBUG
+extern mutex_t	qcheck_lock;
+#endif
+
+#ifdef QUOTADEBUG
+#define XQM_LIST_PRINT(l, NXT, title) \
+{ \
+	  xfs_dquot_t	*dqp; int i = 0;\
+	  printk("%s (#%d)\n", title, (int) (l)->qh_nelems); \
+	  for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \
+	    printk("\t%d.\t\"%d (%s)\"\t bcnt = %d, icnt = %d refs = %d\n", \
+			 ++i, (int) INT_GET(dqp->q_core.d_id, ARCH_CONVERT), \
+			 DQFLAGTO_TYPESTR(dqp),	     \
+			 (int) INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT), \
+			 (int) INT_GET(dqp->q_core.d_icount, ARCH_CONVERT), \
+			 (int) dqp->q_nrefs);  } \
+}
+#endif
+
+/*
+ * Initialize the XQM structure.
+ * Note that there is only one quota manager for the entire system,
+ * not one per file system.
+ */
+struct xfs_qm *
+xfs_qm_init(void)
+{
+	xfs_qm_t		*xqm;
+	int			hsize, i;
+
+	xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
+	ASSERT(xqm);
+
+	/*
+	 * Initialize the dquot hash tables.
+	 */
+	hsize = (DQUOT_HASH_HEURISTIC < XFS_QM_NCSIZE_THRESHOLD) ?
+		XFS_QM_HASHSIZE_LOW : XFS_QM_HASHSIZE_HIGH;
+	xqm->qm_dqhashmask = hsize - 1;
+
+	/*
+	 * XXXsup We could keep reference counts on usr and grp quotas
+	 * inside XQM separately, and avoid having two hashtables even
+	 * when only one 'type' is active in the system.
+	 */
+	xqm->qm_usr_dqhtable = (xfs_dqhash_t *)kmem_zalloc(hsize *
+						      sizeof(xfs_dqhash_t),
+						      KM_SLEEP);
+	xqm->qm_grp_dqhtable = (xfs_dqhash_t *)kmem_zalloc(hsize *
+						      sizeof(xfs_dqhash_t),
+						      KM_SLEEP);
+	ASSERT(xqm->qm_usr_dqhtable != NULL);
+	ASSERT(xqm->qm_grp_dqhtable != NULL);
+
+	for (i = 0; i < hsize; i++) {
+		xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
+		xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
+	}
+
+	/*
+	 * Freelist of all dquots of all file systems
+	 */
+	xfs_qm_freelist_init(&(xqm->qm_dqfreelist));
+
+	/*
+	 * The dquot zone.  We register our own low-memory callback.
+	 */
+	if (!qm_dqzone) {
+		xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
+						"xfs_dquots");
+		qm_dqzone = xqm->qm_dqzone;
+	} else
+		xqm->qm_dqzone = qm_dqzone;
+
+	kmem_shake_register(xfs_qm_shake);
+
+	/*
+	 * The t_dqinfo portion of transactions.
+	 */
+	if (!qm_dqtrxzone) {
+		xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
+						   "xfs_dqtrx");
+		qm_dqtrxzone = xqm->qm_dqtrxzone;
+	} else
+		xqm->qm_dqtrxzone = qm_dqtrxzone;
+
+	atomic_set(&xqm->qm_totaldquots, 0);
+	xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
+	xqm->qm_nrefs = 0;
+#ifdef DEBUG
+	mutex_init(&qcheck_lock, MUTEX_DEFAULT, "qchk");
+#endif
+	return (xqm);
+}
+
+/*
+ * Destroy the global quota manager when its reference count goes to zero.
+ */
+void
+xfs_qm_destroy(
+	struct xfs_qm *xqm)
+{
+	int	hsize, i;
+
+	ASSERT(xqm != NULL);
+	ASSERT(xqm->qm_nrefs == 0);
+	kmem_shake_deregister(xfs_qm_shake);
+	hsize = xqm->qm_dqhashmask + 1;
+	for (i = 0; i < hsize; i++) {
+		xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
+		xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
+	}
+	kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t));
+	kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t));
+	xqm->qm_usr_dqhtable = NULL;
+	xqm->qm_grp_dqhtable = NULL;
+	xqm->qm_dqhashmask = 0;
+	xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist));
+#ifdef DEBUG
+	mutex_destroy(&qcheck_lock);
+#endif
+	kmem_free(xqm, sizeof(xfs_qm_t));
+}
+
+/*
+ * Called at mount time to let XQM know that another file system is
+ * starting quotas. This isn't crucial information as the individual mount
+ * structures are pretty independent, but it helps the XQM keep a
+ * global view of what's going on.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_hold_quotafs_ref(
+	struct xfs_mount *mp)
+{
+	/*
+	 * Need to lock the xfs_Gqm structure for things like this. For example,
+	 * the structure could disappear between the entry to this routine and
+	 * a HOLD operation if not locked.
+	 */
+	XFS_QM_LOCK(xfs_Gqm);
+
+	if (xfs_Gqm == NULL) {
+		if ((xfs_Gqm = xfs_qm_init()) == NULL) {
+			return (XFS_ERROR(EINVAL));
+		}
+	}
+	/*
+	 * We can keep a list of all filesystems with quotas mounted for
+	 * debugging and statistical purposes, but ...
+	 * Just take a reference and get out.
+	 */
+	XFS_QM_HOLD(xfs_Gqm);
+	XFS_QM_UNLOCK(xfs_Gqm);
+
+	return 0;
+}
+
+
+/*
+ * Release the reference that a filesystem took at mount time,
+ * so that we know when we need to destroy the entire quota manager.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_qm_rele_quotafs_ref(
+	struct xfs_mount *mp)
+{
+	xfs_dquot_t	*dqp, *nextdqp;
+
+	ASSERT(xfs_Gqm);
+	ASSERT(xfs_Gqm->qm_nrefs > 0);
+
+	/*
+	 * Go through the freelist and destroy all inactive dquots.
+	 */
+	xfs_qm_freelist_lock(xfs_Gqm);
+
+	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
+	     dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
+		xfs_dqlock(dqp);
+		nextdqp = dqp->dq_flnext;
+		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
+			ASSERT(dqp->q_mount == NULL);
+			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
+			ASSERT(dqp->HL_PREVP == NULL);
+			ASSERT(dqp->MPL_PREVP == NULL);
+			XQM_FREELIST_REMOVE(dqp);
+			xfs_dqunlock(dqp);
+			xfs_qm_dqdestroy(dqp);
+		} else {
+			xfs_dqunlock(dqp);
+		}
+		dqp = nextdqp;
+	}
+	xfs_qm_freelist_unlock(xfs_Gqm);
+
+	/*
+	 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
+	 * be restarted.
+	 */
+	XFS_QM_LOCK(xfs_Gqm);
+	XFS_QM_RELE(xfs_Gqm);
+	if (xfs_Gqm->qm_nrefs == 0) {
+		xfs_qm_destroy(xfs_Gqm);
+		xfs_Gqm = NULL;
+	}
+	XFS_QM_UNLOCK(xfs_Gqm);
+}
+
+/*
+ * This is called at mount time from xfs_mountfs to initialize the quotainfo
+ * structure and start the global quotamanager (xfs_Gqm) if it hasn't done
+ * so already.	Note that the superblock has not been read in yet.
+ */
+void
+xfs_qm_mount_quotainit(
+	xfs_mount_t	*mp,
+	uint		flags)
+{
+	/*
+	 * User or group quotas have to be on, or we must have the
+	 * QUOTAMAYBE flag set.
+	 */
+	ASSERT(flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA | XFSMNT_QUOTAMAYBE));
+
+	/*
+	 * Initialize the flags in the mount structure. From this point
+	 * onwards we look at m_qflags to figure out whether quotas are
+	 * on or off, etc.  Note that we enforce nothing if accounting is
+	 * off; i.e. XFSMNT_*QUOTA must be on for XFSMNT_*QUOTAENF to matter.
+	 * It isn't necessary to take the quotaoff lock to do this; this is
+	 * called from mount.
+	 */
+	if (flags & XFSMNT_UQUOTA) {
+		mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+		if (flags & XFSMNT_UQUOTAENF)
+			mp->m_qflags |= XFS_UQUOTA_ENFD;
+	}
+	if (flags & XFSMNT_GQUOTA) {
+		mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+		if (flags & XFSMNT_GQUOTAENF)
+			mp->m_qflags |= XFS_GQUOTA_ENFD;
+	}
+	/*
+	 * Typically, we turn quotas off unless we were explicitly asked to
+	 * mount with quotas; XFSMNT_QUOTAMAYBE is the mount option that
+	 * suppresses that. It is handy in the miniroot, when trying to
+	 * mount /root: we can't really know what's in /etc/fstab until
+	 * /root is already mounted! This stops quotas getting turned off
+	 * in the root filesystem every time the system boots up a miniroot.
+	 */
+	if (flags & XFSMNT_QUOTAMAYBE)
+		mp->m_qflags |= XFS_QUOTA_MAYBE;
+}
+
+/*
+ * Just destroy the quotainfo structure.
+ */
+void
+xfs_qm_unmount_quotadestroy(
+	xfs_mount_t	*mp)
+{
+	xfs_qm_destroy_quotainfo(mp);
+}
+
+
+/*
+ * This is called from xfs_mountfs to start quotas and initialize all
+ * necessary data structures like quotainfo, and in the rootfs's case
+ * xfs_Gqm. This is also responsible for running a quotacheck as necessary.
+ * We are guaranteed that the superblock is consistently read in at this
+ * point.
+ */
+int
+xfs_qm_mount_quotas(
+	xfs_mount_t	*mp)
+{
+	unsigned long		s;
+	int		error;
+	uint		sbf;
+
+	error = 0;
+	/*
+	 * If a non-root file system had quotas running earlier, but was
+	 * mounted without the -o quota/pquota options, revoke the
+	 * quotachecked license and bail out, unless, of course, the
+	 * XFS_QUOTA_MAYBE flag is specified.
+	 */
+	if (mp->m_qflags & XFS_QUOTA_MAYBE) {
+		mp->m_qflags &= ~(XFS_QUOTA_MAYBE);
+		if (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) {
+			mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_MOUNT_QUOTA_ALL);
+			mp->m_qflags |= XFS_UQUOTA_ACTIVE;
+		}
+		if (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) {
+			mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_MOUNT_QUOTA_ALL);
+			mp->m_qflags |= XFS_GQUOTA_ACTIVE;
+		}
+	}
+	if (! XFS_IS_QUOTA_ON(mp) &&
+	    (mp->m_dev != rootdev) &&
+	    (mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT))) {
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+
+	/*
+	 * Quotas on realtime volumes are not supported, so we disable
+	 * them immediately.
+	 */
+	if (mp->m_sb.sb_rextents) {
+		cmn_err(CE_NOTE,
+			"Cannot turn on quotas for realtime filesystem %s",
+			mp->m_fsname);
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+	cmn_err(CE_NOTE, "Attempting to turn on disk quotas.");
+#endif
+	/*
+	 * If this is the root file system, mark flags in mount struct first.
+	 * We couldn't do this earlier because we didn't have the superblock
+	 * read in.
+	 */
+	if (mp->m_dev == rootdev) {
+		ASSERT(XFS_SB_VERSION_HASQUOTA(&mp->m_sb));
+		ASSERT(mp->m_sb.sb_qflags &
+		       (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT));
+		if (xfs_Gqm == NULL) {
+			if ((xfs_Gqm = xfs_qm_init()) == NULL) {
+				mp->m_qflags = 0;
+				error = EINVAL;
+				goto write_changes;
+			}
+		}
+		mp->m_qflags = mp->m_sb.sb_qflags;
+		if (mp->m_qflags & XFS_UQUOTA_ACCT)
+			mp->m_qflags |= XFS_UQUOTA_ACTIVE;
+		if (mp->m_qflags & XFS_GQUOTA_ACCT)
+			mp->m_qflags |= XFS_GQUOTA_ACTIVE;
+		/*
+		 * The quotainode of the root file system may or may not
+		 * exist at this point.
+		 */
+	}
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+	/*
+	 * Allocate the quotainfo structure inside the mount struct, and
+	 * create quotainode(s), and change/rev superblock if necessary.
+	 */
+	if ((error = xfs_qm_init_quotainfo(mp))) {
+		/*
+		 * We must turn off quotas.
+		 */
+		ASSERT(mp->m_quotainfo == NULL);
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+	/*
+	 * If any of the quotas are not consistent, do a quotacheck.
+	 */
+	if (XFS_QM_NEED_QUOTACHECK(mp)) {
+#ifdef DEBUG
+		cmn_err(CE_NOTE, "Doing a quotacheck. Please wait.");
+#endif
+		if ((error = xfs_qm_quotacheck(mp))) {
+			cmn_err(CE_WARN, "Quotacheck unsuccessful (Error %d): "
+				"Disabling quotas.",
+				error);
+			/*
+			 * We must turn off quotas.
+			 */
+			ASSERT(mp->m_quotainfo != NULL);
+			ASSERT(xfs_Gqm != NULL);
+			xfs_qm_destroy_quotainfo(mp);
+			mp->m_qflags = 0;
+			goto write_changes;
+		}
+#ifdef DEBUG
+		cmn_err(CE_NOTE, "Done quotacheck.");
+#endif
+	}
+ write_changes:
+	/*
+	 * We actually don't have to acquire the SB_LOCK at all.
+	 * This can only be called from mount, and that's single threaded. XXX
+	 */
+	s = XFS_SB_LOCK(mp);
+	sbf = mp->m_sb.sb_qflags;
+	mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
+	XFS_SB_UNLOCK(mp, s);
+
+	if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
+		if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
+			/*
+			 * We could only have been turning quotas off.
+			 * We aren't in very good shape actually because
+			 * the incore structures are convinced that quotas are
+			 * off, but the on disk superblock doesn't know that !
+			 */
+			ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"XFS mount_quotas: Superblock update failed!");
+		}
+	}
+
+	if (error) {
+		xfs_fs_cmn_err(CE_WARN, mp,
+			"Failed to initialize disk quotas.");
+	}
+	return XFS_ERROR(error);
+}
+
+/*
+ * Called from the vfsops layer.
+ */
+int
+xfs_qm_unmount_quotas(
+	xfs_mount_t	*mp)
+{
+	xfs_inode_t	*uqp, *gqp;
+	int		error;
+
+	error = 0;
+
+	/*
+	 * Release the dquots that the root inode, et al. might be holding,
+	 * before we flush quotas and blow away the quotainfo structure.
+	 */
+	ASSERT(mp->m_rootip);
+	if (mp->m_rootip->i_udquot || mp->m_rootip->i_gdquot)
+		xfs_qm_dqdettach_inode(mp->m_rootip);
+	if (mp->m_rbmip &&
+	    (mp->m_rbmip->i_udquot || mp->m_rbmip->i_gdquot))
+		xfs_qm_dqdettach_inode(mp->m_rbmip);
+	if (mp->m_rsumip &&
+	    (mp->m_rsumip->i_udquot || mp->m_rsumip->i_gdquot))
+		xfs_qm_dqdettach_inode(mp->m_rsumip);
+
+	/*
+	 * Flush out the quota inodes.
+	 */
+	uqp = gqp = NULL;
+	if (mp->m_quotainfo) {
+		if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) {
+			xfs_ilock(uqp, XFS_ILOCK_EXCL);
+			xfs_iflock(uqp);
+			error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
+			xfs_iunlock(uqp, XFS_ILOCK_EXCL);
+			if (error == EFSCORRUPTED)
+				goto out;
+		}
+		if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) {
+			xfs_ilock(gqp, XFS_ILOCK_EXCL);
+			xfs_iflock(gqp);
+			error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
+			xfs_iunlock(gqp, XFS_ILOCK_EXCL);
+			if (error == EFSCORRUPTED)
+				goto out;
+		}
+	}
+	if (uqp) {
+		 XFS_PURGE_INODE(uqp);
+		 mp->m_quotainfo->qi_uquotaip = NULL;
+	}
+	if (gqp) {
+		XFS_PURGE_INODE(gqp);
+		mp->m_quotainfo->qi_gquotaip = NULL;
+	}
+out:
+	return XFS_ERROR(error);
+}
+
+/*
+ * Flush all dquots of the given file system to disk. The dquots are
+ * _not_ purged from memory here, just their data written to disk.
+ */
+int
+xfs_qm_dqflush_all(
+	xfs_mount_t	*mp,
+	int		flags)
+{
+	int		recl;
+	xfs_dquot_t	*dqp;
+	int		niters;
+	int		error;
+
+	if (mp->m_quotainfo == NULL)
+		return (0);
+	niters = 0;
+again:
+	xfs_qm_mplist_lock(mp);
+	FOREACH_DQUOT_IN_MP(dqp, mp) {
+		xfs_dqlock(dqp);
+		if (! XFS_DQ_IS_DIRTY(dqp)) {
+			xfs_dqunlock(dqp);
+			continue;
+		}
+		xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
+		/* XXX a sentinel would be better */
+		recl = XFS_QI_MPLRECLAIMS(mp);
+		if (! xfs_qm_dqflock_nowait(dqp)) {
+			/*
+			 * If we can't grab the flush lock then check
+			 * to see if the dquot has been flushed delayed
+			 * write.  If so, grab its buffer and send it
+			 * out immediately.  We'll be able to acquire
+			 * the flush lock when the I/O completes.
+			 */
+			xfs_qm_dqflock_pushbuf_wait(dqp);
+		}
+		/*
+		 * Let go of the mplist lock. We don't want to hold it
+		 * across a disk write.
+		 */
+		xfs_qm_mplist_unlock(mp);
+		error = xfs_qm_dqflush(dqp, flags);
+		xfs_dqunlock(dqp);
+		if (error)
+			return (error);
+
+		/*
+		 * If this is the root filesystem doing a quotacheck,
+		 * we should do periodic bflushes. This is because there's
+		 * no bflushd at this point.
+		 */
+		if (mp->m_flags & XFS_MOUNT_ROOTQCHECK) {
+			if (++niters == XFS_QM_MAX_DQCLUSTER_LOGSZ) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE | XFS_LOG_SYNC);
+				XFS_bflush(mp->m_ddev_targp);
+				niters = 0;
+			}
+		}
+
+		xfs_qm_mplist_lock(mp);
+		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
+			xfs_qm_mplist_unlock(mp);
+			/* XXX restart limit */
+			goto again;
+		}
+	}
+
+	xfs_qm_mplist_unlock(mp);
+	/* return ! busy */
+	return (0);
+}
+/*
+ * Release the group dquot pointers the user dquots may be
+ * carrying around as a hint. mplist is locked on entry and exit.
+ */
+STATIC void
+xfs_qm_detach_gdquots(
+	xfs_mount_t	*mp)
+{
+	xfs_dquot_t	*dqp, *gdqp;
+	int		nrecl;
+
+ again:
+	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	dqp = XFS_QI_MPLNEXT(mp);
+	while (dqp) {
+		xfs_dqlock(dqp);
+		if ((gdqp = dqp->q_gdquot)) {
+			xfs_dqlock(gdqp);
+			dqp->q_gdquot = NULL;
+		}
+		xfs_dqunlock(dqp);
+
+		if (gdqp) {
+			/*
+			 * Can't hold the mplist lock across a dqput.
+			 * XXX must convert to marker-based iteration here.
+			 */
+			nrecl = XFS_QI_MPLRECLAIMS(mp);
+			xfs_qm_mplist_unlock(mp);
+			xfs_qm_dqput(gdqp);
+
+			xfs_qm_mplist_lock(mp);
+			if (nrecl != XFS_QI_MPLRECLAIMS(mp))
+				goto again;
+		}
+		dqp = dqp->MPL_NEXT;
+	}
+}
+
+/*
+ * Go through all the incore dquots of this file system and take them
+ * off the mplist and hashlist, if the dquot type matches the dqtype
+ * parameter. This is used when turning off quota accounting for
+ * users and/or groups, as well as when the filesystem is unmounting.
+ */
+int
+xfs_qm_dqpurge_all(
+		   xfs_mount_t	*mp,
+		   uint		flags) /* QUOTAOFF/UMOUNTING/UQUOTA/GQUOTA */
+{
+	xfs_dquot_t	*dqp;
+	uint		dqtype;
+	int		nrecl;
+	xfs_dquot_t	*nextdqp;
+	int		nmisses;
+
+	if (mp->m_quotainfo == NULL)
+		return (0);
+
+	dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
+	dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
+
+	xfs_qm_mplist_lock(mp);
+
+	/*
+	 * In the first pass through all incore dquots of this filesystem,
+	 * we release the group dquot pointers the user dquots may be
+	 * carrying around as a hint. We need to do this irrespective of
+	 * what's being turned off.
+	 */
+	xfs_qm_detach_gdquots(mp);
+
+      again:
+	nmisses = 0;
+	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	/*
+	 * Try to get rid of all of the unwanted dquots. The idea is to
+	 * get them off mplist and hashlist, but leave them on freelist.
+	 */
+	dqp = XFS_QI_MPLNEXT(mp);
+	while (dqp) {
+		/*
+		 * It's OK to look at the type without taking dqlock here.
+		 * We're holding the mplist lock here, and that's needed for
+		 * a dqreclaim.
+		 */
+		if ((dqp->dq_flags & dqtype) == 0) {
+			dqp = dqp->MPL_NEXT;
+			continue;
+		}
+
+		if (! xfs_qm_dqhashlock_nowait(dqp)) {
+			nrecl = XFS_QI_MPLRECLAIMS(mp);
+			xfs_qm_mplist_unlock(mp);
+			XFS_DQ_HASH_LOCK(dqp->q_hash);
+			xfs_qm_mplist_lock(mp);
+
+			/*
+			 * XXX Theoretically, we can get into a very long
+			 * ping-pong game here.
+			 * No one can be adding dquots to the mplist at
+			 * this point, but somebody might be taking things off.
+			 */
+			if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
+				XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+				goto again;
+			}
+		}
+
+		/*
+		 * Take the dquot off the mplist and hashlist. It may remain on
+		 * freelist in INACTIVE state.
+		 */
+		nextdqp = dqp->MPL_NEXT;
+		nmisses += xfs_qm_dqpurge(dqp, flags);
+		dqp = nextdqp;
+	}
+	xfs_qm_mplist_unlock(mp);
+	return (nmisses);
+}
+
+STATIC int
+xfs_qm_dqattach_one(
+	xfs_inode_t	*ip,
+	xfs_dqid_t	id,
+	uint		type,
+	uint		doalloc,
+	uint		dolock,
+	xfs_dquot_t	*udqhint, /* hint */
+	xfs_dquot_t	**IO_idqpp)
+{
+	xfs_dquot_t	*dqp;
+	int		error;
+
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	error = 0;
+	/*
+	 * See if we already have it in the inode itself. IO_idqpp is
+	 * &i_udquot or &i_gdquot. This made the code look weird, but
+	 * made the logic a lot simpler.
+	 */
+	if ((dqp = *IO_idqpp)) {
+		if (dolock)
+			xfs_dqlock(dqp);
+		xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
+		goto done;
+	}
+
+	/*
+	 * udqhint is the i_udquot field in inode, and is non-NULL only
+	 * when the type arg is XFS_DQ_GROUP. Its purpose is to save a
+	 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
+	 * the user dquot.
+	 */
+	ASSERT(!udqhint || type == XFS_DQ_GROUP);
+	if (udqhint && !dolock)
+		xfs_dqlock(udqhint);
+
+	/*
+	 * No need to take dqlock to look at the id.
+	 * The ID can't change until it gets reclaimed, and it won't
+	 * be reclaimed as long as we have a ref from inode and we hold
+	 * the ilock.
+	 */
+	if (udqhint &&
+	    (dqp = udqhint->q_gdquot) &&
+	    (INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id)) {
+		ASSERT(XFS_DQ_IS_LOCKED(udqhint));
+		xfs_dqlock(dqp);
+		XFS_DQHOLD(dqp);
+		ASSERT(*IO_idqpp == NULL);
+		*IO_idqpp = dqp;
+		if (!dolock) {
+			xfs_dqunlock(dqp);
+			xfs_dqunlock(udqhint);
+		}
+		/* XXX XFS_STATS */
+		goto done;
+	}
+	/*
+	 * We can't hold a dquot lock when we call the dqget code.
+	 * We'll deadlock in no time by violating the lock ordering: the
+	 * inode lock comes before any dquot lock,
+	 * and we may drop and reacquire the ilock in xfs_qm_dqget().
+	 */
+	if (udqhint)
+		xfs_dqunlock(udqhint);
+	/*
+	 * Find the dquot from somewhere. This bumps the
+	 * reference count of dquot and returns it locked.
+	 * This can return ENOENT if dquot didn't exist on
+	 * disk and we didn't ask it to allocate;
+	 * ESRCH if quotas got turned off suddenly.
+	 */
+	if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type,
+				 doalloc|XFS_QMOPT_DOWARN, &dqp))) {
+		if (udqhint && dolock)
+			xfs_dqlock(udqhint);
+		goto done;
+	}
+
+	xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
+	/*
+	 * dqget may have dropped and re-acquired the ilock, but it guarantees
+	 * that the dquot returned is the one that should go in the inode.
+	 */
+	*IO_idqpp = dqp;
+	ASSERT(dqp);
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	if (! dolock) {
+		xfs_dqunlock(dqp);
+		ASSERT(!udqhint || !XFS_DQ_IS_LOCKED(udqhint));
+		goto done;
+	}
+	if (! udqhint)
+		goto done;
+
+	ASSERT(udqhint);
+	ASSERT(dolock);
+	ASSERT(! XFS_DQ_IS_LOCKED(udqhint));
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	if (! xfs_qm_dqlock_nowait(udqhint)) {
+		xfs_dqunlock(dqp);
+		xfs_dqlock(udqhint);
+		xfs_dqlock(dqp);
+	}
+      done:
+#ifdef QUOTADEBUG
+	if (udqhint) {
+		if (dolock)
+			ASSERT(XFS_DQ_IS_LOCKED(udqhint));
+		else
+			ASSERT(! XFS_DQ_IS_LOCKED(udqhint));
+	}
+	if (! error) {
+		if (dolock)
+			ASSERT(XFS_DQ_IS_LOCKED(dqp));
+		else
+			ASSERT(! XFS_DQ_IS_LOCKED(dqp));
+	}
+#endif
+	return (error);
+}
+
+
+/*
+ * Given a udquot and gdquot, attach a ptr to the group dquot in the
+ * udquot as a hint for future lookups. The idea sounds simple, but the
+ * execution isn't, because the udquot might have a group dquot attached
+ * already, and getting rid of that gets us into lock ordering constraints.
+ * The process is complicated more by the fact that the dquots may or may not
+ * be locked on entry.
+ */
+STATIC void
+xfs_qm_dqattach_grouphint(
+	xfs_dquot_t	*udq,
+	xfs_dquot_t	*gdq,
+	uint		locked)
+{
+	xfs_dquot_t	*tmp;
+
+#ifdef QUOTADEBUG
+	if (locked) {
+		ASSERT(XFS_DQ_IS_LOCKED(udq));
+		ASSERT(XFS_DQ_IS_LOCKED(gdq));
+	} else {
+		ASSERT(! XFS_DQ_IS_LOCKED(udq));
+		ASSERT(! XFS_DQ_IS_LOCKED(gdq));
+	}
+#endif
+	if (! locked)
+		xfs_dqlock(udq);
+
+	if ((tmp = udq->q_gdquot)) {
+		if (tmp == gdq) {
+			if (! locked)
+				xfs_dqunlock(udq);
+			return;
+		}
+
+		udq->q_gdquot = NULL;
+		/*
+		 * We can't keep any dqlocks when calling dqrele,
+		 * because the freelist lock comes before dqlocks.
+		 */
+		xfs_dqunlock(udq);
+		if (locked)
+			xfs_dqunlock(gdq);
+		/*
+		 * We took a hard reference once upon a time in dqget,
+		 * so give it back when the udquot no longer points at it.
+		 * dqput() does the unlocking of the dquot.
+		 */
+		xfs_qm_dqrele(tmp);
+
+		ASSERT(! XFS_DQ_IS_LOCKED(udq));
+		ASSERT(! XFS_DQ_IS_LOCKED(gdq));
+		xfs_dqlock(udq);
+		xfs_dqlock(gdq);
+
+	} else {
+		ASSERT(XFS_DQ_IS_LOCKED(udq));
+		if (! locked) {
+			ASSERT(! XFS_DQ_IS_LOCKED(gdq));
+			xfs_dqlock(gdq);
+		}
+	}
+
+	ASSERT(XFS_DQ_IS_LOCKED(udq));
+	ASSERT(XFS_DQ_IS_LOCKED(gdq));
+	/*
+	 * Somebody could have attached a gdquot here,
+	 * when we dropped the uqlock. If so, just do nothing.
+	 */
+	if (udq->q_gdquot == NULL) {
+		XFS_DQHOLD(gdq);
+		udq->q_gdquot = gdq;
+	}
+	if (! locked) {
+		xfs_dqunlock(gdq);
+		xfs_dqunlock(udq);
+	}
+}
+
+
+/*
+ * Given a locked inode, attach dquot(s) to it, taking UQUOTAON / GQUOTAON
+ * into account.
+ * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
+ * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
+ * much made this code a complete mess, but it has been quite useful.
+ * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
+ * Inode may get unlocked and relocked in here, and the caller must deal with
+ * the consequences.
+ */
+int
+xfs_qm_dqattach(
+		xfs_inode_t	*ip,
+		uint		flags)
+{
+	int		error;
+	xfs_mount_t	*mp;
+	uint		nquotas;
+
+	mp = ip->i_mount;
+	ASSERT(ip->i_ino != mp->m_sb.sb_uquotino &&
+	       ip->i_ino != mp->m_sb.sb_gquotino);
+
+	ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 ||
+	       XFS_ISLOCKED_INODE_EXCL(ip));
+
+	nquotas = 0;
+	error = 0;
+	if (! (flags & XFS_QMOPT_ILOCKED))
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		if ((error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
+						flags & XFS_QMOPT_DQALLOC,
+						flags & XFS_QMOPT_DQLOCK,
+						NULL, &ip->i_udquot)))
+			goto done;
+		nquotas++;
+	}
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		if ((error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
+						flags & XFS_QMOPT_DQALLOC,
+						flags & XFS_QMOPT_DQLOCK,
+						ip->i_udquot, &ip->i_gdquot)))
+			/*
+			 * Don't worry about the udquot that we may have
+			 * attached above. It'll get detached, if not already.
+			 */
+			goto done;
+		nquotas++;
+	}
+
+	/*
+	 * Attach this group quota to the user quota as a hint.
+	 * This WON'T, in general, result in a thrash.
+	 */
+	if (nquotas == 2) {
+		ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+		ASSERT(ip->i_udquot);
+		ASSERT(ip->i_gdquot);
+
+		/*
+		 * We may or may not have the i_udquot locked at this point,
+		 * but this check is OK since we don't depend on the i_gdquot to
+		 * be accurate 100% all the time. It is just a hint, and this
+		 * will succeed in general.
+		 */
+		if (ip->i_udquot->q_gdquot == ip->i_gdquot)
+			goto done;
+		/*
+		 * Attach i_gdquot to the gdquot hint inside the i_udquot.
+		 */
+		xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot,
+					 flags & XFS_QMOPT_DQLOCK);
+	}
+
+      done:
+
+#ifdef QUOTADEBUG
+	if (! error) {
+		if (ip->i_udquot) {
+			if (flags & XFS_QMOPT_DQLOCK)
+				ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
+			else
+				ASSERT(! XFS_DQ_IS_LOCKED(ip->i_udquot));
+		}
+		if (ip->i_gdquot) {
+			if (flags & XFS_QMOPT_DQLOCK)
+				ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
+			else
+				ASSERT(! XFS_DQ_IS_LOCKED(ip->i_gdquot));
+		}
+		if (XFS_IS_UQUOTA_ON(mp))
+			ASSERT(ip->i_udquot);
+		if (XFS_IS_GQUOTA_ON(mp))
+			ASSERT(ip->i_gdquot);
+	}
+#endif
+
+	if (! (flags & XFS_QMOPT_ILOCKED))
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+#ifdef QUOTADEBUG
+	else
+		ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+#endif
+	return (error);
+}
+
+/*
+ * Release dquots (and their references) if any.
+ * The inode should be locked EXCL except when this is called by
+ * xfs_ireclaim.
+ */
+void
+xfs_qm_dqdettach_inode(
+	xfs_inode_t	*ip)
+{
+	ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
+	ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
+	if (ip->i_udquot)
+		xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip);
+	if (ip->i_udquot) {
+		xfs_qm_dqrele(ip->i_udquot);
+		ip->i_udquot = NULL;
+	}
+	if (ip->i_gdquot) {
+		xfs_qm_dqrele(ip->i_gdquot);
+		ip->i_gdquot = NULL;
+	}
+}
+
+int
+xfs_qm_unmount(
+	xfs_mount_t	*mp)
+{
+	vnode_t		*vp;
+
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		vp = XFS_ITOV(XFS_QI_UQIP(mp));
+		VN_RELE(vp);
+		if (vn_count(vp) > 1)
+			cmn_err(CE_WARN, "UQUOTA busy vp=0x%p count=%d\n",
+				vp, vn_count(vp));
+	}
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		vp = XFS_ITOV(XFS_QI_GQIP(mp));
+		VN_RELE(vp);
+		if (vn_count(vp) > 1)
+			cmn_err(CE_WARN, "GQUOTA busy vp=0x%p count=%d\n",
+				vp, vn_count(vp));
+	}
+
+	return (0);
+}
+
+
+/*
+ * This is called by xfs_sync; the flags arg identifies the caller and
+ * its motives.
+ *
+ * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
+ * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
+ * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
+ */
+
+int
+xfs_qm_sync(
+	xfs_mount_t	*mp,
+	short		flags)
+{
+	int		recl, restarts;
+	xfs_dquot_t	*dqp;
+	uint		flush_flags;
+	boolean_t	nowait;
+	int		error;
+
+	restarts = 0;
+	/*
+	 * We won't block unless we are asked to.
+	 */
+	nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0);
+
+  again:
+	xfs_qm_mplist_lock(mp);
+	/*
+	 * dqpurge_all() also takes the mplist lock and iterates through
+	 * all dquots during quotaoff. However, if the QUOTA_ACTIVE bits
+	 * are not cleared when we have the mplist lock, we know that
+	 * dquots will be consistent as long as we have it locked.
+	 */
+	if (! XFS_IS_QUOTA_ON(mp)) {
+		xfs_qm_mplist_unlock(mp);
+		return (0);
+	}
+	FOREACH_DQUOT_IN_MP(dqp, mp) {
+		/*
+		 * If this is vfs_sync calling, then skip the dquots that
+		 * don't 'seem' to be dirty, i.e. without acquiring the dqlock.
+		 * This is very similar to what xfs_sync does with inodes.
+		 */
+		if (flags & SYNC_BDFLUSH) {
+			if (! XFS_DQ_IS_DIRTY(dqp))
+				continue;
+		}
+
+		if (nowait) {
+			/*
+			 * Try to acquire the dquot lock. We are NOT out of
+			 * lock order, but we just don't want to wait for this
+			 * lock, unless somebody wanted us to.
+			 */
+			if (! xfs_qm_dqlock_nowait(dqp))
+				continue;
+		} else {
+			xfs_dqlock(dqp);
+		}
+
+		/*
+		 * Now, find out for sure if this dquot is dirty or not.
+		 */
+		if (! XFS_DQ_IS_DIRTY(dqp)) {
+			xfs_dqunlock(dqp);
+			continue;
+		}
+
+		/* XXX a sentinel would be better */
+		recl = XFS_QI_MPLRECLAIMS(mp);
+		if (! xfs_qm_dqflock_nowait(dqp)) {
+			if (nowait) {
+				xfs_dqunlock(dqp);
+				continue;
+			}
+			/*
+			 * If we can't grab the flush lock, and the caller
+			 * really wanted us to give this our best shot, see
+			 * if we can give a push to the buffer before we wait
+			 * on the flush lock. At this point, we know that
+			 * even though the dquot is being flushed,
+			 * it has (new) dirty data.
+			 */
+			xfs_qm_dqflock_pushbuf_wait(dqp);
+		}
+		/*
+		 * Let go of the mplist lock. We don't want to hold it
+		 * across a disk write.
+		 */
+		flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC;
+		xfs_qm_mplist_unlock(mp);
+		xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH");
+		error = xfs_qm_dqflush(dqp, flush_flags);
+		xfs_dqunlock(dqp);
+		if (error && XFS_FORCED_SHUTDOWN(mp))
+			return(0);	/* Need to prevent umount failure */
+		else if (error)
+			return (error);
+
+		xfs_qm_mplist_lock(mp);
+		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
+			if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
+				break;
+
+			xfs_qm_mplist_unlock(mp);
+			goto again;
+		}
+	}
+
+	xfs_qm_mplist_unlock(mp);
+	return (0);
+}
+
+
+/*
+ * This initializes all the quota information that's kept in the
+ * mount structure.
+ */
+int
+xfs_qm_init_quotainfo(
+	xfs_mount_t	*mp)
+{
+	xfs_quotainfo_t *qinf;
+	int		error;
+	xfs_dquot_t	*dqp;
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	/*
+	 * Tell XQM that we exist as soon as possible.
+	 */
+	if ((error = xfs_qm_hold_quotafs_ref(mp))) {
+		return (error);
+	}
+
+	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+
+	/*
+	 * See if quotainodes are setup, and if not, allocate them,
+	 * and change the superblock accordingly.
+	 */
+	if ((error = xfs_qm_init_quotainos(mp))) {
+		kmem_free(qinf, sizeof(xfs_quotainfo_t));
+		mp->m_quotainfo = NULL;
+		return (error);
+	}
+
+	spinlock_init(&qinf->qi_pinlock, "xfs_qinf_pin");
+	xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
+	qinf->qi_dqreclaims = 0;
+
+	/* mutex used to serialize quotaoffs */
+	mutex_init(&qinf->qi_quotaofflock, MUTEX_DEFAULT, "qoff");
+
+	/* Precalc some constants */
+	qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
+	ASSERT(qinf->qi_dqchunklen);
+	qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
+	do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
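+	/*
+	 * For example, with a dquot cluster of one 4 KB filesystem
+	 * block, qi_dqchunklen is 8 basic (512-byte) blocks and
+	 * qi_dqperchunk is 4096 / sizeof(xfs_dqblk_t) dquots; the
+	 * exact figures depend on the filesystem's block size.
+	 */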
+
+	mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
+
+	/*
+	 * We try to get the limits from the superuser's limits fields.
+	 * This is quite hacky, but it is standard quota practice.
+	 * We look at the USR dquot with id == 0 first, but if user quotas
+	 * are not enabled we go to the GRP dquot with id == 0.
+	 * We don't really care to keep separate default limits for user
+	 * and group quotas, at least not at this point.
+	 */
+	error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0,
+			     (XFS_IS_UQUOTA_RUNNING(mp)) ?
+			     XFS_DQ_USER : XFS_DQ_GROUP,
+			     XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN,
+			     &dqp);
+	if (! error) {
+		/*
+		 * The warnings and timers set the grace period given to
+		 * a user or group before they can no longer perform
+		 * any writes. If it is zero, a default is used.
+		 */
+		qinf->qi_btimelimit = INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT) ?
+			INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT) : XFS_QM_BTIMELIMIT;
+		qinf->qi_itimelimit = INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT) ?
+			INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT) : XFS_QM_ITIMELIMIT;
+		qinf->qi_rtbtimelimit = INT_GET(dqp->q_core.d_rtbtimer, ARCH_CONVERT) ?
+			INT_GET(dqp->q_core.d_rtbtimer, ARCH_CONVERT) : XFS_QM_RTBTIMELIMIT;
+		qinf->qi_bwarnlimit = INT_GET(dqp->q_core.d_bwarns, ARCH_CONVERT) ?
+			INT_GET(dqp->q_core.d_bwarns, ARCH_CONVERT) : XFS_QM_BWARNLIMIT;
+		qinf->qi_iwarnlimit = INT_GET(dqp->q_core.d_iwarns, ARCH_CONVERT) ?
+			INT_GET(dqp->q_core.d_iwarns, ARCH_CONVERT) : XFS_QM_IWARNLIMIT;
+
+		/*
+		 * We sent the XFS_QMOPT_DQSUSER flag to dqget because
+		 * we don't want this dquot cached. We haven't done a
+		 * quotacheck yet, and quotacheck doesn't like incore dquots.
+		 */
+		xfs_qm_dqdestroy(dqp);
+	} else {
+		qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
+		qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
+		qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
+		qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
+		qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
+	}
+
+	return (0);
+}
+
+
+/*
+ * Gets called when unmounting a filesystem or when all quotas get
+ * turned off.
+ * This purges the quota inodes, destroys locks and frees itself.
+ */
+void
+xfs_qm_destroy_quotainfo(
+	xfs_mount_t	*mp)
+{
+	xfs_quotainfo_t *qi;
+
+	qi = mp->m_quotainfo;
+	ASSERT(qi != NULL);
+	ASSERT(xfs_Gqm != NULL);
+
+	/*
+	 * Release the reference that XQM kept, so that we know
+	 * when the XQM structure should be freed. We cannot assume
+	 * that xfs_Gqm is non-null after this point.
+	 */
+	xfs_qm_rele_quotafs_ref(mp);
+
+	spinlock_destroy(&qi->qi_pinlock);
+	xfs_qm_list_destroy(&qi->qi_dqlist);
+
+	if (qi->qi_uquotaip) {
+		XFS_PURGE_INODE(qi->qi_uquotaip);
+		qi->qi_uquotaip = NULL; /* paranoia */
+	}
+	if (qi->qi_gquotaip) {
+		XFS_PURGE_INODE(qi->qi_gquotaip);
+		qi->qi_gquotaip = NULL;
+	}
+	mutex_destroy(&qi->qi_quotaofflock);
+	kmem_free(qi, sizeof(xfs_quotainfo_t));
+	mp->m_quotainfo = NULL;
+}
+
+
+
+/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
+
+/* ARGSUSED */
+STATIC void
+xfs_qm_list_init(
+	xfs_dqlist_t	*list,
+	char		*str,
+	int		n)
+{
+	mutex_init(&list->qh_lock, MUTEX_DEFAULT, str);
+	list->qh_next = NULL;
+	list->qh_version = 0;
+	list->qh_nelems = 0;
+}
+
+STATIC void
+xfs_qm_list_destroy(
+	xfs_dqlist_t	*list)
+{
+	mutex_destroy(&(list->qh_lock));
+}
+
+
+/*
+ * Stripped down version of dqattach. This doesn't attach, or even look at the
+ * dquots attached to the inode. The rationale is that there won't be any
+ * attached at the time this is called from quotacheck.
+ */
+STATIC int
+xfs_qm_dqget_noattach(
+	xfs_inode_t	*ip,
+	xfs_dquot_t	**O_udqpp,
+	xfs_dquot_t	**O_gdqpp)
+{
+	int		error;
+	xfs_mount_t	*mp;
+	xfs_dquot_t	*udqp, *gdqp;
+
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	mp = ip->i_mount;
+	udqp = NULL;
+	gdqp = NULL;
+
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		ASSERT(ip->i_udquot == NULL);
+		/*
+		 * We want the dquot allocated if it doesn't exist.
+		 */
+		if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
+					 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
+					 &udqp))) {
+			/*
+			 * Shouldn't be able to turn off quotas here.
+			 */
+			ASSERT(error != ESRCH);
+			ASSERT(error != ENOENT);
+			return (error);
+		}
+		ASSERT(udqp);
+	}
+
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		ASSERT(ip->i_gdquot == NULL);
+		if (udqp)
+			xfs_dqunlock(udqp);
+		if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_gid, XFS_DQ_GROUP,
+					 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
+					 &gdqp))) {
+			if (udqp)
+				xfs_qm_dqrele(udqp);
+			ASSERT(error != ESRCH);
+			ASSERT(error != ENOENT);
+			return (error);
+		}
+		ASSERT(gdqp);
+
+		/* Reacquire the locks in the right order */
+		if (udqp) {
+			if (! xfs_qm_dqlock_nowait(udqp)) {
+				xfs_dqunlock(gdqp);
+				xfs_dqlock(udqp);
+				xfs_dqlock(gdqp);
+			}
+		}
+	}
+
+	*O_udqpp = udqp;
+	*O_gdqpp = gdqp;
+
+#ifdef QUOTADEBUG
+	if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
+	if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
+#endif
+	return (0);
+}
+
+/*
+ * Create an inode and return with a reference already taken, but unlocked.
+ * This is how we create quota inodes.
+ */
+STATIC int
+xfs_qm_qino_alloc(
+	xfs_mount_t	*mp,
+	xfs_inode_t	**ip,
+	__int64_t	sbfields,
+	uint		flags)
+{
+	xfs_trans_t	*tp;
+	int		error;
+	unsigned long s;
+	cred_t		zerocr;
+	int		committed;
+
+	tp = xfs_trans_alloc(mp,XFS_TRANS_QM_QINOCREATE);
+	if ((error = xfs_trans_reserve(tp,
+				      XFS_QM_QINOCREATE_SPACE_RES(mp),
+				      XFS_CREATE_LOG_RES(mp), 0,
+				      XFS_TRANS_PERM_LOG_RES,
+				      XFS_CREATE_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		return (error);
+	}
+	bzero(&zerocr, sizeof(zerocr));
+
+	if ((error = xfs_dir_ialloc(&tp, mp->m_rootip, IFREG, 1, mp->m_dev,
+				   &zerocr, 0, 1, ip, &committed))) {
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+				 XFS_TRANS_ABORT);
+		return (error);
+	}
+
+	/*
+	 * Keep an extra reference to this quota inode. This inode is
+	 * locked exclusively and joined to the transaction already.
+	 */
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(*ip));
+	VN_HOLD(XFS_ITOV((*ip)));
+
+	/*
+	 * Make the changes in the superblock, and log those too.
+	 * sbfields arg may contain fields other than *QUOTINO;
+	 * VERSIONNUM for example.
+	 */
+	s = XFS_SB_LOCK(mp);
+	if (flags & XFS_QMOPT_SBVERSION) {
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+		unsigned oldv = mp->m_sb.sb_versionnum;
+#endif
+		ASSERT(!XFS_SB_VERSION_HASQUOTA(&mp->m_sb));
+		ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+				   XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
+		       (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+			XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
+
+		XFS_SB_VERSION_ADDQUOTA(&mp->m_sb);
+		mp->m_sb.sb_uquotino = NULLFSINO;
+		mp->m_sb.sb_gquotino = NULLFSINO;
+
+		/* qflags will get updated _after_ quotacheck */
+		mp->m_sb.sb_qflags = 0;
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+		cmn_err(CE_NOTE,
+			"Old superblock version %x, converting to %x.",
+			oldv, mp->m_sb.sb_versionnum);
+#endif
+	}
+	if (flags & XFS_QMOPT_UQUOTA)
+		mp->m_sb.sb_uquotino = (*ip)->i_ino;
+	else
+		mp->m_sb.sb_gquotino = (*ip)->i_ino;
+	XFS_SB_UNLOCK(mp, s);
+	xfs_mod_sb(tp, sbfields);
+
+	if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
+				     NULL))) {
+		xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!");
+		return (error);
+	}
+	return (0);
+}
+
+
+STATIC int
+xfs_qm_reset_dqcounts(
+	xfs_mount_t	*mp,
+	xfs_buf_t	*bp,
+	xfs_dqid_t	id,
+	uint		type)
+{
+	xfs_disk_dquot_t	*ddq;
+	int			j;
+
+	xfs_buftrace("RESET DQUOTS", bp);
+	/*
+	 * Reset all counters and timers. They'll be
+	 * started afresh by xfs_qm_quotacheck.
+	 */
+#ifdef DEBUG
+	j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
+	do_div(j, sizeof(xfs_dqblk_t));
+	ASSERT(XFS_QM_DQPERBLK(mp) == j);
+#endif
+	ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
+	for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) {
+		/*
+		 * Do a sanity check, and if needed, repair the dqblk. Don't
+		 * output any warnings because it's perfectly possible to
+		 * find uninitialized dquot blks. See comment in xfs_qm_dqcheck.
+		 */
+		(void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR,
+				      "xfs_quotacheck");
+		INT_SET(ddq->d_bcount, ARCH_CONVERT, 0ULL);
+		INT_SET(ddq->d_icount, ARCH_CONVERT, 0ULL);
+		INT_SET(ddq->d_rtbcount, ARCH_CONVERT, 0ULL);
+		INT_SET(ddq->d_btimer, ARCH_CONVERT, (time_t)0);
+		INT_SET(ddq->d_itimer, ARCH_CONVERT, (time_t)0);
+		INT_SET(ddq->d_bwarns, ARCH_CONVERT, 0UL);
+		INT_SET(ddq->d_iwarns, ARCH_CONVERT, 0UL);
+		ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
+	}
+
+	return (0);
+}
+
+STATIC int
+xfs_qm_dqiter_bufs(
+	xfs_mount_t	*mp,
+	xfs_dqid_t	firstid,
+	xfs_fsblock_t	bno,
+	xfs_filblks_t	blkcnt,
+	uint		flags)
+{
+	xfs_buf_t	*bp;
+	int		error;
+	int		notcommitted;
+	int		incr;
+
+	ASSERT(blkcnt > 0);
+	notcommitted = 0;
+	incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
+		XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
+	error = 0;
+
+	/*
+	 * Blkcnt arg can be a very big number, and might even be
+	 * larger than the log itself. So, we have to break it up into
+	 * manageable-sized transactions.
+	 * Note that we don't start a permanent transaction here; we might
+	 * not be able to get a log reservation for the whole thing up front,
+	 * and we don't really care to either, because we just discard
+	 * everything if we were to crash in the middle of this loop.
+	 */
+	while (blkcnt--) {
+		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+			      XFS_FSB_TO_DADDR(mp, bno),
+			      (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp);
+		if (error)
+			break;
+
+		(void) xfs_qm_reset_dqcounts(mp, bp, firstid,
+					     flags & XFS_QMOPT_UQUOTA ?
+					     XFS_DQ_USER : XFS_DQ_GROUP);
+		xfs_bdwrite(mp, bp);
+		/*
+		 * When quotachecking the root filesystem,
+		 * we may not have bdflush, and we may fill
+		 * up all available freebufs.
+		 * The workaround here is to push on the
+		 * log and do a bflush on the rootdev
+		 * periodically.
+		 */
+		if (mp->m_flags & XFS_MOUNT_ROOTQCHECK) {
+			if (++notcommitted == incr) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE | XFS_LOG_SYNC);
+				XFS_bflush(mp->m_ddev_targp);
+				notcommitted = 0;
+			}
+		}
+		/*
+		 * Advance to the next block.
+		 */
+		bno++;
+		firstid += XFS_QM_DQPERBLK(mp);
+	}
+	return (error);
+}
+
+/*
+ * Iterate over all allocated USR/GRP dquots in the system, calling a
+ * caller supplied function for every chunk of dquots that we find.
+ */
+STATIC int
+xfs_qm_dqiterate(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*qip,
+	uint		flags)
+{
+	xfs_bmbt_irec_t		*map;
+	int			i, nmaps;	/* number of map entries */
+	int			error;		/* return value */
+	xfs_fileoff_t		lblkno;
+	xfs_filblks_t		maxlblkcnt;
+	xfs_dqid_t		firstid;
+	xfs_fsblock_t		rablkno;
+	xfs_filblks_t		rablkcnt;
+
+	error = 0;
+	/*
+	 * This looks racy, but we can't keep an inode lock across a
+	 * trans_reserve. However, this gets called during quotacheck, and
+	 * quotacheck happens only at mount time, which is single-threaded.
+	 */
+	if (qip->i_d.di_nblocks == 0)
+		return (0);
+
+	map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
+
+	lblkno = 0;
+	maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAX_FILE_OFFSET);
+	do {
+		nmaps = XFS_DQITER_MAP_SIZE;
+		/*
+		 * We aren't changing the inode itself. Just changing
+		 * some of its data. No new blocks are added here, and
+		 * the inode is never added to the transaction.
+		 */
+		xfs_ilock(qip, XFS_ILOCK_SHARED);
+		error = xfs_bmapi(NULL, qip, lblkno,
+				  maxlblkcnt - lblkno,
+				  XFS_BMAPI_METADATA,
+				  NULL,
+				  0, map, &nmaps, NULL);
+		xfs_iunlock(qip, XFS_ILOCK_SHARED);
+		if (error)
+			break;
+
+		ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
+		for (i = 0; i < nmaps; i++) {
+			ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
+			ASSERT(map[i].br_blockcount);
+
+			lblkno += map[i].br_blockcount;
+
+			if (map[i].br_startblock == HOLESTARTBLOCK)
+				continue;
+
+			firstid = (xfs_dqid_t) map[i].br_startoff *
+				XFS_QM_DQPERBLK(mp);
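+			/*
+			 * A worked example of this mapping (hypothetical
+			 * numbers): with XFS_QM_DQPERBLK(mp) == 30, an
+			 * extent whose br_startoff is file block 2 covers
+			 * dquot IDs 60 .. (60 + 30 * br_blockcount - 1),
+			 * since each block of the quota inode holds
+			 * DQPERBLK consecutive dquots.
+			 */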
+			/*
+			 * Do a read-ahead on the next extent.
+			 */
+			if ((i+1 < nmaps) &&
+			    (map[i+1].br_startblock != HOLESTARTBLOCK)) {
+				rablkcnt =  map[i+1].br_blockcount;
+				rablkno = map[i+1].br_startblock;
+				while (rablkcnt--) {
+					xfs_baread(mp->m_ddev_targp,
+					       XFS_FSB_TO_DADDR(mp, rablkno),
+					       (int)XFS_QI_DQCHUNKLEN(mp));
+					rablkno++;
+				}
+			}
+			/*
+			 * Iterate through all the blocks in the extent and
+			 * reset the counters of all the dquots inside them.
+			 */
+			if ((error = xfs_qm_dqiter_bufs(mp,
+						       firstid,
+						       map[i].br_startblock,
+						       map[i].br_blockcount,
+						       flags))) {
+				break;
+			}
+		}
+
+		if (error)
+			break;
+	} while (nmaps > 0);
+
+	kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map));
+
+	return (error);
+}
+
+/*
+ * Called by dqusage_adjust when doing a quotacheck.
+ * Given the inode and a dquot (either USR or GRP; it doesn't matter),
+ * this updates its incore copy as well as the buffer copy. This is
+ * so that once the quotacheck is done, we can just log all the buffers,
+ * as opposed to logging numerous updates to individual dquots.
+ */
+STATIC void
+xfs_qm_quotacheck_dqadjust(
+	xfs_dquot_t		*dqp,
+	xfs_qcnt_t		nblks,
+	xfs_qcnt_t		rtblks)
+{
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	xfs_dqtrace_entry(dqp, "QCHECK DQADJUST");
+	/*
+	 * Adjust the inode count and the block count to reflect this inode's
+	 * resource usage.
+	 */
+	INT_MOD(dqp->q_core.d_icount, ARCH_CONVERT, +1);
+	dqp->q_res_icount++;
+	if (nblks) {
+		INT_MOD(dqp->q_core.d_bcount, ARCH_CONVERT, nblks);
+		dqp->q_res_bcount += nblks;
+	}
+	if (rtblks) {
+		INT_MOD(dqp->q_core.d_rtbcount, ARCH_CONVERT, rtblks);
+		dqp->q_res_rtbcount += rtblks;
+	}
+
+	/*
+	 * Adjust the timers since we just changed usages
+	 */
+	if (! XFS_IS_SUSER_DQUOT(dqp))
+		xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
+
+	dqp->dq_flags |= XFS_DQ_DIRTY;
+}
+
+STATIC int
+xfs_qm_get_rtblks(
+	xfs_inode_t	*ip,
+	xfs_qcnt_t	*O_rtblks)
+{
+	xfs_filblks_t	rtblks;			/* total rt blks */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+	xfs_bmbt_rec_t	*base;			/* base of extent array */
+	xfs_bmbt_rec_t	*ep;			/* pointer to an extent entry */
+	int		error;
+
+	ASSERT(XFS_IS_REALTIME_INODE(ip));
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
+			return (error);
+	}
+	rtblks = 0;
+	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	base = &ifp->if_u1.if_extents[0];
+	for (ep = base; ep < &base[nextents]; ep++)
+		rtblks += xfs_bmbt_get_blockcount(ep);
+	*O_rtblks = (xfs_qcnt_t)rtblks;
+	return (0);
+}
+
+/*
+ * Callback routine supplied to bulkstat(). Given an inumber, find its
+ * dquots and update them to account for resources taken by that inode.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_dqusage_adjust(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_trans_t	*tp,		/* transaction pointer - NULL */
+	xfs_ino_t	ino,		/* inode number to get data for */
+	void		*buffer,	/* not used */
+	xfs_daddr_t	bno,		/* starting block of inode cluster */
+	void		*dip,		/* on-disk inode pointer (not used) */
+	int		*res)		/* result code value */
+{
+	xfs_inode_t	*ip;
+	xfs_dquot_t	*udqp, *gdqp;
+	xfs_qcnt_t	nblks, rtblks;
+	int		error;
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	/*
+	 * The root inode must have its resources accounted for; not so the
+	 * quota inodes, which we skip here.
+	 */
+	if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+		*res = BULKSTAT_RV_NOTHING;
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
+	 * interface expects the inode to be exclusively locked because that's
+	 * the case in all other instances. It's OK that we do this because
+	 * quotacheck is done only at mount time.
+	 */
+	if ((error = xfs_iget(mp, tp, ino, XFS_ILOCK_EXCL, &ip, bno))) {
+		*res = BULKSTAT_RV_NOTHING;
+		return (error);
+	}
+
+	if (ip->i_d.di_mode == 0) {
+		xfs_iput_new(ip, XFS_ILOCK_EXCL);
+		*res = BULKSTAT_RV_NOTHING;
+		return XFS_ERROR(ENOENT);
+	}
+
+	/*
+	 * Obtain the locked dquots. In case of an error (e.g. the
+	 * allocation fails due to ENOSPC), we return the error to
+	 * bulkstat, so that it can be propagated to quotacheck(), causing
+	 * us to disable quotas for the file system.
+	 */
+	if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
+		xfs_iput(ip, XFS_ILOCK_EXCL);
+		*res = BULKSTAT_RV_GIVEUP;
+		return (error);
+	}
+
+	rtblks = 0;
+	if (! XFS_IS_REALTIME_INODE(ip)) {
+		nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
+	} else {
+		/*
+		 * Walk through the extent list and count the realtime blocks.
+		 */
+		if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
+			xfs_iput(ip, XFS_ILOCK_EXCL);
+			if (udqp)
+				xfs_qm_dqput(udqp);
+			if (gdqp)
+				xfs_qm_dqput(gdqp);
+			*res = BULKSTAT_RV_GIVEUP;
+			return (error);
+		}
+		nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
+	}
+	ASSERT(ip->i_delayed_blks == 0);
+
+	/*
+	 * We can't release the inode while holding its dquot locks.
+	 * The inode can go inactive and might try to acquire the dquot locks.
+	 * So, just unlock here and do a vn_rele at the end.
+	 */
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Add the (disk blocks and inode) resources occupied by this
+	 * inode to its dquots. We do this adjustment in the incore dquot,
+	 * and also copy the changes to its buffer.
+	 * We don't care about putting these changes in a transaction
+	 * envelope because if we crash in the middle of a 'quotacheck'
+	 * we have to start from the beginning anyway.
+	 * Once we're done, we'll log all the dquot bufs.
+	 *
+	 * The *QUOTA_ON checks below may look pretty racy, but quotachecks
+	 * and quotaoffs don't race. (Quotachecks happen at mount time only).
+	 */
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		ASSERT(udqp);
+		xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks);
+		xfs_qm_dqput(udqp);
+	}
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		ASSERT(gdqp);
+		xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks);
+		xfs_qm_dqput(gdqp);
+	}
+	/*
+	 * Now release the inode. This will send it to 'inactive', and
+	 * possibly even free blocks.
+	 */
+	VN_RELE(XFS_ITOV(ip));
+
+	/*
+	 * Go to the next inode.
+	 */
+	*res = BULKSTAT_RV_DIDONE;
+	return (0);
+}
+
+/*
+ * Walk through all the filesystem inodes and construct a consistent view
+ * of the disk quota world.
+ */
+STATIC int
+xfs_qm_quotacheck(
+	xfs_mount_t	*mp)
+{
+	int		done, count, error;
+	xfs_ino_t	lastino;
+	size_t		structsz;
+	xfs_inode_t	*uip, *gip;
+	uint		flags;
+
+	count = INT_MAX;
+	structsz = 1;
+	lastino = 0;
+	flags = 0;
+
+	ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp));
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	/*
+	 * There should be no cached dquots. The (simplistic) quotacheck
+	 * algorithm doesn't like that.
+	 */
+	ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0);
+
+	cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
+
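+	/*
+	 * In outline, the steps below are: (1) reset every on-disk dquot
+	 * counter to zero, (2) bulkstat all inodes and re-count their usage
+	 * into the incore dquots, (3) flush the dquots out to their buffers
+	 * delayed-write, (4) flush those buffers synchronously, and only
+	 * then (5) stamp the *_CHKD flags in m_qflags.
+	 */
+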
+	/*
+	 * First we go through all the dquots on disk, USR and GRP, and reset
+	 * their counters to zero. We need a clean slate.
+	 * We don't log our changes till later.
+	 */
+	if ((uip = XFS_QI_UQIP(mp))) {
+		if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA)))
+			goto error_return;
+		flags |= XFS_UQUOTA_CHKD;
+	}
+
+	if ((gip = XFS_QI_GQIP(mp))) {
+		if ((error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA)))
+			goto error_return;
+		flags |= XFS_GQUOTA_CHKD;
+	}
+
+	do {
+		/*
+		 * Iterate through all the inodes in the file system,
+		 * adjusting the corresponding dquot counters in core.
+		 */
+		if ((error = xfs_bulkstat(mp, NULL, &lastino, &count,
+				     xfs_qm_dqusage_adjust,
+				     structsz, NULL,
+				     BULKSTAT_FG_IGET|BULKSTAT_FG_VFSLOCKED,
+				     &done)))
+			break;
+
+	} while (! done);
+
+	/*
+	 * We can get this error if we couldn't do a dquot allocation inside
+	 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
+	 * dirty dquots that might be cached; we just want to get rid of them
+	 * and turn quota off. The dquots won't be attached to any of the inodes
+	 * at this point (because we intentionally didn't in dqget_noattach).
+	 */
+	if (error) {
+		xfs_qm_dqpurge_all(mp,
+				   XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA|
+				   XFS_QMOPT_QUOTAOFF);
+		goto error_return;
+	}
+	/*
+	 * We've made all the changes that we need to make incore.
+	 * Now flush them down to the disk buffers.
+	 */
+	xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
+
+	/*
+	 * We didn't log anything, because if we crashed, we'll have to
+	 * start the quotacheck from scratch anyway. However, we must make
+	 * sure that our dquot changes are secure before we put the
+	 * quotacheck'd stamp on the superblock. So, here we do a synchronous
+	 * flush.
+	 */
+	XFS_bflush(mp->m_ddev_targp);
+
+	/*
+	 * If one type of quota is off, then it will lose its
+	 * quotachecked status, since we won't be doing accounting for
+	 * that type anymore.
+	 */
+	mp->m_qflags &= ~(XFS_GQUOTA_CHKD | XFS_UQUOTA_CHKD);
+	mp->m_qflags |= flags;
+
+#ifdef QUOTADEBUG
+	XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++");
+#endif
+
+ error_return:
+	cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
+	mp->m_flags &= ~(XFS_MOUNT_ROOTQCHECK);
+	return (error);
+}
+
+/*
+ * This is called after the superblock has been read in and we're ready to
+ * iget the quota inodes.
+ */
+STATIC int
+xfs_qm_init_quotainos(
+	xfs_mount_t	*mp)
+{
+	xfs_inode_t	*uip, *gip;
+	int		error;
+	__int64_t	sbflags;
+	uint		flags;
+
+	ASSERT(mp->m_quotainfo);
+	uip = gip = NULL;
+	sbflags = 0;
+	flags = 0;
+
+	/*
+	 * Get the uquota and gquota inodes
+	 */
+	if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
+		if (XFS_IS_UQUOTA_ON(mp) &&
+		    mp->m_sb.sb_uquotino != NULLFSINO) {
+			ASSERT(mp->m_sb.sb_uquotino > 0);
+			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+					     0, &uip, 0)))
+				return XFS_ERROR(error);
+		}
+		if (XFS_IS_GQUOTA_ON(mp) &&
+		    mp->m_sb.sb_gquotino != NULLFSINO) {
+			ASSERT(mp->m_sb.sb_gquotino > 0);
+			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+					     0, &gip, 0))) {
+				if (uip)
+					VN_RELE(XFS_ITOV(uip));
+				return XFS_ERROR(error);
+			}
+		}
+	} else {
+		flags |= XFS_QMOPT_SBVERSION;
+		sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+			    XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
+	}
+
+	/*
+	 * Create the two inodes, if they don't exist already. The changes
+	 * made above will get added to a transaction and logged in one of
+	 * the qino_alloc calls below.
+	 */
+	if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
+		if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
+			return XFS_ERROR(EROFS);
+		if ((error = xfs_qm_qino_alloc(mp, &uip,
+					      sbflags | XFS_SB_UQUOTINO,
+					      flags | XFS_QMOPT_UQUOTA)))
+			return XFS_ERROR(error);
+
+		flags &= ~XFS_QMOPT_SBVERSION;
+	}
+	if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
+		if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) {
+			if (uip)
+				VN_RELE(XFS_ITOV(uip));
+			return XFS_ERROR(EROFS);
+		}
+		if ((error = xfs_qm_qino_alloc(mp, &gip,
+					      sbflags | XFS_SB_GQUOTINO,
+					      flags | XFS_QMOPT_GQUOTA))) {
+			if (uip)
+				VN_RELE(XFS_ITOV(uip));
+
+			return XFS_ERROR(error);
+		}
+	}
+
+	XFS_QI_UQIP(mp) = uip;
+	XFS_QI_GQIP(mp) = gip;
+
+	return (0);
+}
+
+
+/*
+ * Traverse the freelist of dquots and attempt to reclaim a maximum of
+ * 'howmany' dquots. This operation races with dqlookup(), and attempts to
+ * favor the lookup function ...
+ * XXXsup merge this with qm_reclaim_one().
+ */
+STATIC int
+xfs_qm_shake_freelist(
+	int howmany)
+{
+	int		nreclaimed;
+	xfs_dqhash_t	*hash;
+	xfs_dquot_t	*dqp, *nextdqp;
+	int		restarts;
+	int		nflushes;
+
+	if (howmany <= 0)
+		return (0);
+
+	nreclaimed = 0;
+	restarts = 0;
+	nflushes = 0;
+
+#ifdef QUOTADEBUG
+	printk("Shake free 0x%x\n", howmany);
+#endif
+	/* lock order is : hashchainlock, freelistlock, mplistlock */
+ tryagain:
+	xfs_qm_freelist_lock(xfs_Gqm);
+
+	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
+	     ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) &&
+	      nreclaimed < howmany); ) {
+		xfs_dqlock(dqp);
+
+		/*
+		 * We are racing with dqlookup here. Naturally we don't
+		 * want to reclaim a dquot that lookup wants.
+		 */
+		if (dqp->dq_flags & XFS_DQ_WANT) {
+			xfs_dqunlock(dqp);
+			xfs_qm_freelist_unlock(xfs_Gqm);
+			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+				return (nreclaimed != howmany);
+			XFS_STATS_INC(xfsstats.xs_qm_dqwants);
+			goto tryagain;
+		}
+
+		/*
+		 * If the dquot is inactive, we are assured that it is
+		 * not on the mplist or the hashlist, and that makes our
+		 * life easier.
+		 */
+		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
+			ASSERT(dqp->q_mount == NULL);
+			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
+			ASSERT(dqp->HL_PREVP == NULL);
+			ASSERT(dqp->MPL_PREVP == NULL);
+			XFS_STATS_INC(xfsstats.xs_qm_dqinact_reclaims);
+			nextdqp = dqp->dq_flnext;
+			goto off_freelist;
+		}
+
+		ASSERT(dqp->MPL_PREVP);
+		/*
+		 * Try to grab the flush lock. If this dquot is in the process of
+		 * getting flushed to disk, we don't want to reclaim it.
+		 */
+		if (! xfs_qm_dqflock_nowait(dqp)) {
+			xfs_dqunlock(dqp);
+			dqp = dqp->dq_flnext;
+			continue;
+		}
+
+		/*
+		 * We have the flush lock so we know that this is not in the
+		 * process of being flushed. So, if this is dirty, flush it
+		 * DELWRI so that we don't get a freelist infested with
+		 * dirty dquots.
+		 */
+		if (XFS_DQ_IS_DIRTY(dqp)) {
+			xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
+			/*
+			 * We'll be doing a dqflush, and it is
+			 * possible to fill up the entire buffer cache
+			 * with dirty delayed write buffers when doing
+			 * this on a root filesystem, if bdflush isn't
+			 * running. So, do a flush periodically.
+			 */
+			if (dqp->q_mount->m_flags & XFS_MOUNT_ROOTQCHECK) {
+				if (!(++nflushes % XFS_QM_MAX_DQCLUSTER_LOGSZ)){
+					xfs_log_force(dqp->q_mount, (xfs_lsn_t)0,
+						 XFS_LOG_FORCE | XFS_LOG_SYNC);
+					XFS_bflush(dqp->q_mount->m_ddev_targp);
+				}
+			}
+			/*
+			 * We flush it delayed write, so don't bother
+			 * releasing the mplock.
+			 */
+			(void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
+			dqp = dqp->dq_flnext;
+			continue;
+		}
+		/*
+		 * We're trying to get the hashlock out of order. This races
+		 * with dqlookup; so we give up and go to the next dquot if
+		 * we can't get the hashlock. This way, we won't starve
+		 * a dqlookup process that holds the hashlock while it is
+		 * waiting for the freelist lock.
+		 */
+		if (! xfs_qm_dqhashlock_nowait(dqp)) {
+			xfs_dqfunlock(dqp);
+			xfs_dqunlock(dqp);
+			dqp = dqp->dq_flnext;
+			continue;
+		}
+		/*
+		 * This races with dquot allocation code as well as dqflush_all
+		 * and reclaim code. So, if we failed to grab the mplist lock,
+		 * give up everything and start over.
+		 */
+		hash = dqp->q_hash;
+		ASSERT(hash);
+		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
+			/* XXX put a sentinel so that we can come back here */
+			xfs_dqfunlock(dqp);
+			xfs_dqunlock(dqp);
+			XFS_DQ_HASH_UNLOCK(hash);
+			xfs_qm_freelist_unlock(xfs_Gqm);
+			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+				return (nreclaimed != howmany);
+			goto tryagain;
+		}
+		xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING");
+#ifdef QUOTADEBUG
+		printk("Shake 0x%p, ID 0x%x\n", dqp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT));
+#endif
+		ASSERT(dqp->q_nrefs == 0);
+		nextdqp = dqp->dq_flnext;
+		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
+		XQM_HASHLIST_REMOVE(hash, dqp);
+		xfs_dqfunlock(dqp);
+		xfs_qm_mplist_unlock(dqp->q_mount);
+		XFS_DQ_HASH_UNLOCK(hash);
+
+ off_freelist:
+		XQM_FREELIST_REMOVE(dqp);
+		xfs_dqunlock(dqp);
+		nreclaimed++;
+		XFS_STATS_INC(xfsstats.xs_qm_dqshake_reclaims);
+		xfs_qm_dqdestroy(dqp);
+		dqp = nextdqp;
+	}
+	xfs_qm_freelist_unlock(xfs_Gqm);
+	return (nreclaimed != howmany);
+}
+
+
+/*
+ * The shake manager routine called by shaked() when memory is
+ * running low.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_qm_shake(void)
+{
+	int	ndqused, nfree, n;
+
+	if (!xfs_Gqm)
+		return;
+
+	nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */
+	/* incore dquots in all f/s's */
+	ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
+
+	ASSERT(ndqused >= 0);
+
+	if (nfree <= ndqused && nfree < ndquot)
+		return;
+
+	ndqused *= xfs_Gqm->qm_dqfree_ratio;	/* target # of free dquots */
+	n = nfree - ndqused - ndquot;		/* # over target */
+
+	(void) xfs_qm_shake_freelist(MAX(nfree, n));
+}
+
+
+/*
+ * Just pop the least recently used dquot off the freelist and
+ * recycle it. The returned dquot is locked.
+ */
+STATIC xfs_dquot_t *
+xfs_qm_dqreclaim_one(void)
+{
+	xfs_dquot_t	*dqpout;
+	xfs_dquot_t	*dqp;
+	int		restarts;
+	int		nflushes;
+
+	restarts = 0;
+	dqpout = NULL;
+	nflushes = 0;
+
+	/* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
+ startagain:
+	xfs_qm_freelist_lock(xfs_Gqm);
+
+	FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
+		xfs_dqlock(dqp);
+
+		/*
+		 * We are racing with dqlookup here. Naturally we don't
+		 * want to reclaim a dquot that lookup wants. We release the
+		 * freelist lock and start over, so that lookup will grab
+		 * both the dquot and the freelistlock.
+		 */
+		if (dqp->dq_flags & XFS_DQ_WANT) {
+			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
+			xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT");
+			xfs_dqunlock(dqp);
+			xfs_qm_freelist_unlock(xfs_Gqm);
+			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+				return (NULL);
+			XFS_STATS_INC(xfsstats.xs_qm_dqwants);
+			goto startagain;
+		}
+
+		/*
+		 * If the dquot is inactive, we are assured that it is
+		 * not on the mplist or the hashlist, and that makes our
+		 * life easier.
+		 */
+		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
+			ASSERT(dqp->q_mount == NULL);
+			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
+			ASSERT(dqp->HL_PREVP == NULL);
+			ASSERT(dqp->MPL_PREVP == NULL);
+			XQM_FREELIST_REMOVE(dqp);
+			xfs_dqunlock(dqp);
+			dqpout = dqp;
+			XFS_STATS_INC(xfsstats.xs_qm_dqinact_reclaims);
+			break;
+		}
+
+		ASSERT(dqp->q_hash);
+		ASSERT(dqp->MPL_PREVP);
+
+		/*
+		 * Try to grab the flush lock. If this dquot is in the process of
+		 * getting flushed to disk, we don't want to reclaim it.
+		 */
+		if (! xfs_qm_dqflock_nowait(dqp)) {
+			xfs_dqunlock(dqp);
+			continue;
+		}
+
+		/*
+		 * We have the flush lock so we know that this is not in the
+		 * process of being flushed. So, if this is dirty, flush it
+		 * DELWRI so that we don't get a freelist infested with
+		 * dirty dquots.
+		 */
+		if (XFS_DQ_IS_DIRTY(dqp)) {
+			xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
+			/*
+			 * We'll be doing a dqflush, and it is
+			 * possible to fill up the entire buffer cache
+			 * with dirty delayed write buffers when doing
+			 * this on a root filesystem, if bdflush isn't
+			 * running. So, do a flush periodically.
+			 */
+			if (dqp->q_mount->m_flags & XFS_MOUNT_ROOTQCHECK) {
+				if (!(++nflushes % XFS_QM_MAX_DQCLUSTER_LOGSZ)) {
+					xfs_log_force(dqp->q_mount, (xfs_lsn_t)0,
+						XFS_LOG_FORCE | XFS_LOG_SYNC);
+					XFS_bflush(dqp->q_mount->m_ddev_targp);
+				}
+			}
+
+			/*
+			 * We flush it delayed write, so don't bother
+			 * releasing the freelist lock.
+			 */
+			(void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
+			continue;
+		}
+
+		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
+			xfs_dqfunlock(dqp);
+			xfs_dqunlock(dqp);
+			continue;
+		}
+
+		if (! xfs_qm_dqhashlock_nowait(dqp))
+			goto mplistunlock;
+
+		ASSERT(dqp->q_nrefs == 0);
+		xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING");
+		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
+		XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
+		XQM_FREELIST_REMOVE(dqp);
+		dqpout = dqp;
+		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+ mplistunlock:
+		xfs_qm_mplist_unlock(dqp->q_mount);
+		xfs_dqfunlock(dqp);
+		xfs_dqunlock(dqp);
+		if (dqpout)
+			break;
+	}
+
+	xfs_qm_freelist_unlock(xfs_Gqm);
+	return (dqpout);
+}
+
+
+/*------------------------------------------------------------------*/
+
+/*
+ * Return a new incore dquot. Depending on the number of
+ * dquots in the system, we either allocate a new one on the kernel heap,
+ * or reclaim a free one.
+ * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
+ * to reclaim an existing one from the freelist.
+ */
+boolean_t
+xfs_qm_dqalloc_incore(
+	xfs_dquot_t **O_dqpp)
+{
+	xfs_dquot_t	*dqp;
+
+	/*
+	 * Check against high water mark to see if we want to pop
+	 * a nincompoop dquot off the freelist.
+	 */
+	if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
+		/*
+		 * Try to recycle a dquot from the freelist.
+		 */
+		if ((dqp = xfs_qm_dqreclaim_one())) {
+			XFS_STATS_INC(xfsstats.xs_qm_dqreclaims);
+			/*
+			 * Just bzero the core here. The rest will get
+			 * reinitialized by caller. XXX we shouldn't even
+			 * do this bzero ...
+			 */
+			bzero(&dqp->q_core, sizeof(dqp->q_core));
+			*O_dqpp = dqp;
+			return (B_FALSE);
+		}
+		XFS_STATS_INC(xfsstats.xs_qm_dqreclaim_misses);
+	}
+
+	/*
+	 * Allocate a brand new dquot on the kernel heap and return it
+	 * to the caller to initialize.
+	 */
+	ASSERT(xfs_Gqm->qm_dqzone != NULL);
+	*O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
+	atomic_inc(&xfs_Gqm->qm_totaldquots);
+
+	return (B_TRUE);
+}
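+
+/*
+ * A hypothetical caller pattern, for illustration only (not code from
+ * this patch):
+ *
+ *	xfs_dquot_t	*dqp;
+ *
+ *	if (xfs_qm_dqalloc_incore(&dqp) == B_TRUE)
+ *		... fully initialize the brand new, zeroed dquot ...
+ *	else
+ *		... dqp was recycled off the freelist; only q_core was
+ *		    zeroed, so the caller reinitializes the rest ...
+ */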
+
+
+/*
+ * Start a transaction and write the incore superblock changes to
+ * disk. The flags parameter indicates which fields have changed.
+ */
+int
+xfs_qm_write_sb_changes(
+	xfs_mount_t	*mp,
+	__int64_t	flags)
+{
+	xfs_trans_t	*tp;
+	int		error;
+
+#ifdef QUOTADEBUG
+	cmn_err(CE_NOTE,
+		"Writing superblock quota changes: %s",
+		mp->m_fsname);
+#endif
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+	if ((error = xfs_trans_reserve(tp, 0,
+				      mp->m_sb.sb_sectsize + 128, 0,
+				      0,
+				      XFS_DEFAULT_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		return (error);
+	}
+
+	xfs_mod_sb(tp, flags);
+	(void) xfs_trans_commit(tp, 0, NULL);
+
+	return (0);
+}
+
+
+/* --------------- utility functions for vnodeops ---------------- */
+
+
+/*
+ * Given an inode and a uid and gid (from cred_t), make sure that we have
+ * allocated relevant dquot(s) on disk, and that we won't exceed inode
+ * quotas by creating this file.
+ * This also attaches dquot(s) to the given inode after locking it,
+ * and returns the dquots corresponding to the uid and/or gid.
+ *
+ * in	: inode (unlocked)
+ * out	: udquot, gdquot with references taken and unlocked
+ */
+int
+xfs_qm_vop_dqalloc(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,
+	uid_t		uid,
+	gid_t		gid,
+	uint		flags,
+	xfs_dquot_t	**O_udqpp,
+	xfs_dquot_t	**O_gdqpp)
+{
+	int		error;
+	xfs_dquot_t	*uq, *gq;
+	uint		lockflags;
+
+	lockflags = XFS_ILOCK_EXCL;
+	xfs_ilock(ip, lockflags);
+
+	if ((flags & XFS_QMOPT_INHERIT) &&
+	    XFS_INHERIT_GID(ip, XFS_MTOVFS(mp)))
+		gid = ip->i_d.di_gid;
+
+	/*
+	 * Attach the dquot(s) to this inode, doing a dquot allocation
+	 * if necessary. The dquot(s) will not be locked.
+	 */
+	if (XFS_NOT_DQATTACHED(mp, ip)) {
+		if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC |
+					    XFS_QMOPT_ILOCKED))) {
+			xfs_iunlock(ip, lockflags);
+			return (error);
+		}
+	}
+
+	uq = gq = NULL;
+	if ((flags & XFS_QMOPT_UQUOTA) &&
+	    XFS_IS_UQUOTA_ON(mp)) {
+		if (ip->i_d.di_uid != uid) {
+			/*
+			 * What we need is the dquot that has this uid, and
+			 * if we send the inode to dqget, the uid of the inode
+			 * takes priority over what's sent in the uid argument.
+			 * We must unlock the inode here before calling dqget if
+			 * we're not sending the inode, because otherwise
+			 * we'll deadlock by doing trans_reserve while
+			 * holding ilock.
+			 */
+			xfs_iunlock(ip, lockflags);
+			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+						 XFS_DQ_USER,
+						 XFS_QMOPT_DQALLOC |
+						 XFS_QMOPT_DOWARN,
+						 &uq))) {
+				ASSERT(error != ENOENT);
+				return (error);
+			}
+			/*
+			 * Get the ilock in the right order.
+			 */
+			xfs_dqunlock(uq);
+			lockflags = XFS_ILOCK_SHARED;
+			xfs_ilock(ip, lockflags);
+		} else {
+			/*
+			 * Take an extra reference, because we'll return
+			 * this to the caller.
+			 */
+			ASSERT(ip->i_udquot);
+			uq = ip->i_udquot;
+			xfs_dqlock(uq);
+			XFS_DQHOLD(uq);
+			xfs_dqunlock(uq);
+		}
+	}
+	if ((flags & XFS_QMOPT_GQUOTA) &&
+	    XFS_IS_GQUOTA_ON(mp)) {
+		if (ip->i_d.di_gid != gid) {
+			xfs_iunlock(ip, lockflags);
+			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+						 XFS_DQ_GROUP,
+						 XFS_QMOPT_DQALLOC |
+						 XFS_QMOPT_DOWARN,
+						 &gq))) {
+				if (uq)
+					xfs_qm_dqrele(uq);
+				ASSERT(error != ENOENT);
+				return (error);
+			}
+			xfs_dqunlock(gq);
+			lockflags = XFS_ILOCK_SHARED;
+			xfs_ilock(ip, lockflags);
+		} else {
+			ASSERT(ip->i_gdquot);
+			gq = ip->i_gdquot;
+			xfs_dqlock(gq);
+			XFS_DQHOLD(gq);
+			xfs_dqunlock(gq);
+		}
+	}
+	if (uq)
+		xfs_dqtrace_entry_ino(uq, "DQALLOC", ip);
+
+	xfs_iunlock(ip, lockflags);
+	if (O_udqpp)
+		*O_udqpp = uq;
+	else if (uq)
+		xfs_qm_dqrele(uq);
+	if (O_gdqpp)
+		*O_gdqpp = gq;
+	else if (gq)
+		xfs_qm_dqrele(gq);
+	return (0);
+}
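+
+/*
+ * A hypothetical usage sketch, for illustration only (the dp, uid and
+ * gid shown are made-up arguments): a caller about to create an inode
+ * might do
+ *
+ *	error = xfs_qm_vop_dqalloc(mp, dp, uid, gid,
+ *				   XFS_QMOPT_UQUOTA | XFS_QMOPT_GQUOTA,
+ *				   &udqp, &gdqp);
+ *
+ * and later drop the returned references with xfs_qm_dqrele(udqp) and
+ * xfs_qm_dqrele(gdqp); per the contract above, the dquots come back
+ * unlocked but with references held.
+ */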
+
+/*
+ * Actually transfer ownership, and do the dquot modifications.
+ * The reservations were already made.
+ */
+xfs_dquot_t *
+xfs_qm_vop_chown(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_dquot_t	**IO_olddq,
+	xfs_dquot_t	*newdq)
+{
+	xfs_dquot_t	*prevdq;
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
+
+	/* old dquot */
+	prevdq = *IO_olddq;
+	ASSERT(prevdq);
+	ASSERT(prevdq != newdq);
+
+	xfs_trans_mod_dquot(tp, prevdq,
+			    XFS_TRANS_DQ_BCOUNT,
+			    -(ip->i_d.di_nblocks));
+	xfs_trans_mod_dquot(tp, prevdq,
+			    XFS_TRANS_DQ_ICOUNT,
+			    -1);
+
+	/* the sparkling new dquot */
+	xfs_trans_mod_dquot(tp, newdq,
+			    XFS_TRANS_DQ_BCOUNT,
+			    ip->i_d.di_nblocks);
+	xfs_trans_mod_dquot(tp, newdq,
+			    XFS_TRANS_DQ_ICOUNT,
+			    1);
+
+	/*
+	 * Take an extra reference, because the inode
+	 * is going to keep this dquot pointer even
+	 * after the trans_commit.
+	 */
+	xfs_dqlock(newdq);
+	XFS_DQHOLD(newdq);
+	xfs_dqunlock(newdq);
+	*IO_olddq = newdq;
+
+	return (prevdq);
+}
+
+/*
+ * Quota reservations for setattr(AT_UID|AT_GID).
+ */
+int
+xfs_qm_vop_chown_reserve(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_dquot_t	*udqp,
+	xfs_dquot_t	*gdqp,
+	uint		privileged)
+{
+	int		error;
+	xfs_mount_t	*mp;
+	uint		delblks;
+	xfs_dquot_t	*unresudq, *unresgdq, *delblksudq, *delblksgdq;
+
+	ASSERT(XFS_ISLOCKED_INODE(ip));
+	mp = ip->i_mount;
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	delblks = ip->i_delayed_blks;
+	delblksudq = delblksgdq = unresudq = unresgdq = NULL;
+
+	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
+	    ip->i_d.di_uid != (uid_t)INT_GET(udqp->q_core.d_id, ARCH_CONVERT)) {
+		delblksudq = udqp;
+		/*
+		 * If there are delayed allocation blocks, then we have to
+		 * unreserve those from the old dquot, and add them to the
+		 * new dquot.
+		 */
+		if (delblks) {
+			ASSERT(ip->i_udquot);
+			unresudq = ip->i_udquot;
+		}
+	}
+	if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
+	    ip->i_d.di_gid != INT_GET(gdqp->q_core.d_id, ARCH_CONVERT)) {
+		delblksgdq = gdqp;
+		if (delblks) {
+			ASSERT(ip->i_gdquot);
+			unresgdq = ip->i_gdquot;
+		}
+	}
+
+	if ((error = xfs_trans_reserve_quota(tp, delblksudq,
+					    delblksgdq,
+					    ip->i_d.di_nblocks, 1,
+					    privileged)))
+		return (error);
+
+
+	/*
+	 * Do the delayed-blocks reservations/unreservations now. Since these
+	 * are done without the help of a transaction, if a reservation fails,
+	 * its previous reservations won't be automatically undone by the
+	 * transaction code. So, we have to do it manually here.
+	 */
+	if (delblks) {
+		/*
+		 * Do the reservations first. Unreservation can't fail.
+		 */
+		ASSERT(delblksudq || delblksgdq);
+		ASSERT(unresudq || unresgdq);
+		if ((error = xfs_trans_reserve_quota(NULL,
+						    delblksudq, delblksgdq,
+						    (xfs_qcnt_t)delblks, 0,
+						    privileged)))
+			return (error);
+		(void) xfs_trans_unreserve_quota(NULL,
+						 unresudq, unresgdq,
+						 (xfs_qcnt_t)delblks, 0,
+						 0);
+	}
+
+	return (0);
+}
+
+int
+xfs_qm_vop_rename_dqattach(
+	xfs_inode_t	**i_tab)
+{
+	xfs_inode_t	*ip;
+	int		i;
+	int		error;
+
+	ip = i_tab[0];
+
+	if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
+		error = xfs_qm_dqattach(ip, 0);
+		if (error)
+			return (error);
+	}
+	for (i = 1; (i < 4 && i_tab[i]); i++) {
+		/*
+		 * Watch out for duplicate entries in the table.
+		 */
+		if ((ip = i_tab[i]) != i_tab[i-1]) {
+			if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
+				error = xfs_qm_dqattach(ip, 0);
+				if (error)
+					return (error);
+			}
+		}
+	}
+	return (0);
+}
+
+void
+xfs_qm_vop_dqattach_and_dqmod_newinode(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_dquot_t	*udqp,
+	xfs_dquot_t	*gdqp)
+{
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
+
+	if (udqp) {
+		xfs_dqlock(udqp);
+		XFS_DQHOLD(udqp);
+		xfs_dqunlock(udqp);
+		ASSERT(ip->i_udquot == NULL);
+		ip->i_udquot = udqp;
+		ASSERT(ip->i_d.di_uid == INT_GET(udqp->q_core.d_id, ARCH_CONVERT));
+		xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
+	}
+	if (gdqp) {
+		xfs_dqlock(gdqp);
+		XFS_DQHOLD(gdqp);
+		xfs_dqunlock(gdqp);
+		ASSERT(ip->i_gdquot == NULL);
+		ip->i_gdquot = gdqp;
+		ASSERT(ip->i_d.di_gid == INT_GET(gdqp->q_core.d_id, ARCH_CONVERT));
+		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
+	}
+}
+
+/* ------------- list stuff -----------------*/
+void
+xfs_qm_freelist_init(xfs_frlist_t *ql)
+{
+	ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
+	mutex_init(&ql->qh_lock, MUTEX_DEFAULT, "dqf");
+	ql->qh_version = 0;
+	ql->qh_nelems = 0;
+}
+
+void
+xfs_qm_freelist_destroy(xfs_frlist_t *ql)
+{
+	xfs_dquot_t	*dqp, *nextdqp;
+
+	mutex_lock(&ql->qh_lock, PINOD);
+	for (dqp = ql->qh_next;
+	     dqp != (xfs_dquot_t *)ql; ) {
+		xfs_dqlock(dqp);
+		nextdqp = dqp->dq_flnext;
+#ifdef QUOTADEBUG
+		printk("FREELIST destroy 0x%p\n", dqp);
+#endif
+		XQM_FREELIST_REMOVE(dqp);
+		xfs_dqunlock(dqp);
+		xfs_qm_dqdestroy(dqp);
+		dqp = nextdqp;
+	}
+	/*
+	 * Don't bother unlocking.
+	 */
+	mutex_destroy(&ql->qh_lock);
+
+	ASSERT(ql->qh_nelems == 0);
+}
+
+void
+xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
+{
+	dq->dq_flnext = ql->qh_next;
+	dq->dq_flprev = (xfs_dquot_t *)ql;
+	ql->qh_next = dq;
+	dq->dq_flnext->dq_flprev = dq;
+	xfs_Gqm->qm_dqfreelist.qh_nelems++;
+	xfs_Gqm->qm_dqfreelist.qh_version++;
+}
+
+void
+xfs_qm_freelist_unlink(xfs_dquot_t *dq)
+{
+	xfs_dquot_t *next = dq->dq_flnext;
+	xfs_dquot_t *prev = dq->dq_flprev;
+
+	next->dq_flprev = prev;
+	prev->dq_flnext = next;
+	dq->dq_flnext = dq->dq_flprev = dq;
+	xfs_Gqm->qm_dqfreelist.qh_nelems--;
+	xfs_Gqm->qm_dqfreelist.qh_version++;
+}
+
+#ifdef QUOTADEBUG
+void
+xfs_qm_freelist_print(xfs_frlist_t *qlist, char *title)
+{
+	xfs_dquot_t *dq;
+	int i = 0;
+	printk("%s (#%d)\n", title, (int) qlist->qh_nelems);
+	FOREACH_DQUOT_IN_FREELIST(dq, qlist) {
+		printk("\t%d.\t\"%d (%s:0x%p)\"\t bcnt = %d, icnt = %d "
+		       "refs = %d\n",
+		       ++i, INT_GET(dq->q_core.d_id, ARCH_CONVERT),
+		       DQFLAGTO_TYPESTR(dq), dq,
+		       (int) INT_GET(dq->q_core.d_bcount, ARCH_CONVERT),
+		       (int) INT_GET(dq->q_core.d_icount, ARCH_CONVERT),
+		       (int) dq->q_nrefs);
+	}
+}
+#endif
+
+void
+xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
+{
+	xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
+}
+
+int
+xfs_qm_dqhashlock_nowait(
+	xfs_dquot_t *dqp)
+{
+	int locked;
+
+	locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
+	return (locked);
+}
+
+int
+xfs_qm_freelist_lock_nowait(
+	xfs_qm_t *xqm)
+{
+	int locked;
+
+	locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
+	return (locked);
+}
+
+int
+xfs_qm_mplist_nowait(
+	xfs_mount_t	*mp)
+{
+	int locked;
+
+	ASSERT(mp->m_quotainfo);
+	locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
+	return (locked);
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_qm.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_qm.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_qm.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_qm.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_QM_H__
+#define __XFS_QM_H__
+
+struct	xfs_dqhash;
+struct	xfs_inode;
+struct	xfs_dquot;
+
+extern kmem_zone_t	*qm_dqzone;
+extern kmem_zone_t	*qm_dqtrxzone;
+
+/*
+ * Used in xfs_qm_sync (called by xfs_sync) to bound the number of times it
+ * may restart iterating over the mount point's dquot list in one call.
+ */
+#define XFS_QM_SYNC_MAX_RESTARTS	7
+
+/*
+ * Ditto, for xfs_qm_dqreclaim_one.
+ */
+#define XFS_QM_RECLAIM_MAX_RESTARTS	4
+
+/*
+ * Ideal ratio of free to in use dquots. Quota manager makes an attempt
+ * to keep this balance.
+ */
+#define XFS_QM_DQFREE_RATIO		2
+
+/*
+ * Dquot hashtable constants/threshold values.
+ */
+#define XFS_QM_NCSIZE_THRESHOLD		5000
+#define XFS_QM_HASHSIZE_LOW		32
+#define XFS_QM_HASHSIZE_HIGH		64
+
+/*
+ * We output a cmn_err when quotachecking a quota file with more than
+ * this many fsbs.
+ */
+#define XFS_QM_BIG_QCHECK_NBLKS		500
+
+/*
+ * This defines the unit of allocation of dquots.
+ * Currently, it is just one file system block; a 4K block holds 30
+ * dquots (136 * 30 = 4080 bytes). It's probably not worth trying to make
+ * this more dynamic.
+ * XXXsup However, if this number is changed, we have to make sure that we don't
+ * implicitly assume that we do allocations in chunks of a single filesystem
+ * block in the dquot/xqm code.
+ */
+#define XFS_DQUOT_CLUSTER_SIZE_FSB	(xfs_filblks_t)1
+/*
+ * When doing a quotacheck, we log dquot clusters of this many FSBs at most
+ * in a single transaction. We don't want to ask for too huge a log reservation.
+ */
+#define XFS_QM_MAX_DQCLUSTER_LOGSZ	3
+
+typedef xfs_dqhash_t	xfs_dqlist_t;
+/*
+ * The freelist head. The first two fields match the first two in the
+ * xfs_dquot_t structure (in xfs_dqmarker_t)
+ */
+typedef struct xfs_frlist {
+       struct xfs_dquot *qh_next;
+       struct xfs_dquot *qh_prev;
+       mutex_t		 qh_lock;
+       uint		 qh_version;
+       uint		 qh_nelems;
+} xfs_frlist_t;
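+
+/*
+ * The freelist is circular and doubly linked. An empty list points back
+ * at its own head, as xfs_qm_freelist_init() sets it up:
+ *
+ *	ql->qh_next == ql->qh_prev == (xfs_dquot_t *)ql
+ *
+ * and that self-reference is also the termination test used when
+ * walking the list.
+ */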
+
+/*
+ * Quota Manager (global) structure. Lives only in core.
+ */
+typedef struct xfs_qm {
+	xfs_dqlist_t	*qm_usr_dqhtable;/* udquot hash table */
+	xfs_dqlist_t	*qm_grp_dqhtable;/* gdquot hash table */
+	uint		 qm_dqhashmask;	 /* # buckets in dq hashtab - 1 */
+	xfs_frlist_t	 qm_dqfreelist;	 /* freelist of dquots */
+	atomic_t	 qm_totaldquots; /* total incore dquots */
+	uint		 qm_nrefs;	 /* file systems with quota on */
+	int		 qm_dqfree_ratio;/* ratio of free to inuse dquots */
+	kmem_zone_t	*qm_dqzone;	 /* dquot mem-alloc zone */
+	kmem_zone_t	*qm_dqtrxzone;	 /* t_dqinfo of transactions */
+} xfs_qm_t;
+
+/*
+ * Various quota information for individual filesystems.
+ * The mount structure keeps a pointer to this.
+ */
+typedef struct xfs_quotainfo {
+	xfs_inode_t	*qi_uquotaip;	 /* user quota inode */
+	xfs_inode_t	*qi_gquotaip;	 /* group quota inode */
+	lock_t		 qi_pinlock;	 /* dquot pinning mutex */
+	xfs_dqlist_t	 qi_dqlist;	 /* all dquots in filesys */
+	int		 qi_dqreclaims;	 /* a change here indicates
+					    a removal in the dqlist */
+	time_t		 qi_btimelimit;	 /* limit for blks timer */
+	time_t		 qi_itimelimit;	 /* limit for inodes timer */
+	time_t		 qi_rtbtimelimit;/* limit for rt blks timer */
+	xfs_qwarncnt_t	 qi_bwarnlimit;	 /* limit for num warnings */
+	xfs_qwarncnt_t	 qi_iwarnlimit;	 /* limit for num warnings */
+	mutex_t		 qi_quotaofflock;/* to serialize quotaoff */
+	/* Some useful precalculated constants */
+	xfs_filblks_t	 qi_dqchunklen;	 /* # BBs in a chunk of dqs */
+	uint		 qi_dqperchunk;	 /* # ondisk dqs in above chunk */
+} xfs_quotainfo_t;
+
+
+/*
+ * The structure kept inside the xfs_trans_t to keep track of dquot changes
+ * within a transaction, so they can be applied later.
+ */
+typedef struct xfs_dqtrx {
+	struct xfs_dquot *qt_dquot;	  /* the dquot this refers to */
+	ulong		qt_blk_res;	  /* blks reserved on a dquot */
+	ulong		qt_blk_res_used;  /* blks used from the reservation */
+	ulong		qt_ino_res;	  /* inode reserved on a dquot */
+	ulong		qt_ino_res_used;  /* inodes used from the reservation */
+	long		qt_bcount_delta;  /* dquot blk count changes */
+	long		qt_delbcnt_delta; /* delayed dquot blk count changes */
+	long		qt_icount_delta;  /* dquot inode count changes */
+	ulong		qt_rtblk_res;	  /* # blks reserved on a dquot */
+	ulong		qt_rtblk_res_used;/* # blks used from reservation */
+	long		qt_rtbcount_delta;/* dquot realtime blk changes */
+	long		qt_delrtb_delta;  /* delayed RT blk count changes */
+} xfs_dqtrx_t;
+
+/*
+ * We keep the usr and grp dquots separately so that locking will be easier
+ * to do at commit time. All transactions that we know of at this point
+ * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
+ */
+#define XFS_QM_TRANS_MAXDQS		2
+typedef struct xfs_dquot_acct {
+	xfs_dqtrx_t	dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
+	xfs_dqtrx_t	dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
+} xfs_dquot_acct_t;
+
+/*
+ * Users are allowed to have a usage exceeding their softlimit for
+ * a period this long.
+ */
+#define XFS_QM_BTIMELIMIT	DQ_BTIMELIMIT
+#define XFS_QM_RTBTIMELIMIT	DQ_BTIMELIMIT
+#define XFS_QM_ITIMELIMIT	DQ_FTIMELIMIT
+
+#define XFS_QM_BWARNLIMIT	5
+#define XFS_QM_IWARNLIMIT	5
+
+#define XFS_QM_LOCK(xqm)	(mutex_lock(&xqm##_lock, PINOD))
+#define XFS_QM_UNLOCK(xqm)	(mutex_unlock(&xqm##_lock))
+#define XFS_QM_HOLD(xqm)	((xqm)->qm_nrefs++)
+#define XFS_QM_RELE(xqm)	((xqm)->qm_nrefs--)
+
+extern int		xfs_qm_init_quotainfo(xfs_mount_t *);
+extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
+extern void		xfs_qm_dqunlink(xfs_dquot_t *);
+extern boolean_t	xfs_qm_dqalloc_incore(xfs_dquot_t **);
+extern int		xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
+
+/* list stuff */
+extern void		xfs_qm_freelist_init(xfs_frlist_t *);
+extern void		xfs_qm_freelist_destroy(xfs_frlist_t *);
+extern void		xfs_qm_freelist_insert(xfs_frlist_t *, xfs_dquot_t *);
+extern void		xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
+extern void		xfs_qm_freelist_unlink(xfs_dquot_t *);
+extern int		xfs_qm_freelist_lock_nowait(xfs_qm_t *);
+extern int		xfs_qm_mplist_nowait(xfs_mount_t *);
+extern int		xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
+
+/* system call interface */
+extern int linvfs_getxstate(struct super_block *, struct fs_quota_stat *);
+extern int linvfs_setxstate(struct super_block *, unsigned int, int);
+extern int linvfs_getxquota(struct super_block *, int, qid_t, struct fs_disk_quota *);
+extern int linvfs_setxquota(struct super_block *, int, qid_t, struct fs_disk_quota *);
+
+#ifdef DEBUG
+extern int		xfs_qm_internalqcheck(xfs_mount_t *);
+#else
+#define xfs_qm_internalqcheck(mp)	(0)
+#endif
+
+#ifdef QUOTADEBUG
+extern void		xfs_qm_freelist_print(xfs_frlist_t *, char *);
+#else
+#define xfs_qm_freelist_print(a, b)	do { } while (0)
+#endif
+
+#endif /* __XFS_QM_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_qm_syscalls.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_qm_syscalls.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_qm_syscalls.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_qm_syscalls.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1465 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_quota_priv.h>
+
+#ifdef DEBUG
+# define qdprintk(s, args...)		printk(s, ## args)
+#else
+# define qdprintk(s, args...)		do { } while (0)
+#endif
+
+STATIC int	xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
+STATIC int	xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
+					fs_disk_quota_t *);
+STATIC int	xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
+STATIC int	xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
+					fs_disk_quota_t *);
+STATIC int	xfs_qm_scall_quotaon(xfs_mount_t *, uint);
+STATIC int	xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
+STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
+STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
+					uint);
+STATIC uint	xfs_qm_import_flags(uint);
+STATIC uint	xfs_qm_export_flags(uint);
+STATIC uint	xfs_qm_import_qtype_flags(uint);
+STATIC uint	xfs_qm_export_qtype_flags(uint);
+STATIC void	xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
+					fs_disk_quota_t *);
+
+
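+/*
+ * A note on the sign convention in the wrappers below: the
+ * xfs_qm_scall_* routines return positive errno values (the XFS
+ * convention), so each linvfs_* entry point negates the result before
+ * handing it back to the Linux VFS, which expects negative errnos.
+ */
+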
+int
+linvfs_getxstate(
+	struct super_block	*sb,
+	struct fs_quota_stat	*fqs)
+{
+	xfs_mount_t		*mp;
+	vfs_t			*vfsp;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+	return -xfs_qm_scall_getqstat(mp, fqs);
+}
+
+int
+linvfs_setxstate(
+	struct super_block	*sb,
+	unsigned int		flags,
+	int			op)
+{
+	xfs_mount_t		*mp;
+	vfs_t			*vfsp;
+	uint			qflags;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+	if (vfsp->vfs_flag & VFS_RDONLY)
+		return -EROFS;
+
+	switch (op) {
+	case Q_XQUOTARM:
+		if (XFS_IS_QUOTA_ON(mp)) {
+			qdprintk("cannot remove, quota on: flags=%x\n", flags);
+			return -EINVAL;
+		}
+		qflags = xfs_qm_import_qtype_flags(flags);
+		return -xfs_qm_scall_trunc_qfiles(mp, qflags);
+	case Q_XQUOTAON:
+		qflags = xfs_qm_import_flags(flags);
+		return -xfs_qm_scall_quotaon(mp, qflags);
+	case Q_XQUOTAOFF:
+		qflags = xfs_qm_import_flags(flags);
+		if (mp->m_dev == rootdev)
+			return -xfs_qm_scall_quotaoff(mp, qflags, B_FALSE);
+		if (!XFS_IS_QUOTA_ON(mp))
+			return -ESRCH;
+		return -xfs_qm_scall_quotaoff(mp, qflags, B_FALSE);
+	}
+	qdprintk("cannot set state, invalid op: op=%x flags=%x\n", op, flags);
+	return -EINVAL;
+}
+
+int
+linvfs_getxquota(
+	struct super_block	*sb,
+	int			type,
+	qid_t			id,
+	struct fs_disk_quota	*fdq)
+{
+	xfs_mount_t		*mp;
+	vfs_t			*vfsp;
+	int			qtype;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+	qtype = (type == GRPQUOTA)? XFS_DQ_GROUP : XFS_DQ_USER;
+	return -xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, qtype, fdq);
+}
+
+int
+linvfs_setxquota(
+	struct super_block	*sb,
+	int			type,
+	qid_t			id,
+	struct fs_disk_quota	*fdq)
+{
+	xfs_mount_t		*mp;
+	vfs_t			*vfsp;
+	int			qtype;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+	if (vfsp->vfs_flag & VFS_RDONLY)
+		return -EROFS;
+	qtype = (type == GRPQUOTA)? XFS_DQ_GROUP : XFS_DQ_USER;
+	return -xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, qtype, fdq);
+}
+
+
+/*
+ * Turn off quota accounting and/or enforcement for all udquots and/or
+ * gdquots. Called only at unmount time.
+ *
+ * This assumes that there are no dquots of this file system cached
+ * incore, and modifies the ondisk dquot directly. Therefore, for example,
+ * it is an error to call this twice, without purging the cache.
+ */
+STATIC int
+xfs_qm_scall_quotaoff(
+	xfs_mount_t		*mp,
+	uint			flags,
+	boolean_t		force)
+{
+	uint			dqtype;
+	unsigned long	s;
+	int			error;
+	uint			inactivate_flags;
+	xfs_qoff_logitem_t	*qoffstart;
+	uint			sbflags, newflags;
+	int			nculprits;
+
+	if (!force && !capable(CAP_SYS_ADMIN))
+		return XFS_ERROR(EPERM);
+	/*
+	 * Only the root file system can have quotas enabled on disk but not
+	 * in core. Note that quota utilities (like quotaoff) _expect_
+	 * errno == EEXIST here.
+	 */
+	if (mp->m_dev != rootdev && (mp->m_qflags & flags) == 0)
+		return XFS_ERROR(EEXIST);
+	error = 0;
+
+	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
+
+	/*
+	 * We don't want to deal with two quotaoffs messing up each other,
+	 * so we're going to serialize it. quotaoff isn't exactly a performance
+	 * critical thing.
+	 * If there is no quotainfo, we must be dealing with the root filesystem.
+	 */
+	ASSERT(mp->m_quotainfo || mp->m_dev == rootdev);
+	if (mp->m_quotainfo)
+		mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
+
+	/*
+	 * Root file system may or may not have quotas on in core.
+	 * We have to perform the quotaoff accordingly.
+	 */
+	if (mp->m_dev == rootdev) {
+		s = XFS_SB_LOCK(mp);
+		sbflags = mp->m_sb.sb_qflags;
+		if ((mp->m_qflags & flags) == 0) {
+			mp->m_sb.sb_qflags &= ~(flags);
+			newflags = mp->m_sb.sb_qflags;
+			XFS_SB_UNLOCK(mp, s);
+			if (mp->m_quotainfo)
+				mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+			if (sbflags != newflags)
+			      error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
+			return (error);
+		}
+		XFS_SB_UNLOCK(mp, s);
+
+		if ((sbflags & flags) != (mp->m_qflags & flags)) {
+			/*
+			 * This can happen only with grp+usr quota
+			 * combination. Note: 1) accounting cannot be turned
+			 * off without enforcement also getting turned off.
+			 * 2) Every flag that exists in mpqflags MUST exist
+			 * in sbqflags (but not vice versa), which means that
+			 * at this point sbqflags = UQ+GQ+.. and mpqflags =
+			 * UQ or GQ.
+			 */
+			ASSERT(sbflags & XFS_GQUOTA_ACCT);
+			ASSERT((sbflags & XFS_ALL_QUOTA_ACCT) !=
+			       (mp->m_qflags & XFS_ALL_QUOTA_ACCT));
+
+			qdprintk("quotaoff, sbflags=%x flags=%x m_qflags=%x\n",
+				sbflags, flags, mp->m_qflags);
+			/* XXX TBD Finish this for group quota support */
+			/* We need to update the SB and mp separately */
+			return XFS_ERROR(EINVAL);
+		}
+	}
+	ASSERT(mp->m_quotainfo);
+
+	/*
+	 * If we're just turning off quota enforcement, change mp and go.
+	 */
+	if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
+		mp->m_qflags &= ~(flags);
+
+		s = XFS_SB_LOCK(mp);
+		mp->m_sb.sb_qflags = mp->m_qflags;
+		XFS_SB_UNLOCK(mp, s);
+		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+
+		/* XXX what to do if error ? Revert back to old vals incore ? */
+		error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
+		return (error);
+	}
+
+	dqtype = 0;
+	inactivate_flags = 0;
+	/*
+	 * If accounting is turned off, we must also turn enforcement off
+	 * and clear the quota 'CHKD' certificate to make it known that we
+	 * have to do a quotacheck the next time this quota is turned on.
+	 */
+	if (flags & XFS_UQUOTA_ACCT) {
+		dqtype |= XFS_QMOPT_UQUOTA;
+		flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
+		inactivate_flags |= XFS_UQUOTA_ACTIVE;
+	}
+	if (flags & XFS_GQUOTA_ACCT) {
+		dqtype |= XFS_QMOPT_GQUOTA;
+		flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
+		inactivate_flags |= XFS_GQUOTA_ACTIVE;
+	}
+
+	/*
+	 * Nothing to do? Don't complain.
+	 * This happens when we're just turning off quota enforcement.
+	 */
+	if ((mp->m_qflags & flags) == 0) {
+		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+		return (0);
+	}
+
+	/*
+	 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
+	 * and synchronously.
+	 */
+	xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+
+	/*
+	 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
+	 * to take care of the race between dqget and quotaoff. We don't take
+	 * any special locks to reset these bits. All processes need to check
+	 * these bits *after* taking inode lock(s) to see if the particular
+	 * quota type is in the process of being turned off. If *ACTIVE, it is
+	 * guaranteed that all dquot structures and all quotainode ptrs will
+	 * stay valid as long as that inode is kept locked.
+	 *
+	 * There is no turning back after this.
+	 */
+	mp->m_qflags &= ~inactivate_flags;
+
+	/*
+	 * Give back all the dquot reference(s) held by inodes.
+	 * Here we go through every single incore inode in this file system,
+	 * and do a dqrele on the i_udquot/i_gdquot that it may have.
+	 * Essentially, as long as somebody has an inode locked, this guarantees
+	 * that quotas will not be turned off. This is handy because in a
+	 * transaction once we lock the inode(s) and check for quotaon, we can
+	 * depend on the quota inodes (and other things) being valid as long as
+	 * we keep the lock(s).
+	 */
+	xfs_qm_dqrele_all_inodes(mp, flags);
+
+	/*
+	 * Next we make the changes in the quota flag in the mount struct.
+	 * This isn't protected by a particular lock directly, because we
+	 * don't want to take an mrlock every time we depend on quotas being on.
+	 */
+	mp->m_qflags &= ~(flags);
+
+	/*
+	 * Go through all the dquots of this file system and purge them,
+	 * according to what was turned off. We may not be able to get rid
+	 * of all dquots, because dquots can have temporary references that
+	 * are not attached to inodes, e.g. xfs_setattr, xfs_create.
+	 * So, if we couldn't purge all the dquots from the filesystem,
+	 * we can't get rid of the incore data structures.
+	 */
+	while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) {
+		delay(10 * nculprits);
+	}
+
+	/*
+	 * Transactions that had started before ACTIVE state bit was cleared
+	 * could have logged many dquots, so they'd have higher LSNs than
+	 * the first QUOTAOFF log record does. If we happen to crash when
+	 * the tail of the log has gone past the QUOTAOFF record, but
+	 * before the last dquot modification, those dquots __will__
+	 * recover, and that's not good.
+	 *
+	 * So, we have QUOTAOFF start and end logitems; the start
+	 * logitem won't get overwritten until the end logitem appears...
+	 */
+	xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+
+	/*
+	 * If quota is completely disabled, close up shop.
+	 */
+	if ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_ALL) {
+		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+		xfs_qm_destroy_quotainfo(mp);
+		return (0);
+	}
+
+	/*
+	 * Release our quotainode references, and vn_purge them,
+	 * if we don't need them anymore.
+	 */
+	if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) {
+		XFS_PURGE_INODE(XFS_QI_UQIP(mp));
+		XFS_QI_UQIP(mp) = NULL;
+	}
+	if ((dqtype & XFS_QMOPT_GQUOTA) && XFS_QI_GQIP(mp)) {
+		XFS_PURGE_INODE(XFS_QI_GQIP(mp));
+		XFS_QI_GQIP(mp) = NULL;
+	}
+	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+
+	return (error);
+}
+
+STATIC int
+xfs_qm_scall_trunc_qfiles(
+	xfs_mount_t	*mp,
+	uint		flags)
+{
+	int		error;
+	xfs_inode_t	*qip;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return XFS_ERROR(EPERM);
+	error = 0;
+	if (!XFS_SB_VERSION_HASQUOTA(&mp->m_sb) || flags == 0) {
+		qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
+		return XFS_ERROR(EINVAL);
+	}
+
+	if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
+		error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, &qip, 0);
+		if (! error) {
+			(void) xfs_truncate_file(mp, qip);
+			VN_RELE(XFS_ITOV(qip));
+		}
+	}
+
+	if ((flags & XFS_DQ_GROUP) && mp->m_sb.sb_gquotino != NULLFSINO) {
+		error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, &qip, 0);
+		if (! error) {
+			(void) xfs_truncate_file(mp, qip);
+			VN_RELE(XFS_ITOV(qip));
+		}
+	}
+
+	return (error);
+}
+
+
+/*
+ * This performs two separate functions:
+ * 1. Switch on quotas for the root file system. This takes effect only
+ *    on reboot.
+ * 2. Switch on (a given) quota enforcement for both root and non-root
+ *    filesystems. This takes effect immediately.
+ */
+STATIC int
+xfs_qm_scall_quotaon(
+	xfs_mount_t	*mp,
+	uint		flags)
+{
+	int		error;
+	unsigned long s;
+	uint		qf;
+	uint		accflags;
+	__int64_t	sbflags;
+	boolean_t	rootfs;
+	boolean_t	delay;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return XFS_ERROR(EPERM);
+
+	rootfs = (boolean_t) (mp->m_dev == rootdev);
+
+	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
+	/*
+	 * If the caller wants to turn on accounting on /, but accounting
+	 * is already turned on, ignore the accounting flags.
+	 * Switching on quota accounting for non-root filesystems
+	 * must be done at mount time.
+	 */
+	accflags = flags & XFS_ALL_QUOTA_ACCT;
+	if (!rootfs ||
+	    (accflags && rootfs && ((mp->m_qflags & accflags) == accflags))) {
+		flags &= ~(XFS_ALL_QUOTA_ACCT);
+	}
+
+	sbflags = 0;
+	delay = (boolean_t) ((flags & XFS_ALL_QUOTA_ACCT) != 0);
+
+	if (flags == 0) {
+		qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags);
+		return XFS_ERROR(EINVAL);
+	}
+
+	/* Only rootfs can turn on quotas with a delayed effect */
+	ASSERT(!delay || rootfs);
+
+	/*
+	 * Can't enforce without accounting. We check the superblock
+	 * qflags here instead of m_qflags because the root fs can have
+	 * quota accounting on disk without m_qflags knowing about it.
+	 */
+	if (((flags & XFS_UQUOTA_ACCT) == 0 &&
+	    (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+	    (flags & XFS_UQUOTA_ENFD))
+	    ||
+	    ((flags & XFS_GQUOTA_ACCT) == 0 &&
+	    (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+	    (flags & XFS_GQUOTA_ENFD))) {
+		qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n",
+			flags, mp->m_sb.sb_qflags);
+		return XFS_ERROR(EINVAL);
+	}
+	/*
+	 * If everything's up to date incore, then don't waste time.
+	 */
+	if ((mp->m_qflags & flags) == flags)
+		return XFS_ERROR(EEXIST);
+
+	/*
+	 * Change superblock version (if needed) for the root filesystem
+	 */
+	if (rootfs && !XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
+		qdprintk("Old superblock version %x\n", mp->m_sb.sb_versionnum);
+		s = XFS_SB_LOCK(mp);
+		XFS_SB_VERSION_ADDQUOTA(&mp->m_sb);
+		mp->m_sb.sb_uquotino = NULLFSINO;
+		mp->m_sb.sb_gquotino = NULLFSINO;
+		mp->m_sb.sb_qflags = 0;
+		XFS_SB_UNLOCK(mp, s);
+		qdprintk("Converted to version %x\n", mp->m_sb.sb_versionnum);
+		sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+			    XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
+	}
+
+	/*
+	 * Change sb_qflags on disk but not incore mp->qflags
+	 * if this is the root filesystem.
+	 */
+	s = XFS_SB_LOCK(mp);
+	qf = mp->m_sb.sb_qflags;
+	mp->m_sb.sb_qflags = qf | flags;
+	XFS_SB_UNLOCK(mp, s);
+
+	/*
+	 * There's nothing to change if it's the same.
+	 */
+	if ((qf & flags) == flags && sbflags == 0)
+		return XFS_ERROR(EEXIST);
+	sbflags |= XFS_SB_QFLAGS;
+
+	if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
+		return (error);
+	/*
+	 * If we had just turned on quotas (ondisk) for rootfs, or if we aren't
+	 * trying to switch on quota enforcement, we are done.
+	 */
+	if (delay ||
+	    ((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) !=
+	     (mp->m_qflags & XFS_UQUOTA_ACCT)) ||
+	    (flags & XFS_ALL_QUOTA_ENFD) == 0)
+		return (0);
+
+	if (! XFS_IS_QUOTA_RUNNING(mp))
+		return XFS_ERROR(ESRCH);
+
+	/*
+	 * Switch on quota enforcement in core. This applies to both root
+	 * and non-root file systems.
+	 */
+	mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
+	mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
+	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+
+	return (0);
+}
+
+
+
+/*
+ * Return quota status information, such as uquota-off, enforcements, etc.
+ */
+STATIC int
+xfs_qm_scall_getqstat(
+	xfs_mount_t	*mp,
+	fs_quota_stat_t *out)
+{
+	xfs_inode_t	*uip, *gip;
+	boolean_t	tempuqip, tempgqip;
+	__uint16_t	sbflags;
+
+	uip = gip = NULL;
+	tempuqip = tempgqip = B_FALSE;
+	bzero(out, sizeof(fs_quota_stat_t));
+
+	out->qs_version = FS_QSTAT_VERSION;
+	if (! XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
+		out->qs_uquota.qfs_ino = NULLFSINO;
+		out->qs_gquota.qfs_ino = NULLFSINO;
+		return (0);
+	}
+	out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
+							(XFS_ALL_QUOTA_ACCT|
+							 XFS_ALL_QUOTA_ENFD));
+	/*
+	 * If the qflags are different on disk, as can be the case when the
+	 * root filesystem's quotas are being turned on, return them in the
+	 * high 8 bits.
+	 */
+	if (mp->m_dev == rootdev) {
+		sbflags = (__uint16_t) xfs_qm_export_flags(mp->m_sb.sb_qflags &
+							   (XFS_ALL_QUOTA_ACCT|
+							    XFS_ALL_QUOTA_ENFD));
+		ASSERT((out->qs_flags & 0xff00) == 0);
+		if (sbflags != out->qs_flags)
+			out->qs_flags |= ((sbflags & 0x00ff) << 8);
+	}
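+	/*
+	 * Worked example (illustrative; assumes XFS_QUOTA_UDQ_ACCT is
+	 * bit 0 and XFS_QUOTA_UDQ_ENFD is bit 1 of the exported flags):
+	 * incore user accounting alone exports as 0x0001; if sb_qflags
+	 * also carries user enforcement, sbflags exports as 0x0003 and
+	 * qs_flags ends up as 0x0301.
+	 */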
+
+	out->qs_pad = 0;
+	out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
+	out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+	if (mp->m_quotainfo) {
+		uip = mp->m_quotainfo->qi_uquotaip;
+		gip = mp->m_quotainfo->qi_gquotaip;
+	}
+	if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
+		if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, &uip, 0) == 0)
+			tempuqip = B_TRUE;
+	}
+	if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
+		if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, &gip, 0) == 0)
+			tempgqip = B_TRUE;
+	}
+	if (uip) {
+		out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
+		out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
+		if (tempuqip)
+			VN_RELE(XFS_ITOV(uip));
+	}
+	if (gip) {
+		out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
+		out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
+		if (tempgqip)
+			VN_RELE(XFS_ITOV(gip));
+	}
+	if (mp->m_quotainfo) {
+		out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
+		out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp);
+		out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp);
+		out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp);
+		out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp);
+		out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp);
+	}
+	return (0);
+}
+
+/*
+ * Adjust quota limits, and start/stop timers accordingly.
+ */
+STATIC int
+xfs_qm_scall_setqlim(
+	xfs_mount_t		*mp,
+	xfs_dqid_t		id,
+	uint			type,
+	fs_disk_quota_t		*newlim)
+{
+	xfs_disk_dquot_t	*ddq;
+	xfs_dquot_t		*dqp;
+	xfs_trans_t		*tp;
+	int			error;
+	xfs_qcnt_t		hard, soft;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return XFS_ERROR(EPERM);
+
+	if ((newlim->d_fieldmask & (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK)) == 0)
+		return (0);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
+	if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
+				      0, 0, XFS_DEFAULT_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		return (error);
+	}
+
+	/*
+	 * We don't want to race with a quotaoff, so take the quotaoff lock.
+	 * (We don't hold an inode lock, so there's nothing else to stop
+	 * a quotaoff from happening.)  (XXX this doesn't currently happen,
+	 * because we take the vfslock before calling xfs_qm_sysent.)
+	 */
+	mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
+
+	/*
+	 * Get the dquot (locked), and join it to the transaction.
+	 * Allocate the dquot if this doesn't exist.
+	 */
+	if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
+		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+		ASSERT(error != ENOENT);
+		return (error);
+	}
+	xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET");
+	xfs_trans_dqjoin(tp, dqp);
+	ddq = &dqp->q_core;
+
+	/*
+	 * Make sure that hard limits are >= soft limits before changing.
+	 */
+	hard = (newlim->d_fieldmask & FS_DQ_BHARD) ?
+		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) :
+			INT_GET(ddq->d_blk_hardlimit, ARCH_CONVERT);
+	soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ?
+		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) :
+			INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT);
+	if (hard == 0 || hard >= soft) {
+		INT_SET(ddq->d_blk_hardlimit, ARCH_CONVERT, hard);
+		INT_SET(ddq->d_blk_softlimit, ARCH_CONVERT, soft);
+	} else {
+		qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
+	}
+	hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
+		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
+			INT_GET(ddq->d_rtb_hardlimit, ARCH_CONVERT);
+	soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ?
+		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) :
+			INT_GET(ddq->d_rtb_softlimit, ARCH_CONVERT);
+	if (hard == 0 || hard >= soft) {
+		INT_SET(ddq->d_rtb_hardlimit, ARCH_CONVERT, hard);
+		INT_SET(ddq->d_rtb_softlimit, ARCH_CONVERT, soft);
+	} else
+		qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
+
+	hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
+		(xfs_qcnt_t) newlim->d_ino_hardlimit :
+		INT_GET(ddq->d_ino_hardlimit, ARCH_CONVERT);
+	soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ?
+		(xfs_qcnt_t) newlim->d_ino_softlimit :
+		INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT);
+	if (hard == 0 || hard >= soft) {
+		INT_SET(ddq->d_ino_hardlimit, ARCH_CONVERT, hard);
+		INT_SET(ddq->d_ino_softlimit, ARCH_CONVERT, soft);
+	} else
+		qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
+
+	if (id == 0) {
+		/*
+		 * Time limits set for the superuser determine how long
+		 * other users can be over quota on this filesystem.
+		 * If a given limit is zero, a default is used.
+		 */
+		if (newlim->d_fieldmask & FS_DQ_BTIMER) {
+			mp->m_quotainfo->qi_btimelimit = newlim->d_btimer;
+			INT_SET(dqp->q_core.d_btimer, ARCH_CONVERT, newlim->d_btimer);
+		}
+		if (newlim->d_fieldmask & FS_DQ_ITIMER) {
+			mp->m_quotainfo->qi_itimelimit = newlim->d_itimer;
+			INT_SET(dqp->q_core.d_itimer, ARCH_CONVERT, newlim->d_itimer);
+		}
+		if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
+			mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer;
+			INT_SET(dqp->q_core.d_rtbtimer, ARCH_CONVERT, newlim->d_rtbtimer);
+		}
+	} else /* if (XFS_IS_QUOTA_ENFORCED(mp)) */ {
+		/*
+		 * If the user is now over quota, start the timelimit.
+		 * The user will not be 'warned'.
+		 * Note that we keep the timers ticking, whether enforcement
+		 * is on or off. We don't really want to bother with iterating
+		 * over all ondisk dquots and turning the timers on/off.
+		 */
+		xfs_qm_adjust_dqtimers(mp, ddq);
+	}
+	dqp->dq_flags |= XFS_DQ_DIRTY;
+	xfs_trans_log_dquot(tp, dqp);
+
+	xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
+	xfs_trans_commit(tp, 0, NULL);
+	xfs_qm_dqprint(dqp);
+	xfs_qm_dqrele(dqp);
+	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+
+	return (0);
+}
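+
+/*
+ * Illustrative caller sketch for the routine above (values hypothetical):
+ * to raise the block hard limit of user id 512 to 8000 basic blocks,
+ * the quotactl path would fill in something like
+ *
+ *	fs_disk_quota_t nl = { 0 };
+ *	nl.d_fieldmask = FS_DQ_BHARD;
+ *	nl.d_blk_hardlimit = 8000;	(in 512-byte basic blocks)
+ *	error = xfs_qm_scall_setqlim(mp, 512, XFS_DQ_USER, &nl);
+ */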
+
+STATIC int
+xfs_qm_scall_getquota(
+	xfs_mount_t	*mp,
+	xfs_dqid_t	id,
+	uint		type,
+	fs_disk_quota_t *out)
+{
+	xfs_dquot_t	*dqp;
+	int		error;
+
+	/*
+	 * Try to get the dquot. We don't want it allocated on disk, so
+	 * we aren't passing the XFS_QMOPT_DQALLOC flag. If it doesn't
+	 * exist, we'll get ENOENT back.
+	 */
+	if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
+		return (error);
+	}
+
+	xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS");
+	/*
+	 * If every counter and limit field is zero, this dquot doesn't
+	 * quite exist as far as our utility programs are concerned.
+	 */
+	if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+		xfs_qm_dqput(dqp);
+		return XFS_ERROR(ENOENT);
+	}
+	/* xfs_qm_dqprint(dqp); */
+	/*
+	 * Convert the disk dquot to the exportable format
+	 */
+	xfs_qm_export_dquot(mp, &dqp->q_core, out);
+	xfs_qm_dqput(dqp);
+	return (error ? XFS_ERROR(EFAULT) : 0);
+}
+
+
+STATIC int
+xfs_qm_log_quotaoff_end(
+	xfs_mount_t		*mp,
+	xfs_qoff_logitem_t	*startqoff,
+	uint			flags)
+{
+	xfs_trans_t	       *tp;
+	int			error;
+	xfs_qoff_logitem_t     *qoffi;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
+
+	if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
+				      0, 0, XFS_DEFAULT_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		return (error);
+	}
+
+	qoffi = xfs_trans_get_qoff_item(tp, startqoff,
+					flags & XFS_ALL_QUOTA_ACCT);
+	xfs_trans_log_quotaoff_item(tp, qoffi);
+
+	/*
+	 * We have to make sure that the transaction is secure on disk before we
+	 * return and actually stop quota accounting. So, make it synchronous.
+	 * We don't care about quotaoff's performance.
+	 */
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0, NULL);
+	return (error);
+}
+
+
+STATIC int
+xfs_qm_log_quotaoff(
+	xfs_mount_t	       *mp,
+	xfs_qoff_logitem_t     **qoffstartp,
+	uint		       flags)
+{
+	xfs_trans_t	       *tp;
+	int			error;
+	unsigned long	s;
+	xfs_qoff_logitem_t     *qoffi=NULL;
+	uint			oldsbqflag=0;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
+	if ((error = xfs_trans_reserve(tp, 0,
+				      sizeof(xfs_qoff_logitem_t) * 2 +
+				      mp->m_sb.sb_sectsize + 128,
+				      0,
+				      0,
+				      XFS_DEFAULT_LOG_COUNT))) {
+		goto error0;
+	}
+
+	qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
+	xfs_trans_log_quotaoff_item(tp, qoffi);
+
+	s = XFS_SB_LOCK(mp);
+	oldsbqflag = mp->m_sb.sb_qflags;
+	mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
+	XFS_SB_UNLOCK(mp, s);
+
+	xfs_mod_sb(tp, XFS_SB_QFLAGS);
+
+	/*
+	 * We have to make sure that the transaction is secure on disk before we
+	 * return and actually stop quota accounting. So, make it synchronous.
+	 * We don't care about quotaoff's performance.
+	 */
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0, NULL);
+
+error0:
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		/*
+		 * No one else is modifying sb_qflags, so this is OK.
+		 * We still hold the quotaofflock.
+		 */
+		s = XFS_SB_LOCK(mp);
+		mp->m_sb.sb_qflags = oldsbqflag;
+		XFS_SB_UNLOCK(mp, s);
+	}
+	*qoffstartp = qoffi;
+	return (error);
+}
+
+
+/*
+ * Translate an internal-style on-disk dquot to the exportable format.
+ * The main differences are that the counters/limits are all in Basic
+ * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
+ * to be converted to the native endianness.
+ */
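+/*
+ * For example (illustrative, assuming a 4k filesystem block size):
+ * one FSB is eight 512-byte BBs, so an internal d_blk_hardlimit of
+ * 100 FSBs is exported as d_blk_hardlimit = 800 BBs.
+ */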
+STATIC void
+xfs_qm_export_dquot(
+	xfs_mount_t		*mp,
+	xfs_disk_dquot_t	*src,
+	struct fs_disk_quota	*dst)
+{
+	bzero(dst, sizeof(*dst));
+	dst->d_version = FS_DQUOT_VERSION;  /* different from src->d_version */
+	dst->d_flags =
+		xfs_qm_export_qtype_flags(INT_GET(src->d_flags, ARCH_CONVERT));
+	dst->d_id = INT_GET(src->d_id, ARCH_CONVERT);
+	dst->d_blk_hardlimit = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_blk_hardlimit, ARCH_CONVERT));
+	dst->d_blk_softlimit = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_blk_softlimit, ARCH_CONVERT));
+	dst->d_ino_hardlimit = (__uint64_t)
+		INT_GET(src->d_ino_hardlimit, ARCH_CONVERT);
+	dst->d_ino_softlimit = (__uint64_t)
+		INT_GET(src->d_ino_softlimit, ARCH_CONVERT);
+	dst->d_bcount = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_bcount, ARCH_CONVERT));
+	dst->d_icount = (__uint64_t) INT_GET(src->d_icount, ARCH_CONVERT);
+	dst->d_btimer = (__uint32_t) INT_GET(src->d_btimer, ARCH_CONVERT);
+	dst->d_itimer = (__uint32_t) INT_GET(src->d_itimer, ARCH_CONVERT);
+	dst->d_iwarns = INT_GET(src->d_iwarns, ARCH_CONVERT);
+	dst->d_bwarns = INT_GET(src->d_bwarns, ARCH_CONVERT);
+
+	dst->d_rtb_hardlimit = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_rtb_hardlimit, ARCH_CONVERT));
+	dst->d_rtb_softlimit = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_rtb_softlimit, ARCH_CONVERT));
+	dst->d_rtbcount = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_rtbcount, ARCH_CONVERT));
+	dst->d_rtbtimer = (__uint32_t) INT_GET(src->d_rtbtimer, ARCH_CONVERT);
+	dst->d_rtbwarns = INT_GET(src->d_rtbwarns, ARCH_CONVERT);
+
+	/*
+	 * Internally, we don't reset all the timers when quota enforcement
+	 * gets turned off. No need to confuse the userlevel code,
+	 * so return zeroes in that case.
+	 */
+	if (! XFS_IS_QUOTA_ENFORCED(mp)) {
+		dst->d_btimer = 0;
+		dst->d_itimer = 0;
+		dst->d_rtbtimer = 0;
+	}
+
+#ifdef DEBUG
+	if (XFS_IS_QUOTA_ENFORCED(mp) && dst->d_id != 0) {
+		if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
+		    (dst->d_blk_softlimit > 0)) {
+			ASSERT(dst->d_btimer != 0);
+		}
+		if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
+		    (dst->d_ino_softlimit > 0)) {
+			ASSERT(dst->d_itimer != 0);
+		}
+	}
+#endif
+}
+
+STATIC uint
+xfs_qm_import_qtype_flags(
+	uint uflags)
+{
+	/*
+	 * Can't be both at the same time.
+	 */
+	if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
+	     (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
+	    ((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) == 0))
+		return (0);
+
+	return (uflags & XFS_USER_QUOTA) ?
+		XFS_DQ_USER : XFS_DQ_GROUP;
+}
+
+STATIC uint
+xfs_qm_export_qtype_flags(
+	uint flags)
+{
+	/*
+	 * Can't be both at the same time.
+	 */
+	ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) !=
+		(XFS_GROUP_QUOTA | XFS_USER_QUOTA));
+	ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) != 0);
+
+	return (flags & XFS_DQ_USER) ?
+		XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+}
+
+STATIC uint
+xfs_qm_import_flags(
+	uint uflags)
+{
+	uint flags = 0;
+
+	if (uflags & XFS_QUOTA_UDQ_ACCT)
+		flags |= XFS_UQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_GDQ_ACCT)
+		flags |= XFS_GQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_UDQ_ENFD)
+		flags |= XFS_UQUOTA_ENFD;
+	if (uflags & XFS_QUOTA_GDQ_ENFD)
+		flags |= XFS_GQUOTA_ENFD;
+	return (flags);
+}
+
+
+STATIC uint
+xfs_qm_export_flags(
+	uint flags)
+{
+	uint uflags;
+
+	uflags = 0;
+	if (flags & XFS_UQUOTA_ACCT)
+		uflags |= XFS_QUOTA_UDQ_ACCT;
+	if (flags & XFS_GQUOTA_ACCT)
+		uflags |= XFS_QUOTA_GDQ_ACCT;
+	if (flags & XFS_UQUOTA_ENFD)
+		uflags |= XFS_QUOTA_UDQ_ENFD;
+	if (flags & XFS_GQUOTA_ENFD)
+		uflags |= XFS_QUOTA_GDQ_ENFD;
+	return (uflags);
+}
+
+
+/*
+ * Go through all the inodes in the file system, releasing their dquots.
+ * Note that the mount structure gets modified to indicate that quotas are off
+ * AFTER this, in the case of quotaoff. This also gets called from
+ * xfs_rootumount.
+ */
+void
+xfs_qm_dqrele_all_inodes(
+	struct xfs_mount *mp,
+	uint		 flags)
+{
+	vmap_t		vmap;
+	xfs_inode_t	*ip, *topino;
+	uint		ireclaims;
+	vnode_t		*vp;
+	boolean_t	vnode_refd;
+
+	ASSERT(mp->m_quotainfo);
+
+again:
+	XFS_MOUNT_ILOCK(mp);
+	ip = mp->m_inodes;
+	if (ip == NULL) {
+		XFS_MOUNT_IUNLOCK(mp);
+		return;
+	}
+	do {
+		/* Skip markers inserted by xfs_sync */
+		if (ip->i_mount == NULL) {
+			ip = ip->i_mnext;
+			continue;
+		}
+		/* Skip the quota inodes; they never have dquots attached */
+		if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
+			ASSERT(ip->i_udquot == NULL);
+			ASSERT(ip->i_gdquot == NULL);
+			ip = ip->i_mnext;
+			continue;
+		}
+		vp = XFS_ITOV_NULL(ip);
+		if (!vp) {
+			ASSERT(ip->i_udquot == NULL);
+			ASSERT(ip->i_gdquot == NULL);
+			ip = ip->i_mnext;
+			continue;
+		}
+		vnode_refd = B_FALSE;
+		if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
+			/*
+			 * Sample vp mapping while holding the mplock, lest
+			 * we come across a non-existent vnode.
+			 */
+			VMAP(vp, ip, vmap);
+			ireclaims = mp->m_ireclaims;
+			topino = mp->m_inodes;
+			XFS_MOUNT_IUNLOCK(mp);
+
+			/* XXX restart limit ? */
+			if ( ! (vp = vn_get(vp, &vmap)))
+				goto again;
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+			vnode_refd = B_TRUE;
+		} else {
+			ireclaims = mp->m_ireclaims;
+			topino = mp->m_inodes;
+			XFS_MOUNT_IUNLOCK(mp);
+		}
+
+		/*
+		 * We don't keep the mountlock across the dqrele() call,
+		 * since it can take a while.
+		 */
+		if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
+			xfs_qm_dqrele(ip->i_udquot);
+			ip->i_udquot = NULL;
+		}
+		if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+			xfs_qm_dqrele(ip->i_gdquot);
+			ip->i_gdquot = NULL;
+		}
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		/*
+		 * Wait until we've dropped the ilock and mountlock to
+		 * do the vn_rele. Or be condemned to an eternity in the
+		 * inactive code in hell.
+		 */
+		if (vnode_refd)
+			VN_RELE(vp);
+		XFS_MOUNT_ILOCK(mp);
+		/*
+		 * If an inode was inserted or removed, we have to
+		 * start over again.
+		 */
+		if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) {
+			/* XXX use a sentinel */
+			XFS_MOUNT_IUNLOCK(mp);
+			goto again;
+		}
+		ip = ip->i_mnext;
+	} while (ip != mp->m_inodes);
+
+	XFS_MOUNT_IUNLOCK(mp);
+}
+
+/*------------------------------------------------------------------------*/
+#ifdef DEBUG
+/*
+ * This contains all the test functions for XFS disk quotas.
+ * Currently it does a quota accounting check, i.e. it walks through
+ * all the inodes in the file system, calculates the dquot accounting
+ * fields, and prints out any inconsistencies.
+ */
+xfs_dqhash_t *qmtest_udqtab;
+xfs_dqhash_t *qmtest_gdqtab;
+int	      qmtest_hashmask;
+int	      qmtest_nfails;
+mutex_t	      qcheck_lock;
+
+#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
+				 (__psunsigned_t)(id)) & \
+				(qmtest_hashmask - 1))
+
+#define DQTEST_HASH(mp, id, type)   ((type & XFS_DQ_USER) ? \
+				     (qmtest_udqtab + \
+				      DQTEST_HASHVAL(mp, id)) : \
+				     (qmtest_gdqtab + \
+				      DQTEST_HASHVAL(mp, id)))
+
+#define DQTEST_LIST_PRINT(l, NXT, title) \
+{ \
+	  xfs_dqtest_t	*dqp; int i = 0;\
+	  printk("%s (#%d)\n", title, (int) (l)->qh_nelems); \
+	  for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
+	       dqp = (xfs_dqtest_t *)dqp->NXT) { \
+	    printk("\t%d.\t\"%d (%s)\"\t bcnt = %d, icnt = %d\n", \
+			 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp),	     \
+			 dqp->d_bcount, dqp->d_icount); } \
+}
+
+typedef struct dqtest {
+	xfs_dqmarker_t	q_lists;
+	xfs_dqhash_t	*q_hash;	/* the hashchain header */
+	xfs_mount_t	*q_mount;	/* filesystem this relates to */
+	xfs_dqid_t	d_id;		/* user id or group id */
+	xfs_qcnt_t	d_bcount;	/* # disk blocks owned by the user */
+	xfs_qcnt_t	d_icount;	/* # inodes owned by the user */
+} xfs_dqtest_t;
+
+STATIC void
+xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
+{
+	xfs_dquot_t *d;
+	if (((d) = (h)->qh_next))
+		(d)->HL_PREVP = &((dqp)->HL_NEXT);
+	(dqp)->HL_NEXT = d;
+	(dqp)->HL_PREVP = &((h)->qh_next);
+	(h)->qh_next = (xfs_dquot_t *)dqp;
+	(h)->qh_version++;
+	(h)->qh_nelems++;
+}
+STATIC void
+xfs_qm_dqtest_print(
+	xfs_dqtest_t	*d)
+{
+	printk("-----------DQTEST DQUOT----------------\n");
+	printk("---- dquot ID =	 %d\n", d->d_id);
+	printk("---- type     =	 %s\n", XFS_QM_ISUDQ(d) ? "USR" : "GRP");
+	printk("---- fs	      =	 0x%p\n", d->q_mount);
+	printk("---- bcount   =	 %Lu (0x%x)\n", d->d_bcount, (int)d->d_bcount);
+	printk("---- icount   =	 %Lu (0x%x)\n", d->d_icount, (int)d->d_icount);
+	printk("---------------------------\n");
+}
+
+STATIC void
+xfs_qm_dqtest_failed(
+	xfs_dqtest_t	*d,
+	xfs_dquot_t	*dqp,
+	char		*reason,
+	xfs_qcnt_t	a,
+	xfs_qcnt_t	b,
+	int		error)
+{
+	qmtest_nfails++;
+	if (error)
+		printk("quotacheck failed for %d, error = %d\nreason = %s\n",
+		       INT_GET(d->d_id, ARCH_CONVERT), error, reason);
+	else
+		printk("quotacheck failed for %d (%s) [%d != %d]\n",
+		       INT_GET(d->d_id, ARCH_CONVERT), reason, (int)a, (int)b);
+	xfs_qm_dqtest_print(d);
+	if (dqp)
+		xfs_qm_dqprint(dqp);
+}
+
+STATIC int
+xfs_dqtest_cmp2(
+	xfs_dqtest_t	*d,
+	xfs_dquot_t	*dqp)
+{
+	int err = 0;
+	if (INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) != d->d_icount) {
+		xfs_qm_dqtest_failed(d, dqp, "icount mismatch",
+				     INT_GET(dqp->q_core.d_icount, ARCH_CONVERT), d->d_icount, 0);
+		err++;
+	}
+	if (INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT) != d->d_bcount) {
+		xfs_qm_dqtest_failed(d, dqp, "bcount mismatch",
+				     INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT), d->d_bcount, 0);
+		err++;
+	}
+	if (INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT) &&
+	    INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT) >= INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT)) {
+		if (INT_ISZERO(dqp->q_core.d_btimer, ARCH_CONVERT) &&
+		    !INT_ISZERO(dqp->q_core.d_id, ARCH_CONVERT)) {
+			printk("%d [%s] [0x%p] BLK TIMER NOT STARTED\n",
+			       d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
+			err++;
+		}
+	}
+	if (INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT) &&
+	    INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >= INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT)) {
+		if (INT_ISZERO(dqp->q_core.d_itimer, ARCH_CONVERT) &&
+		    !INT_ISZERO(dqp->q_core.d_id, ARCH_CONVERT)) {
+			printk("%d [%s] [0x%p] INO TIMER NOT STARTED\n",
+			       d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
+			err++;
+		}
+	}
+#if 0
+	if (!err) {
+		printk("%d [%s] [0x%p] qchecked\n",
+		       d->d_id, XFS_QM_ISUDQ(d) ? "USR" : "GRP", d->q_mount);
+	}
+#endif
+	return (err);
+}
+
+STATIC void
+xfs_dqtest_cmp(
+	xfs_dqtest_t	*d)
+{
+	xfs_dquot_t	*dqp;
+	int		error;
+
+	/* xfs_qm_dqtest_print(d); */
+	if ((error = xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0,
+				 &dqp))) {
+		xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error);
+		return;
+	}
+	xfs_dqtest_cmp2(d, dqp);
+	xfs_qm_dqput(dqp);
+}
+
+STATIC int
+xfs_qm_internalqcheck_dqget(
+	xfs_mount_t	*mp,
+	xfs_dqid_t	id,
+	uint		type,
+	xfs_dqtest_t	**O_dq)
+{
+	xfs_dqtest_t	*d;
+	xfs_dqhash_t	*h;
+
+	h = DQTEST_HASH(mp, id, type);
+	for (d = (xfs_dqtest_t *) h->qh_next; d != NULL;
+	     d = (xfs_dqtest_t *) d->HL_NEXT) {
+		/* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
+		if (d->d_id == id && mp == d->q_mount) {
+			*O_dq = d;
+			return (0);
+		}
+	}
+	d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP);
+	d->dq_flags = type;
+	d->d_id = id;
+	d->q_mount = mp;
+	d->q_hash = h;
+	xfs_qm_hashinsert(h, d);
+	*O_dq = d;
+	return (0);
+}
+
+STATIC void
+xfs_qm_internalqcheck_get_dquots(
+	xfs_mount_t	*mp,
+	xfs_dqid_t	uid,
+	xfs_dqid_t	gid,
+	xfs_dqtest_t	**ud,
+	xfs_dqtest_t	**gd)
+{
+	if (XFS_IS_UQUOTA_ON(mp))
+		xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud);
+	if (XFS_IS_GQUOTA_ON(mp))
+		xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd);
+}
+
+
+STATIC void
+xfs_qm_internalqcheck_dqadjust(
+	xfs_inode_t		*ip,
+	xfs_dqtest_t		*d)
+{
+	d->d_icount++;
+	d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks;
+}
+
+STATIC int
+xfs_qm_internalqcheck_adjust(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	ino,		/* inode number to get data for */
+	void		*buffer,	/* not used */
+	xfs_daddr_t	bno,		/* starting block of inode cluster */
+	void		*dip,		/* not used */
+	int		*res)		/* bulkstat result code */
+{
+	xfs_inode_t		*ip;
+	xfs_dqtest_t		*ud, *gd;
+	uint			lock_flags;
+	boolean_t		ipreleased;
+	int			error;
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+		*res = BULKSTAT_RV_NOTHING;
+		qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n",
+			(unsigned long long) ino,
+			(unsigned long long) mp->m_sb.sb_uquotino,
+			(unsigned long long) mp->m_sb.sb_gquotino);
+		return XFS_ERROR(EINVAL);
+	}
+	ipreleased = B_FALSE;
+ again:
+	lock_flags = XFS_ILOCK_SHARED;
+	if ((error = xfs_iget(mp, tp, ino, lock_flags, &ip, bno))) {
+		*res = BULKSTAT_RV_NOTHING;
+		return (error);
+	}
+
+	if (ip->i_d.di_mode == 0) {
+		xfs_iput_new(ip, lock_flags);
+		*res = BULKSTAT_RV_NOTHING;
+		return XFS_ERROR(ENOENT);
+	}
+
+	/*
+	 * This inode can have blocks beyond EOF which can get released
+	 * when we send it to inactive. Since we don't check the dquot
+	 * until after all our calculations are done, we must get rid
+	 * of those now.
+	 */
+	if (! ipreleased) {
+		xfs_iput(ip, lock_flags);
+		ipreleased = B_TRUE;
+		goto again;
+	}
+	xfs_qm_internalqcheck_get_dquots(mp,
+					(xfs_dqid_t) ip->i_d.di_uid,
+					(xfs_dqid_t) ip->i_d.di_gid,
+					&ud, &gd);
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		ASSERT(ud);
+		xfs_qm_internalqcheck_dqadjust(ip, ud);
+	}
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		ASSERT(gd);
+		xfs_qm_internalqcheck_dqadjust(ip, gd);
+	}
+	xfs_iput(ip, lock_flags);
+	*res = BULKSTAT_RV_DIDONE;
+	return (0);
+}
+
+
+/* PRIVATE, debugging */
+int
+xfs_qm_internalqcheck(
+	xfs_mount_t	*mp)
+{
+	xfs_ino_t	lastino;
+	int		done, count;
+	int		i;
+	xfs_dqtest_t	*d, *e;
+	xfs_dqhash_t	*h1;
+	int		error;
+
+	lastino = 0;
+	qmtest_hashmask = 32;
+	count = 5;
+	done = 0;
+	qmtest_nfails = 0;
+
+	if (! XFS_IS_QUOTA_ON(mp))
+		return XFS_ERROR(ESRCH);
+
+	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	XFS_bflush(mp->m_ddev_targp);
+	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	XFS_bflush(mp->m_ddev_targp);
+
+	mutex_lock(&qcheck_lock, PINOD);
+	/*
+	 * There should be absolutely no quota activity while this
+	 * is going on.
+	 */
+	qmtest_udqtab = kmem_zalloc(qmtest_hashmask *
+				    sizeof(xfs_dqhash_t), KM_SLEEP);
+	qmtest_gdqtab = kmem_zalloc(qmtest_hashmask *
+				    sizeof(xfs_dqhash_t), KM_SLEEP);
+	do {
+		/*
+		 * Iterate through all the inodes in the file system,
+		 * adjusting the corresponding dquot counters.
+		 */
+		if ((error = xfs_bulkstat(mp, NULL, &lastino, &count,
+				 xfs_qm_internalqcheck_adjust,
+				 0, NULL, BULKSTAT_FG_IGET, &done))) {
+			break;
+		}
+	} while (! done);
+	if (error) {
+		printk("Bulkstat returned error 0x%x\n",
+		       error);
+	}
+	printk("Checking results against system dquots\n");
+	for (i = 0; i < qmtest_hashmask; i++) {
+		h1 = &qmtest_udqtab[i];
+		for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
+			xfs_dqtest_cmp(d);
+			e = (xfs_dqtest_t *) d->HL_NEXT;
+			kmem_free(d, sizeof(xfs_dqtest_t));
+			d = e;
+		}
+		h1 = &qmtest_gdqtab[i];
+		for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
+			xfs_dqtest_cmp(d);
+			e = (xfs_dqtest_t *) d->HL_NEXT;
+			kmem_free(d, sizeof(xfs_dqtest_t));
+			d = e;
+		}
+	}
+
+	if (qmtest_nfails) {
+		printk("**************	quotacheck failed  **************\n");
+		printk("failures = %d\n", qmtest_nfails);
+	} else {
+		printk("**************	quotacheck successful! **************\n");
+	}
+	kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
+	kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
+	mutex_unlock(&qcheck_lock);
+	return (qmtest_nfails);
+}
+
+#endif /* DEBUG */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_quota.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_quota.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_quota.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_quota.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_QUOTA_H__
+#define __XFS_QUOTA_H__
+
+/*
+ * uid_t and gid_t are hard-coded to 32 bits in the inode.
+ * Hence, an 'id' in a dquot is 32 bits..
+ */
+typedef __int32_t	xfs_dqid_t;
+
+/*
+ * Even though users may not have quota limits occupying all 64 bits,
+ * they may need 64-bit accounting. Hence, 64-bit quota counters
+ * and quota limits. This is a waste in the common case, but hey ...
+ */
+typedef __uint64_t	xfs_qcnt_t;
+typedef __uint16_t	xfs_qwarncnt_t;
+
+/*
+ * Disk quota status in m_qflags, and also sb_qflags. 16 bits.
+ */
+#define XFS_UQUOTA_ACCT 0x0001	/* user quota accounting ON */
+#define XFS_UQUOTA_ENFD 0x0002	/* user quota limits enforced */
+#define XFS_UQUOTA_CHKD 0x0004	/* quotacheck run on usr quotas */
+#define XFS_PQUOTA_ACCT 0x0008	/* (IRIX) project quota accounting ON */
+#define XFS_GQUOTA_ENFD 0x0010	/* group quota limits enforced */
+#define XFS_GQUOTA_CHKD 0x0020	/* quotacheck run on grp quotas */
+#define XFS_GQUOTA_ACCT 0x0040	/* group quota accounting ON */
+
+/*
+ * Incore only flags for quotaoff - these bits get cleared when quota(s)
+ * are in the process of getting turned off. These flags are in m_qflags but
+ * never in sb_qflags.
+ */
+#define XFS_UQUOTA_ACTIVE	0x0080	/* uquotas are actively in use */
+#define XFS_GQUOTA_ACTIVE	0x0100	/* gquotas are actively in use */
+
+/*
+ * Typically, we turn quotas off if we weren't explicitly asked to
+ * mount quotas. This is the mount option not to do that.
+ * This option is handy in the miniroot, when trying to mount /root.
+ * We can't really know what's in /etc/fstab until /root is already mounted!
+ * This stops quotas getting turned off in the root filesystem every time
+ * the system boots up a miniroot.
+ */
+#define XFS_QUOTA_MAYBE		0x0200 /* Turn quotas on if SB has quotas on */
+
+/*
+ * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
+ * quota will not be switched off as long as that inode lock is held.
+ */
+#define XFS_IS_QUOTA_ON(mp)	((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
+						   XFS_GQUOTA_ACTIVE))
+#define XFS_IS_UQUOTA_ON(mp)	((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
+#define XFS_IS_GQUOTA_ON(mp)	((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
+
+/*
+ * Flags to tell various functions what to do. Not all of these are meaningful
+ * to a single function. None of these XFS_QMOPT_* flags are meant to have
+ * persistent values (i.e. their values can and will change between versions).
+ */
+#define XFS_QMOPT_DQLOCK	0x0000001 /* dqlock */
+#define XFS_QMOPT_DQALLOC	0x0000002 /* alloc dquot ondisk if needed */
+#define XFS_QMOPT_UQUOTA	0x0000004 /* user dquot requested */
+#define XFS_QMOPT_GQUOTA	0x0000008 /* group dquot requested */
+#define XFS_QMOPT_FORCE_RES	0x0000010 /* ignore quota limits */
+#define XFS_QMOPT_DQSUSER	0x0000020 /* don't cache the superuser's dquot */
+#define XFS_QMOPT_SBVERSION	0x0000040 /* change superblock version num */
+#define XFS_QMOPT_QUOTAOFF	0x0000080 /* quotas are being turned off */
+#define XFS_QMOPT_UMOUNTING	0x0000100 /* filesys is being unmounted */
+#define XFS_QMOPT_DOLOG		0x0000200 /* log buf changes (in quotacheck) */
+#define XFS_QMOPT_DOWARN	0x0000400 /* increase warning cnt if necessary */
+#define XFS_QMOPT_ILOCKED	0x0000800 /* inode is already locked (excl) */
+#define XFS_QMOPT_DQREPAIR	0x0001000 /* repair dquot, if damaged. */
+
+/*
+ * flags to xfs_trans_mod_dquot to indicate which field needs to be
+ * modified.
+ */
+#define XFS_QMOPT_RES_REGBLKS	0x0010000
+#define XFS_QMOPT_RES_RTBLKS	0x0020000
+#define XFS_QMOPT_BCOUNT	0x0040000
+#define XFS_QMOPT_ICOUNT	0x0080000
+#define XFS_QMOPT_RTBCOUNT	0x0100000
+#define XFS_QMOPT_DELBCOUNT	0x0200000
+#define XFS_QMOPT_DELRTBCOUNT	0x0400000
+#define XFS_QMOPT_RES_INOS	0x0800000
+
+/*
+ * flags for dqflush and dqflush_all.
+ */
+#define XFS_QMOPT_SYNC		0x1000000
+#define XFS_QMOPT_ASYNC		0x2000000
+#define XFS_QMOPT_DELWRI	0x4000000
+
+/*
+ * flags for dqalloc.
+ */
+#define XFS_QMOPT_INHERIT	0x8000000
+
+/*
+ * flags to xfs_trans_mod_dquot.
+ */
+#define XFS_TRANS_DQ_RES_BLKS	XFS_QMOPT_RES_REGBLKS
+#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
+#define XFS_TRANS_DQ_RES_INOS	XFS_QMOPT_RES_INOS
+#define XFS_TRANS_DQ_BCOUNT	XFS_QMOPT_BCOUNT
+#define XFS_TRANS_DQ_DELBCOUNT	XFS_QMOPT_DELBCOUNT
+#define XFS_TRANS_DQ_ICOUNT	XFS_QMOPT_ICOUNT
+#define XFS_TRANS_DQ_RTBCOUNT	XFS_QMOPT_RTBCOUNT
+#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
+
+
+#define XFS_QMOPT_QUOTALL	(XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA)
+#define XFS_QMOPT_RESBLK_MASK	(XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
+/*
+ * This check is done typically without holding the inode lock;
+ * that may seem racy, but it is harmless in the context in which it is used.
+ * The inode cannot go inactive as long as a reference is kept, and
+ * therefore if dquot(s) were attached, they'll stay consistent.
+ * If, for example, the ownership of the inode changes while
+ * we didn't have the inode locked, the appropriate dquot(s) will be
+ * attached atomically.
+ */
+#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\
+				     (ip)->i_udquot == NULL) || \
+				    (XFS_IS_GQUOTA_ON(mp) && \
+				     (ip)->i_gdquot == NULL))
+
+#define XFS_QM_NEED_QUOTACHECK(mp) ((XFS_IS_UQUOTA_ON(mp) && \
+				     (mp->m_sb.sb_qflags & \
+				      XFS_UQUOTA_CHKD) == 0) || \
+				    (XFS_IS_GQUOTA_ON(mp) && \
+				     (mp->m_sb.sb_qflags & \
+				      XFS_GQUOTA_CHKD) == 0))
+
+#define XFS_MOUNT_QUOTA_ALL	(XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+				 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+				 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD)
+#define XFS_MOUNT_QUOTA_MASK	(XFS_MOUNT_QUOTA_ALL | XFS_UQUOTA_ACTIVE | \
+				 XFS_GQUOTA_ACTIVE)
+
+#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
+
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_XFS_QUOTA
+/*
+ * External Interface to the XFS disk quota subsystem.
+ */
+struct	xfs_disk_dquot;
+struct	xfs_dqhash;
+struct	xfs_dquot;
+struct	xfs_inode;
+struct	xfs_mount;
+struct	xfs_trans;
+
+/*
+ * Quota Manager Interface.
+ */
+extern struct xfs_qm   *xfs_qm_init(void);
+extern void		xfs_qm_destroy(struct xfs_qm *);
+extern int		xfs_qm_dqflush_all(struct xfs_mount *, int);
+extern int		xfs_qm_dqattach(struct xfs_inode *, uint);
+extern int		xfs_qm_dqpurge_all(struct xfs_mount *, uint);
+extern void		xfs_qm_mount_quotainit(struct xfs_mount *, uint);
+extern void		xfs_qm_unmount_quotadestroy(struct xfs_mount *);
+extern int		xfs_qm_mount_quotas(struct xfs_mount *);
+extern int		xfs_qm_unmount_quotas(struct xfs_mount *);
+extern void		xfs_qm_dqdettach_inode(struct xfs_inode *);
+extern int		xfs_qm_sync(struct xfs_mount *, short);
+
+/*
+ * Dquot interface.
+ */
+extern void		xfs_dqlock(struct xfs_dquot *);
+extern void		xfs_dqunlock(struct xfs_dquot *);
+extern void		xfs_dqunlock_nonotify(struct xfs_dquot *);
+extern void		xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+extern void		xfs_qm_dqput(struct xfs_dquot *);
+extern void		xfs_qm_dqrele(struct xfs_dquot *);
+extern xfs_dqid_t	xfs_qm_dqid(struct xfs_dquot *);
+extern int		xfs_qm_dqget(struct xfs_mount *,
+				     struct xfs_inode *, xfs_dqid_t,
+				      uint, uint, struct xfs_dquot **);
+extern int		xfs_qm_dqcheck(struct xfs_disk_dquot *,
+				       xfs_dqid_t, uint, uint, char *);
+
+/*
+ * Vnodeops specific code that should actually be _in_ xfs_vnodeops.c, but
+ * is here because it's nicer to keep vnodeops (therefore, XFS) lean
+ * and clean.
+ */
+extern struct xfs_dquot *	xfs_qm_vop_chown(struct xfs_trans *,
+						 struct xfs_inode *,
+						 struct xfs_dquot **,
+						 struct xfs_dquot *);
+extern int		xfs_qm_vop_dqalloc(struct xfs_mount *,
+					   struct xfs_inode *,
+					   uid_t, gid_t, uint,
+					   struct xfs_dquot	**,
+					   struct xfs_dquot	**);
+
+extern int		xfs_qm_vop_chown_reserve(struct xfs_trans *,
+						 struct xfs_inode *,
+						 struct xfs_dquot *,
+						 struct xfs_dquot *,
+						 uint);
+
+extern int		xfs_qm_vop_rename_dqattach(struct xfs_inode **);
+extern void		xfs_qm_vop_dqattach_and_dqmod_newinode(
+						struct xfs_trans *,
+						struct xfs_inode *,
+						struct xfs_dquot *,
+						struct xfs_dquot *);
+
+
+/*
+ * Dquot Transaction interface
+ */
+extern void		xfs_trans_alloc_dqinfo(struct xfs_trans *);
+extern void		xfs_trans_free_dqinfo(struct xfs_trans *);
+extern void		xfs_trans_dup_dqinfo(struct xfs_trans *,
+					     struct xfs_trans *);
+extern void		xfs_trans_mod_dquot(struct xfs_trans *,
+					    struct xfs_dquot *,
+					    uint, long);
+extern void		xfs_trans_mod_dquot_byino(struct xfs_trans *,
+						  struct xfs_inode *,
+						  uint, long);
+extern void		xfs_trans_apply_dquot_deltas(struct xfs_trans *);
+extern void		xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+
+extern int		xfs_trans_reserve_quota_nblks(struct xfs_trans *,
+						      struct xfs_inode *,
+						      long, long, uint);
+
+
+extern int		xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+							 struct xfs_dquot *,
+							 struct xfs_dquot *,
+							 long, long, uint);
+extern void		xfs_trans_log_dquot(struct xfs_trans *,
+					    struct xfs_dquot *);
+extern void		xfs_trans_dqjoin(struct xfs_trans *,
+					 struct xfs_dquot *);
+extern void		xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
+
+# define _XQM_ZONE_DESTROY(z)	((z)? kmem_cache_destroy(z) : (void)0)
+
+#else
+# define xfs_qm_init()					(NULL)
+# define xfs_qm_destroy(xqm)				do { } while (0)
+# define xfs_qm_dqflush_all(m,t)			(ENOSYS)
+# define xfs_qm_dqattach(i,t)				(ENOSYS)
+# define xfs_qm_dqpurge_all(m,t)			(ENOSYS)
+# define xfs_qm_mount_quotainit(m,t)			do { } while (0)
+# define xfs_qm_unmount_quotadestroy(m)			do { } while (0)
+# define xfs_qm_mount_quotas(m)				(ENOSYS)
+# define xfs_qm_unmount_quotas(m)			(ENOSYS)
+# define xfs_qm_dqdettach_inode(i)			do { } while (0)
+# define xfs_qm_sync(m,t)				(ENOSYS)
+# define xfs_dqlock(d)					do { } while (0)
+# define xfs_dqunlock(d)				do { } while (0)
+# define xfs_dqunlock_nonotify(d)			do { } while (0)
+# define xfs_dqlock2(d1,d2)				do { } while (0)
+# define xfs_qm_dqput(d)				do { } while (0)
+# define xfs_qm_dqrele(d)				do { } while (0)
+# define xfs_qm_dqid(d)					(-1)
+# define xfs_qm_dqget(m,i,di,t,f,d)			(ENOSYS)
+# define xfs_qm_dqcheck(dd,di,t,f,s)			(ENOSYS)
+# define xfs_trans_alloc_dqinfo(t)			do { } while (0)
+# define xfs_trans_free_dqinfo(t)			do { } while (0)
+# define xfs_trans_dup_dqinfo(t1,t2)			do { } while (0)
+# define xfs_trans_mod_dquot(t,d,f,x)			do { } while (0)
+# define xfs_trans_mod_dquot_byino(t,i,f,x)		do { } while (0)
+# define xfs_trans_apply_dquot_deltas(t)		do { } while (0)
+# define xfs_trans_unreserve_and_mod_dquots(t)		do { } while (0)
+# define xfs_trans_reserve_quota_nblks(t,i,nb,ni,f)	(ENOSYS)
+# define xfs_trans_reserve_quota_bydquots(t,x,y,b,i,f)	(ENOSYS)
+# define xfs_trans_log_dquot(t,d)			do { } while (0)
+# define xfs_trans_dqjoin(t,d)				do { } while (0)
+# define xfs_qm_dqrele_all_inodes(m,t)			do { } while (0)
+# define xfs_qm_vop_chown(t,i,d1,d2)			(NULL)
+# define xfs_qm_vop_dqalloc(m,i,u,g,f,d1,d2)		(ENOSYS)
+# define xfs_qm_vop_chown_reserve(t,i,d1,d2,f)		(ENOSYS)
+# define xfs_qm_vop_rename_dqattach(i)			(ENOSYS)
+# define xfs_qm_vop_dqattach_and_dqmod_newinode(t,i,x,y) do { } while (0)
+# define _XQM_ZONE_DESTROY(z)				do { } while (0)
+#endif	/* CONFIG_XFS_QUOTA */
+
+/*
+ * Regular disk block quota reservations
+ */
+#define		xfs_trans_reserve_blkquota(tp, ip, nblks) \
+xfs_trans_reserve_quota_nblks(tp, ip, nblks, 0, XFS_QMOPT_RES_REGBLKS)
+
+#define		xfs_trans_reserve_blkquota_force(tp, ip, nblks) \
+xfs_trans_reserve_quota_nblks(tp, ip, nblks, 0, \
+		XFS_QMOPT_RES_REGBLKS|XFS_QMOPT_FORCE_RES)
+
+#define		xfs_trans_unreserve_blkquota(tp, ip, nblks) \
+(void)xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), 0, XFS_QMOPT_RES_REGBLKS)
+
+#define		xfs_trans_reserve_quota(tp, udq, gdq, nb, ni, f) \
+xfs_trans_reserve_quota_bydquots(tp, udq, gdq, nb, ni, f|XFS_QMOPT_RES_REGBLKS)
+
+#define		xfs_trans_unreserve_quota(tp, ud, gd, b, i, f) \
+xfs_trans_reserve_quota_bydquots(tp, ud, gd, -(b), -(i), f|XFS_QMOPT_RES_REGBLKS)
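+
+/*
+ * A typical (illustrative) allocation path reserves quota up front and
+ * backs the reservation out if the allocation itself fails:
+ *
+ *	if ((error = xfs_trans_reserve_blkquota(tp, ip, nblks)))
+ *		return error;		(caller is over quota)
+ *	error = ...allocate nblks blocks...;
+ *	if (error)
+ *		xfs_trans_unreserve_blkquota(tp, ip, nblks);
+ */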
+
+/*
+ * Realtime disk block quota reservations
+ */
+#define		xfs_trans_reserve_rtblkquota(mp, tp, ip, nblks) \
+xfs_trans_reserve_quota_nblks(tp, ip, nblks, 0, XFS_QMOPT_RES_RTBLKS)
+
+#define		xfs_trans_unreserve_rtblkquota(tp, ip, nblks) \
+(void)xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), 0, XFS_QMOPT_RES_RTBLKS)
+
+#define		xfs_trans_reserve_rtquota(mp, tp, uq, pq, blks, f) \
+xfs_trans_reserve_quota_bydquots(tp, uq, pq, blks, 0, f|XFS_QMOPT_RES_RTBLKS)
+
+#define		xfs_trans_unreserve_rtquota(tp, uq, pq, blks) \
+xfs_trans_reserve_quota_bydquots(tp, uq, pq, -(blks), 0, XFS_QMOPT_RES_RTBLKS)
+
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_QUOTA_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_quota_priv.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_quota_priv.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_quota_priv.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_quota_priv.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_QUOTA_PRIV_H__
+#define __XFS_QUOTA_PRIV_H__
+
+/*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE	10
+
+/* Number of dquots that fit in to a dquot block */
+#define XFS_QM_DQPERBLK(mp)	((mp)->m_quotainfo->qi_dqperchunk)
+
+#define XFS_ISLOCKED_INODE(ip)		(ismrlocked(&(ip)->i_lock, \
+					    MR_UPDATE | MR_ACCESS) != 0)
+#define XFS_ISLOCKED_INODE_EXCL(ip)	(ismrlocked(&(ip)->i_lock, \
+					    MR_UPDATE) != 0)
+
+#define XFS_DQ_IS_ADDEDTO_TRX(t, d)	((d)->q_transp == (t))
+
+#define XFS_QI_MPLRECLAIMS(mp)	((mp)->m_quotainfo->qi_dqreclaims)
+#define XFS_QI_UQIP(mp)		((mp)->m_quotainfo->qi_uquotaip)
+#define XFS_QI_GQIP(mp)		((mp)->m_quotainfo->qi_gquotaip)
+#define XFS_QI_DQCHUNKLEN(mp)	((mp)->m_quotainfo->qi_dqchunklen)
+#define XFS_QI_BTIMELIMIT(mp)	((mp)->m_quotainfo->qi_btimelimit)
+#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
+#define XFS_QI_ITIMELIMIT(mp)	((mp)->m_quotainfo->qi_itimelimit)
+#define XFS_QI_BWARNLIMIT(mp)	((mp)->m_quotainfo->qi_bwarnlimit)
+#define XFS_QI_IWARNLIMIT(mp)	((mp)->m_quotainfo->qi_iwarnlimit)
+#define XFS_QI_QOFFLOCK(mp)	((mp)->m_quotainfo->qi_quotaofflock)
+
+#define XFS_QI_MPL_LIST(mp)	((mp)->m_quotainfo->qi_dqlist)
+#define XFS_QI_MPLLOCK(mp)	((mp)->m_quotainfo->qi_dqlist.qh_lock)
+#define XFS_QI_MPLNEXT(mp)	((mp)->m_quotainfo->qi_dqlist.qh_next)
+#define XFS_QI_MPLNDQUOTS(mp)	((mp)->m_quotainfo->qi_dqlist.qh_nelems)
+
+#define XQMLCK(h)			(mutex_lock(&((h)->qh_lock), PINOD))
+#define XQMUNLCK(h)			(mutex_unlock(&((h)->qh_lock)))
+#ifdef DEBUG
+static inline int
+XQMISLCKD(xfs_dqhash_t *h)
+{
+	if (mutex_trylock(&h->qh_lock)) {
+		mutex_unlock(&h->qh_lock);
+		return 0;
+	}
+	return 1;
+}
+#endif
+
+#define XFS_DQ_HASH_LOCK(h)		XQMLCK(h)
+#define XFS_DQ_HASH_UNLOCK(h)		XQMUNLCK(h)
+#define XFS_DQ_IS_HASH_LOCKED(h)	XQMISLCKD(h)
+
+#define xfs_qm_mplist_lock(mp)		XQMLCK(&(XFS_QI_MPL_LIST(mp)))
+#define xfs_qm_mplist_unlock(mp)	XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
+#define XFS_QM_IS_MPLIST_LOCKED(mp)	XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
+
+#define xfs_qm_freelist_lock(qm)	XQMLCK(&((qm)->qm_dqfreelist))
+#define xfs_qm_freelist_unlock(qm)	XQMUNLCK(&((qm)->qm_dqfreelist))
+#define XFS_QM_IS_FREELIST_LOCKED(qm)	XQMISLCKD(&((qm)->qm_dqfreelist))
+
+/*
+ * Hash into a bucket in the dquot hash table, based on <mp, id>.
+ */
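+/*
+ * E.g. (illustrative): with qm_dqhashmask == 64, XFS_DQ_HASH(mp, 37,
+ * XFS_DQ_USER) below picks bucket ((__psunsigned_t)mp + 37) & 63 of
+ * qm_usr_dqhtable.
+ */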
+#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
+				 (__psunsigned_t)(id)) & \
+				(xfs_Gqm->qm_dqhashmask - 1))
+#define XFS_DQ_HASH(mp, id, type)   (type == XFS_DQ_USER ? \
+				     (xfs_Gqm->qm_usr_dqhtable + \
+				      XFS_DQ_HASHVAL(mp, id)) : \
+				     (xfs_Gqm->qm_grp_dqhtable + \
+				      XFS_DQ_HASHVAL(mp, id)))
+#define XFS_IS_DQTYPE_ON(mp, type)   (type == XFS_DQ_USER ? \
+				      XFS_IS_UQUOTA_ON(mp):XFS_IS_GQUOTA_ON(mp))
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+	INT_ISZERO(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_blk_softlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_rtb_hardlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_rtb_softlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_ino_softlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_bcount, ARCH_CONVERT)	      && \
+	INT_ISZERO(dqp->q_core.d_rtbcount, ARCH_CONVERT)      && \
+	INT_ISZERO(dqp->q_core.d_icount, ARCH_CONVERT))
+
+#define HL_PREVP	dq_hashlist.ql_prevp
+#define HL_NEXT		dq_hashlist.ql_next
+#define MPL_PREVP	dq_mplist.ql_prevp
+#define MPL_NEXT	dq_mplist.ql_next
+
+
+#define _LIST_REMOVE(h, dqp, PVP, NXT)				\
+	{							\
+		 xfs_dquot_t *d;				\
+		 if (((d) = (dqp)->NXT))				\
+			 (d)->PVP = (dqp)->PVP;			\
+		 *((dqp)->PVP) = d;				\
+		 (dqp)->NXT = NULL;				\
+		 (dqp)->PVP = NULL;				\
+		 (h)->qh_version++;				\
+		 (h)->qh_nelems--;				\
+	}
+
+#define _LIST_INSERT(h, dqp, PVP, NXT)				\
+	{							\
+		 xfs_dquot_t *d;				\
+		 if (((d) = (h)->qh_next))			\
+			 (d)->PVP = &((dqp)->NXT);		\
+		 (dqp)->NXT = d;				\
+		 (dqp)->PVP = &((h)->qh_next);			\
+		 (h)->qh_next = dqp;				\
+		 (h)->qh_version++;				\
+		 (h)->qh_nelems++;				\
+	 }
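+
+/*
+ * (These are "pointer to previous link" lists: PVP points at whichever
+ * next-pointer currently points to this dquot, so _LIST_REMOVE needs no
+ * special case for an element at the head of the list.)
+ */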
+
+#define FOREACH_DQUOT_IN_MP(dqp, mp) \
+	for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
+
+#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist)	\
+for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
+     (dqp) = (dqp)->dq_flnext)
+
+#define XQM_HASHLIST_INSERT(h, dqp)	\
+	 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
+
+#define XQM_FREELIST_INSERT(h, dqp)	\
+	 xfs_qm_freelist_append(h, dqp)
+
+#define XQM_MPLIST_INSERT(h, dqp)	\
+	 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
+
+#define XQM_HASHLIST_REMOVE(h, dqp)	\
+	 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
+#define XQM_FREELIST_REMOVE(dqp)	\
+	 xfs_qm_freelist_unlink(dqp)
+#define XQM_MPLIST_REMOVE(h, dqp)	\
+	{ _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
+	  XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
+
+#define XFS_DQ_IS_LOGITEM_INITD(dqp)	((dqp)->q_logitem.qli_dquot == (dqp))
+
+#define XFS_QM_DQP_TO_DQACCT(tp, dqp)	(XFS_QM_ISUDQ(dqp) ? \
+					 (tp)->t_dqinfo->dqa_usrdquots : \
+					 (tp)->t_dqinfo->dqa_grpdquots)
+#define XFS_IS_SUSER_DQUOT(dqp)		\
+	(INT_ISZERO((dqp)->q_core.d_id, ARCH_CONVERT))
+
+#define XFS_PURGE_INODE(ip)		\
+	{				\
+	  vmap_t dqvmap;		\
+	  vnode_t *dqvp;		\
+	  dqvp = XFS_ITOV(ip);		\
+	  VMAP(dqvp, ip, dqvmap);	\
+	  VN_RELE(dqvp);		\
+	}
+
+#define DQFLAGTO_TYPESTR(d)	(((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
+				 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : "???"))
+#define DQFLAGTO_DIRTYSTR(d)	(XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY")
+
+#endif	/* __XFS_QUOTA_PRIV_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_rename.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_rename.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_rename.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_rename.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Given an array of up to 4 inode pointers, unlock the pointed-to inodes.
+ * If there are fewer than 4 entries in the array, the empty entries will
+ * be at the end and will have NULL pointers in them.
+ */
+STATIC void
+xfs_rename_unlock4(
+	xfs_inode_t	**i_tab,
+	uint		lock_mode)
+{
+	int	i;
+
+	xfs_iunlock(i_tab[0], lock_mode);
+	for (i = 1; i < 4; i++) {
+		if (i_tab[i] == NULL) {
+			break;
+		}
+		/*
+		 * Watch out for duplicate entries in the table.
+		 */
+		if (i_tab[i] != i_tab[i-1]) {
+			xfs_iunlock(i_tab[i], lock_mode);
+		}
+	}
+}
+
+#ifdef DEBUG
+int xfs_rename_skip, xfs_rename_nskip;
+#endif
+
+/*
+ * The following routine will acquire the locks required for a rename
+ * operation. The code understands the semantics of renames and will
+ * validate that name1 exists under dp1 & that name2 may or may not
+ * exist under dp2.
+ *
+ * We are renaming dp1/name1 to dp2/name2.
+ *
+ * Return ENOENT if dp1 does not exist, other lookup errors, or 0 for success.
+ * Return EAGAIN if the caller needs to try again.
+ */
+STATIC int
+xfs_lock_for_rename(
+	xfs_inode_t	*dp1,	/* old (source) directory inode */
+	xfs_inode_t	*dp2,	/* new (target) directory inode */
+	struct dentry	*dentry1, /* old entry name */
+	struct dentry	*dentry2, /* new entry name */
+	xfs_inode_t	**ipp1, /* inode of old entry */
+	xfs_inode_t	**ipp2, /* inode of new entry, if it
+				   already exists, NULL otherwise. */
+	xfs_inode_t	**i_tab,/* array of inode returned, sorted */
+	int		*num_inodes)  /* number of inodes in array */
+{
+	xfs_inode_t		*ip1, *ip2, *temp;
+	xfs_ino_t		inum1, inum2;
+	int			error;
+	int			i, j;
+	uint			lock_mode;
+	uint			lookup_flags;
+	int			diff_dirs = (dp1 != dp2);
+
+	ip2 = NULL;
+
+	/*
+	 * First, find out the current inums of the entries so that we
+	 * can determine the initial locking order.  We'll have to
+	 * sanity check stuff after all the locks have been acquired
+	 * to see if we still have the right inodes, directories, etc.
+	 */
+	lock_mode = xfs_ilock_map_shared(dp1);
+	error = xfs_get_dir_entry(dentry1, &ip1);
+	if (error) {
+		xfs_iunlock_map_shared(dp1, lock_mode);
+		return error;
+	}
+
+	inum1 = ip1->i_ino;
+
+	ASSERT(ip1);
+	ITRACE(ip1);
+
+	/*
+	 * Unlock dp1 and lock dp2 if they are different.
+	 */
+
+	if (diff_dirs) {
+		xfs_iunlock_map_shared(dp1, lock_mode);
+		lock_mode = xfs_ilock_map_shared(dp2);
+	}
+
+	lookup_flags = DLF_IGET;
+	if (lock_mode == XFS_ILOCK_SHARED) {
+		lookup_flags |= DLF_LOCK_SHARED;
+	}
+	error = xfs_dir_lookup_int(XFS_ITOBHV(dp2), lookup_flags,
+				   dentry2, &inum2, &ip2);
+	if (error == ENOENT) {		/* target does not need to exist. */
+		inum2 = 0;
+	} else if (error) {
+		/*
+		 * If dp2 and dp1 are the same, the next line unlocks dp1.
+		 * Got it?
+		 */
+		xfs_iunlock_map_shared(dp2, lock_mode);
+		IRELE (ip1);
+		return error;
+	} else {
+		ITRACE(ip2);
+	}
+
+	/*
+	 * i_tab contains a list of pointers to inodes.	 We initialize
+	 * the table here and we'll sort it.  We will then use it to
+	 * order the acquisition of the inode locks.
+	 *
+	 * Note that the table may contain duplicates.	e.g., dp1 == dp2.
+	 */
+	i_tab[0] = dp1;
+	i_tab[1] = dp2;
+	i_tab[2] = ip1;
+	if (inum2 == 0) {
+		*num_inodes = 3;
+		i_tab[3] = NULL;
+	} else {
+		*num_inodes = 4;
+		i_tab[3] = ip2;
+	}
+
+	/*
+	 * Sort the elements via bubble sort.  (Remember, there are at
+	 * most 4 elements to sort, so this is adequate.)
+	 */
+	for (i=0; i < *num_inodes; i++) {
+		for (j=1; j < *num_inodes; j++) {
+			if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
+				temp = i_tab[j];
+				i_tab[j] = i_tab[j-1];
+				i_tab[j-1] = temp;
+			}
+		}
+	}
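+	/*
+	 * (Locking in ascending inode-number order gives every rename the
+	 * same global lock order, which avoids ABBA deadlocks when two
+	 * concurrent renames involve the same inodes in opposite roles.)
+	 */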
+
+	/*
+	 * We have dp2 locked. If it isn't first, unlock it.
+	 * If it is first, tell xfs_lock_inodes so it can skip it
+	 * when locking. If dp1 == dp2, xfs_lock_inodes will skip both
+	 * since they are equal. xfs_lock_inodes needs all these inodes
+	 * so that it can unlock and retry if there might be a deadlock
+	 * potential with the log.
+	 */
+
+	if (i_tab[0] == dp2 && lock_mode == XFS_ILOCK_SHARED) {
+#ifdef DEBUG
+		xfs_rename_skip++;
+#endif
+		xfs_lock_inodes(i_tab, *num_inodes, 1, XFS_ILOCK_SHARED);
+	} else {
+#ifdef DEBUG
+		xfs_rename_nskip++;
+#endif
+		xfs_iunlock_map_shared(dp2, lock_mode);
+		xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED);
+	}
+
+	/*
+	 * Set the return value. Null out any unused entries in i_tab.
+	 */
+	*ipp1 = *ipp2 = NULL;
+	for (i=0; i < *num_inodes; i++) {
+		if (i_tab[i]->i_ino == inum1) {
+			*ipp1 = i_tab[i];
+		}
+		if (i_tab[i]->i_ino == inum2) {
+			*ipp2 = i_tab[i];
+		}
+	}
+	for (; i < 4; i++) {
+		i_tab[i] = NULL;
+	}
+	return 0;
+}
+
+
+int rename_which_error_return = 0;
+
+#ifdef DEBUG
+int xfs_rename_agains;
+int xfs_renames;
+#endif
+
+/*
+ * xfs_rename
+ */
+int
+xfs_rename(
+	bhv_desc_t	*src_dir_bdp,
+	struct dentry	*src_dentry,
+	vnode_t		*target_dir_vp,
+	struct dentry	*target_dentry,
+	cred_t		*credp)
+{
+	xfs_trans_t	*tp;
+	xfs_inode_t	*src_dp, *target_dp, *src_ip, *target_ip;
+	xfs_mount_t	*mp;
+	int		new_parent;		/* moving to a new dir */
+	int		src_is_directory;	/* src_name is a directory */
+	int		error;
+	xfs_bmap_free_t free_list;
+	xfs_fsblock_t	first_block;
+	int		cancel_flags;
+	int		committed;
+	xfs_inode_t	*inodes[4];
+	int		target_ip_dropped = 0;	/* dropped target_ip link? */
+	vnode_t		*src_dir_vp;
+	bhv_desc_t	*target_dir_bdp;
+	int		spaceres;
+	int		target_link_zero = 0;
+	int		num_inodes;
+	char		*src_name = (char *)src_dentry->d_name.name;
+	char		*target_name = (char *)target_dentry->d_name.name;
+	int		src_namelen;
+	int		target_namelen;
+#ifdef DEBUG
+	int		retries;
+
+	xfs_renames++;
+#endif
+	src_dir_vp = BHV_TO_VNODE(src_dir_bdp);
+	vn_trace_entry(src_dir_vp, "xfs_rename", (inst_t *)__return_address);
+	vn_trace_entry(target_dir_vp, "xfs_rename", (inst_t *)__return_address);
+
+	/*
+	 * Find the XFS behavior descriptor for the target directory
+	 * vnode since it was not handed to us.
+	 */
+	target_dir_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(target_dir_vp),
+						&xfs_vnodeops);
+	if (target_dir_bdp == NULL) {
+		return XFS_ERROR(EXDEV);
+	}
+	src_namelen = src_dentry->d_name.len;
+	if (src_namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+	target_namelen = target_dentry->d_name.len;
+	if (target_namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+	src_dp = XFS_BHVTOI(src_dir_bdp);
+	target_dp = XFS_BHVTOI(target_dir_bdp);
+	if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) ||
+	    DM_EVENT_ENABLED(target_dir_vp->v_vfsp,
+				target_dp, DM_EVENT_RENAME)) {
+		error = dm_send_namesp_event(DM_EVENT_RENAME,
+					src_dir_bdp, DM_RIGHT_NULL,
+					target_dir_bdp, DM_RIGHT_NULL,
+					src_name, target_name,
+					0, 0, 0);
+		if (error) {
+			return error;
+		}
+	}
+	/* Return through std_return after this point. */
+
+#ifdef DEBUG
+	retries = 0;
+#endif
+	/*
+	 * Lock all the participating inodes. Depending upon whether
+	 * the target_name exists in the target directory, and
+	 * whether the target directory is the same as the source
+	 * directory, we can lock from 2 to 4 inodes.
+	 * xfs_lock_for_rename() will return ENOENT if src_name
+	 * does not exist in the source directory.
+	 */
+	tp = NULL;
+	do {
+		error = xfs_lock_for_rename(src_dp, target_dp, src_dentry,
+				target_dentry, &src_ip, &target_ip, inodes,
+				&num_inodes);
+#ifdef DEBUG
+		if (error == EAGAIN)
+			xfs_rename_agains++;
+#endif
+	} while (error == EAGAIN);
+
+	if (error) {
+		rename_which_error_return = __LINE__;
+		/*
+		 * We have nothing locked, no inode references, and
+		 * no transaction, so just get out.
+		 */
+		goto std_return;
+	}
+
+	ASSERT(src_ip != NULL);
+
+	if ((src_ip->i_d.di_mode & IFMT) == IFDIR) {
+		/*
+		 * Check for link count overflow on target_dp
+		 */
+		if (target_ip == NULL && (src_dp != target_dp) &&
+		    target_dp->i_d.di_nlink >= XFS_MAXLINK) {
+			rename_which_error_return = __LINE__;
+			error = XFS_ERROR(EMLINK);
+			xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
+			goto rele_return;
+		}
+	}
+
+	new_parent = (src_dp != target_dp);
+	src_is_directory = ((src_ip->i_d.di_mode & IFMT) == IFDIR);
+
+	/*
+	 * Drop the locks on our inodes so that we can do the ancestor
+	 * check if necessary and start the transaction.
+	 */
+	xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
+
+	XFS_BMAP_INIT(&free_list, &first_block);
+	mp = src_dp->i_mount;
+	tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen);
+	error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
+	if (error == ENOSPC) {
+		spaceres = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
+	}
+	if (error) {
+		rename_which_error_return = __LINE__;
+		xfs_trans_cancel(tp, 0);
+		goto rele_return;
+	}
+
+	/*
+	 * Attach the dquots to the inodes
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if ((error = xfs_qm_vop_rename_dqattach(inodes))) {
+			xfs_trans_cancel(tp, cancel_flags);
+			rename_which_error_return = __LINE__;
+			goto rele_return;
+		}
+	}
+
+	/*
+	 * Reacquire the inode locks we dropped above.
+	 */
+	xfs_lock_inodes(inodes, num_inodes, 0, XFS_ILOCK_EXCL);
+
+	/*
+	 * Join all the inodes to the transaction. From this point on,
+	 * we can rely on either trans_commit or trans_cancel to unlock
+	 * them.  Note that we need to add a vnode reference to the
+	 * directories since trans_commit & trans_cancel will decrement
+	 * them when they unlock the inodes.  Also, we need to be careful
+	 * not to add an inode to the transaction more than once.
+	 */
+	VN_HOLD(src_dir_vp);
+	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+	if (new_parent) {
+		VN_HOLD(target_dir_vp);
+		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+	}
+	if ((src_ip != src_dp) && (src_ip != target_dp)) {
+		xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
+	}
+	if ((target_ip != NULL) &&
+	    (target_ip != src_ip) &&
+	    (target_ip != src_dp) &&
+	    (target_ip != target_dp)) {
+		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+	}
+
+	/*
+	 * Set up the target.
+	 */
+	if (target_ip == NULL) {
+		/*
+		 * If there's no space reservation, check the entry will
+		 * fit before actually inserting it.
+		 */
+		if (spaceres == 0 &&
+		    (error = XFS_DIR_CANENTER(mp, tp, target_dp, target_name,
+				target_namelen))) {
+			rename_which_error_return = __LINE__;
+			goto error_return;
+		}
+		/*
+		 * If target does not exist and the rename crosses
+		 * directories, adjust the target directory link count
+		 * to account for the ".." reference from the new entry.
+		 */
+		error = XFS_DIR_CREATENAME(mp, tp, target_dp, target_name,
+					   target_namelen, src_ip->i_ino,
+					   &first_block, &free_list, spaceres);
+		if (error == ENOSPC) {
+			rename_which_error_return = __LINE__;
+			goto error_return;
+		}
+		if (error) {
+			rename_which_error_return = __LINE__;
+			goto abort_return;
+		}
+		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		if (new_parent && src_is_directory) {
+			error = xfs_bumplink(tp, target_dp);
+			if (error) {
+				rename_which_error_return = __LINE__;
+				goto abort_return;
+			}
+		}
+	} else { /* target_ip != NULL */
+
+		/*
+		 * If target exists and it's a directory, check that both
+		 * target and source are directories and that target can be
+		 * destroyed, or that neither is a directory.
+		 */
+		if ((target_ip->i_d.di_mode & IFMT) == IFDIR) {
+			/*
+			 * Make sure target dir is empty.
+			 */
+			if (!(XFS_DIR_ISEMPTY(target_ip->i_mount, target_ip)) ||
+			    (target_ip->i_d.di_nlink > 2)) {
+				error = XFS_ERROR(EEXIST);
+				rename_which_error_return = __LINE__;
+				goto error_return;
+			}
+		}
+
+		/*
+		 * Link the source inode under the target name.
+		 * If the source inode is a directory and we are moving
+		 * it across directories, its ".." entry will be
+		 * inconsistent until we replace that down below.
+		 *
+		 * In case there is already an entry with the same
+		 * name at the destination directory, remove it first.
+		 */
+		error = XFS_DIR_REPLACE(mp, tp, target_dp, target_name,
+			target_namelen, src_ip->i_ino, &first_block,
+			&free_list, spaceres);
+		if (error) {
+			rename_which_error_return = __LINE__;
+			goto abort_return;
+		}
+		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		/*
+		 * Decrement the link count on the target since the target
+		 * dir no longer points to it.
+		 */
+		error = xfs_droplink(tp, target_ip);
+		if (error) {
+			rename_which_error_return = __LINE__;
+			goto abort_return;
+		}
+		target_ip_dropped = 1;
+
+		/* Do this test while we still hold the locks */
+		target_link_zero = (target_ip->i_d.di_nlink == 0);
+
+		if (src_is_directory) {
+			/*
+			 * Drop the link from the old "." entry.
+			 */
+			error = xfs_droplink(tp, target_ip);
+			if (error) {
+				rename_which_error_return = __LINE__;
+				goto abort_return;
+			}
+		}
+
+	} /* target_ip != NULL */
+
+	/*
+	 * Remove the source.
+	 */
+	if (new_parent && src_is_directory) {
+
+		/*
+		 * Rewrite the ".." entry to point to the new
+		 * directory.
+		 */
+		error = XFS_DIR_REPLACE(mp, tp, src_ip, "..", 2,
+					target_dp->i_ino, &first_block,
+					&free_list, spaceres);
+		ASSERT(error != EEXIST);
+		if (error) {
+			rename_which_error_return = __LINE__;
+			goto abort_return;
+		}
+		xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	} else {
+		/*
+		 * We always want to hit the ctime on the source inode.
+		 * We do it in the if clause above for the 'new_parent &&
+		 * src_is_directory' case, and here we get all the other
+		 * cases.  This isn't strictly required by the standards
+		 * since the source inode isn't really being changed,
+		 * but old unix file systems did it and some incremental
+		 * backup programs won't work without it.
+		 */
+		xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
+	}
+
+	/*
+	 * Adjust the link count on src_dp.  This is necessary when
+	 * renaming a directory, either within one parent when
+	 * the target existed, or across two parent directories.
+	 */
+	if (src_is_directory && (new_parent || target_ip != NULL)) {
+
+		/*
+		 * Decrement link count on src_directory since the
+		 * entry that's moved no longer points to it.
+		 */
+		error = xfs_droplink(tp, src_dp);
+		if (error) {
+			rename_which_error_return = __LINE__;
+			goto abort_return;
+		}
+	}
+
+	error = XFS_DIR_REMOVENAME(mp, tp, src_dp, src_name, src_namelen,
+			src_ip->i_ino, &first_block, &free_list, spaceres);
+	if (error) {
+		rename_which_error_return = __LINE__;
+		goto abort_return;
+	}
+	xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	/*
+	 * Update the generation counts on all the directory inodes
+	 * that we're modifying.
+	 */
+	src_dp->i_gen++;
+	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+
+	if (new_parent) {
+		target_dp->i_gen++;
+		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+	}
+
+	/*
+	 * If there was a target inode, take an extra reference on
+	 * it here so that it doesn't go to xfs_inactive() from
+	 * within the commit.
+	 */
+	if (target_ip != NULL) {
+		IHOLD(target_ip);
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * rename transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	/*
+	 * Take refs. for vop_link_removed calls below.	 No need to worry
+	 * about directory refs. because the caller holds them.
+	 *
+	 * Do holds before the xfs_bmap_finish since it might rele them down
+	 * to zero.
+	 */
+
+	if (target_ip_dropped)
+		IHOLD(target_ip);
+	IHOLD(src_ip);
+
+	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
+	if (error) {
+		xfs_bmap_cancel(&free_list);
+		xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
+				 XFS_TRANS_ABORT));
+		if (target_ip != NULL) {
+			IRELE(target_ip);
+		}
+		if (target_ip_dropped) {
+			IRELE(target_ip);
+		}
+		IRELE(src_ip);
+		goto std_return;
+	}
+
+	/*
+	 * trans_commit will unlock src_ip, target_ip & decrement
+	 * the vnode references.
+	 */
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (target_ip != NULL) {
+		xfs_refcache_purge_ip(target_ip);
+		IRELE(target_ip);
+	}
+	/*
+	 * Let interposed file systems know about removed links.
+	 */
+	if (target_ip_dropped) {
+		VOP_LINK_REMOVED(XFS_ITOV(target_ip), target_dir_vp,
+					target_link_zero);
+		IRELE(target_ip);
+	}
+
+	FSC_NOTIFY_NAME_CHANGED(XFS_ITOV(src_ip));
+
+	IRELE(src_ip);
+
+	/*
+	 * Fall through to std_return with error = 0 or errno from
+	 * xfs_trans_commit.
+	 */
+std_return:
+	if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_POSTRENAME) ||
+	    DM_EVENT_ENABLED(target_dir_vp->v_vfsp,
+				target_dp, DM_EVENT_POSTRENAME)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTRENAME,
+					src_dir_bdp, DM_RIGHT_NULL,
+					target_dir_bdp, DM_RIGHT_NULL,
+					src_name, target_name,
+					0, error, 0);
+	}
+	return error;
+
+ abort_return:
+	cancel_flags |= XFS_TRANS_ABORT;
+	/* FALLTHROUGH */
+ error_return:
+	xfs_bmap_cancel(&free_list);
+	xfs_trans_cancel(tp, cancel_flags);
+	goto std_return;
+
+ rele_return:
+	IRELE(src_ip);
+	if (target_ip != NULL) {
+		IRELE(target_ip);
+	}
+	goto std_return;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_rtalloc.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_rtalloc.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_rtalloc.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_rtalloc.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,2438 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Free realtime space allocation for XFS.
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Prototypes for internal functions.
+ */
+
+
+STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+		xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *);
+STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int,
+		xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *);
+STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+		xfs_extlen_t, int, xfs_rtblock_t *, int *);
+STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+		xfs_rtblock_t, xfs_rtblock_t *);
+STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+		xfs_rtblock_t, xfs_rtblock_t *);
+STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int,
+		xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *);
+STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+		xfs_extlen_t, int);
+STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
+		xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *);
+
+/*
+ * Internal functions.
+ */
+
+/*
+ * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set.
+ */
+STATIC int
+xfs_lowbit32(
+	__uint32_t	v)
+{
+	return ffs(v)-1;
+}
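+
+/*
+ * For example, xfs_lowbit32(0x18) is 3: 0x18 is binary 11000, ffs()
+ * returns the 1-based index of the lowest set bit (4 here), and the
+ * -1 converts that to a 0-based bit number.  xfs_lowbit32(0) is -1.
+ */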
+
+/*
+ * Allocate space to the bitmap or summary file, and zero it, for growfs.
+ */
+STATIC int				/* error */
+xfs_growfs_rt_alloc(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_extlen_t	oblocks,	/* old count of blocks */
+	xfs_extlen_t	nblocks,	/* new count of blocks */
+	xfs_ino_t	ino)		/* inode number (bitmap/summary) */
+{
+	xfs_fileoff_t	bno;		/* block number in file */
+	xfs_buf_t	*bp;		/* temporary buffer for zeroing */
+	int		cancelflags;	/* flags for xfs_trans_cancel */
+	int		committed;	/* transaction committed flag */
+	xfs_daddr_t	d;		/* disk block address */
+	int		error;		/* error return value */
+	xfs_fsblock_t	firstblock;	/* first block allocated in xaction */
+	xfs_bmap_free_t flist;		/* list of freed blocks */
+	xfs_fsblock_t	fsbno;		/* filesystem block for bno */
+	xfs_inode_t	*ip;		/* pointer to incore inode */
+	xfs_bmbt_irec_t map;		/* block map output */
+	int		nmap;		/* number of block maps */
+	int		resblks;	/* space reservation */
+	xfs_trans_t	*tp;		/* transaction pointer */
+
+	/*
+	 * Allocate space to the file, as necessary.
+	 */
+	while (oblocks < nblocks) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
+		resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
+		cancelflags = 0;
+		/*
+		 * Reserve space & log for one extent added to the file.
+		 */
+		if ((error = xfs_trans_reserve(tp, resblks,
+				XFS_GROWRTALLOC_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES,
+				XFS_DEFAULT_PERM_LOG_COUNT)))
+			goto error_exit;
+		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+		/*
+		 * Lock the inode.
+		 */
+		if ((error = xfs_trans_iget(mp, tp, ino, XFS_ILOCK_EXCL, &ip)))
+			goto error_exit;
+		XFS_BMAP_INIT(&flist, &firstblock);
+		/*
+		 * Allocate blocks to the bitmap file.
+		 */
+		nmap = 1;
+		cancelflags |= XFS_TRANS_ABORT;
+		error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
+			XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
+			resblks, &map, &nmap, &flist);
+		if (!error && nmap < 1)
+			error = XFS_ERROR(ENOSPC);
+		if (error)
+			goto error_exit;
+		/*
+		 * Free any blocks freed up in the transaction, then commit.
+		 */
+		error = xfs_bmap_finish(&tp, &flist, firstblock, &committed);
+		if (error)
+			goto error_exit;
+		xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+		/*
+		 * Now we need to clear the allocated blocks.
+		 * Do this one block per transaction, to keep it simple.
+		 */
+		cancelflags = 0;
+		for (bno = map.br_startoff, fsbno = map.br_startblock;
+		     bno < map.br_startoff + map.br_blockcount;
+		     bno++, fsbno++) {
+			tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
+			/*
+			 * Reserve log for one block zeroing.
+			 */
+			if ((error = xfs_trans_reserve(tp, 0,
+					XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
+				goto error_exit;
+			/*
+			 * Lock the bitmap inode.
+			 */
+			if ((error = xfs_trans_iget(mp, tp, ino, XFS_ILOCK_EXCL,
+					&ip)))
+				goto error_exit;
+			/*
+			 * Get a buffer for the block.
+			 */
+			d = XFS_FSB_TO_DADDR(mp, fsbno);
+			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+				mp->m_bsize, 0);
+			if (bp == NULL) {
+				error = XFS_ERROR(EIO);
+				goto error_exit;
+			}
+			bzero(XFS_BUF_PTR(bp), mp->m_sb.sb_blocksize);
+			xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
+			/*
+			 * Commit the transaction.
+			 */
+			xfs_trans_commit(tp, 0, NULL);
+		}
+		/*
+		 * Go on to the next extent, if any.
+		 */
+		oblocks = map.br_startoff + map.br_blockcount;
+	}
+	return 0;
+error_exit:
+	xfs_trans_cancel(tp, cancelflags);
+	return error;
+}
+
+/*
+ * Attempt to allocate an extent minlen<=len<=maxlen starting from
+ * bitmap block bbno.  If we don't get maxlen then use prod to trim
+ * the length, if given.  Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_block(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_rtblock_t	*nextp,		/* out: next block to try */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	xfs_rtblock_t	besti;		/* best rtblock found so far */
+	xfs_rtblock_t	bestlen;	/* best length found so far */
+	xfs_rtblock_t	end;		/* last rtblock in chunk */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current rtblock trying */
+	xfs_rtblock_t	next;		/* next rtblock to try */
+	int		stat;		/* status from internal calls */
+
+	/*
+	 * Loop over all the extents starting in this bitmap block,
+	 * looking for one that's long enough.
+	 */
+	for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
+		end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
+	     i <= end;
+	     i++) {
+		/*
+		 * See if there's a free extent of maxlen starting at i.
+		 * If it's not so then next will contain the first non-free.
+		 */
+		error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
+		if (error) {
+			return error;
+		}
+		if (stat) {
+			/*
+			 * i for maxlen is all free, allocate and return that.
+			 */
+			error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
+				rsb);
+			if (error) {
+				return error;
+			}
+			*len = maxlen;
+			*rtblock = i;
+			return 0;
+		}
+		/*
+		 * In the case where we have a variable-sized allocation
+		 * request, figure out how big this free piece is,
+		 * and if it's big enough for the minimum, and the best
+		 * so far, remember it.
+		 */
+		if (minlen < maxlen) {
+			xfs_rtblock_t	thislen;	/* this extent size */
+
+			thislen = next - i;
+			if (thislen >= minlen && thislen > bestlen) {
+				besti = i;
+				bestlen = thislen;
+			}
+		}
+		/*
+		 * If not done yet, find the start of the next free space.
+		 */
+		if (next < end) {
+			error = xfs_rtfind_forw(mp, tp, next, end, &i);
+			if (error) {
+				return error;
+			}
+		} else
+			break;
+	}
+	/*
+	 * Searched the whole thing & didn't find a maxlen free extent.
+	 */
+	if (minlen < maxlen && besti != -1) {
+		xfs_extlen_t	p;	/* amount to trim length by */
+
+		/*
+		 * If size should be a multiple of prod, make that so.
+		 */
+		if (prod > 1 && (p = do_mod(bestlen, prod)))
+			bestlen -= p;
+		/*
+		 * Allocate besti for bestlen & return that.
+		 */
+		error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+		*len = bestlen;
+		*rtblock = besti;
+		return 0;
+	}
+	/*
+	 * Allocation failed.  Set *nextp to the next block to try.
+	 */
+	*nextp = next;
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
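+
+/*
+ * A note on the prod trimming above: with, say, minlen = 2,
+ * maxlen = 16 and prod = 4, a best free run of 10 rtextents has
+ * do_mod(10, 4) = 2 trimmed off and 8 are allocated, keeping the
+ * extent a multiple of the requested product factor.
+ */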
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting at block
+ * bno.	 If we don't get maxlen then use prod to trim the length, if given.
+ * Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_exact(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to allocate */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		error;		/* error value */
+	xfs_extlen_t	i;		/* extent length trimmed due to prod */
+	int		isfree;		/* extent is free */
+	xfs_rtblock_t	next;		/* next block to try (dummy) */
+
+	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+	/*
+	 * Check if the range in question (for maxlen) is free.
+	 */
+	error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree);
+	if (error) {
+		return error;
+	}
+	if (isfree) {
+		/*
+		 * If it is, allocate it and return success.
+		 */
+		error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+		*len = maxlen;
+		*rtblock = bno;
+		return 0;
+	}
+	/*
+	 * If not, allocate what there is, if it's at least minlen.
+	 */
+	maxlen = next - bno;
+	if (maxlen < minlen) {
+		/*
+		 * Failed, return failure status.
+		 */
+		*rtblock = NULLRTBLOCK;
+		return 0;
+	}
+	/*
+	 * Trim off tail of extent, if prod is specified.
+	 */
+	if (prod > 1 && (i = maxlen % prod)) {
+		maxlen -= i;
+		if (maxlen < minlen) {
+			/*
+			 * Now we can't do it, return failure status.
+			 */
+			*rtblock = NULLRTBLOCK;
+			return 0;
+		}
+	}
+	/*
+	 * Allocate what we can and return it.
+	 */
+	error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+	if (error) {
+		return error;
+	}
+	*len = maxlen;
+	*rtblock = bno;
+	return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting as near
+ * to bno as possible.	If we don't get maxlen then use prod to trim
+ * the length, if given.  The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_near(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to allocate */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		any;		/* any useful extents from summary */
+	xfs_rtblock_t	bbno;		/* bitmap block number */
+	int		error;		/* error value */
+	int		i;		/* bitmap block offset (loop control) */
+	int		j;		/* secondary loop control */
+	int		log2len;	/* log2 of minlen */
+	xfs_rtblock_t	n;		/* next block to try */
+	xfs_rtblock_t	r;		/* result block */
+
+	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+	/*
+	 * If the block number given is off the end, silently set it to
+	 * the last block.
+	 */
+	if (bno >= mp->m_sb.sb_rextents)
+		bno = mp->m_sb.sb_rextents - 1;
+	/*
+	 * Try the exact allocation first.
+	 */
+	error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
+		rbpp, rsb, prod, &r);
+	if (error) {
+		return error;
+	}
+	/*
+	 * If the exact allocation worked, return that.
+	 */
+	if (r != NULLRTBLOCK) {
+		*rtblock = r;
+		return 0;
+	}
+	bbno = XFS_BITTOBLOCK(mp, bno);
+	i = 0;
+	log2len = xfs_highbit32(minlen);
+	/*
+	 * Loop over all bitmap blocks (bbno + i is current block).
+	 */
+	for (;;) {
+		/*
+		 * Get summary information of extents of all useful levels
+		 * starting in this bitmap block.
+		 */
+		error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
+			bbno + i, rbpp, rsb, &any);
+		if (error) {
+			return error;
+		}
+		/*
+		 * If there are any useful extents starting here, try
+		 * allocating one.
+		 */
+		if (any) {
+			/*
+			 * On the positive side of the starting location.
+			 */
+			if (i >= 0) {
+				/*
+				 * Try to allocate an extent starting in
+				 * this block.
+				 */
+				error = xfs_rtallocate_extent_block(mp, tp,
+					bbno + i, minlen, maxlen, len, &n, rbpp,
+					rsb, prod, &r);
+				if (error) {
+					return error;
+				}
+				/*
+				 * If it worked, return it.
+				 */
+				if (r != NULLRTBLOCK) {
+					*rtblock = r;
+					return 0;
+				}
+			}
+			/*
+			 * On the negative side of the starting location.
+			 */
+			else {		/* i < 0 */
+				/*
+				 * Loop backwards through the bitmap blocks from
+				 * the starting point - 1 down to where we are now.
+				 * There should be an extent which ends in this
+				 * bitmap block and is long enough.
+				 */
+				for (j = -1; j > i; j--) {
+					/*
+					 * Grab the summary information for
+					 * this bitmap block.
+					 */
+					error = xfs_rtany_summary(mp, tp,
+						log2len, mp->m_rsumlevels - 1,
+						bbno + j, rbpp, rsb, &any);
+					if (error) {
+						return error;
+					}
+					/*
+					 * If there's no extent given in the
+					 * summary that means the extent we
+					 * found must carry over from an
+					 * earlier block.  If there is an
+					 * extent given, we've already tried
+					 * that allocation, don't do it again.
+					 */
+					if (any)
+						continue;
+					error = xfs_rtallocate_extent_block(mp,
+						tp, bbno + j, minlen, maxlen,
+						len, &n, rbpp, rsb, prod, &r);
+					if (error) {
+						return error;
+					}
+					/*
+					 * If it works, return the extent.
+					 */
+					if (r != NULLRTBLOCK) {
+						*rtblock = r;
+						return 0;
+					}
+				}
+				/*
+				 * There weren't intervening bitmap blocks
+				 * with a long enough extent, or the
+				 * allocation didn't work for some reason
+				 * (i.e. it's a little too short).
+				 * Try to allocate from the summary block
+				 * that we found.
+				 */
+				error = xfs_rtallocate_extent_block(mp, tp,
+					bbno + i, minlen, maxlen, len, &n, rbpp,
+					rsb, prod, &r);
+				if (error) {
+					return error;
+				}
+				/*
+				 * If it works, return the extent.
+				 */
+				if (r != NULLRTBLOCK) {
+					*rtblock = r;
+					return 0;
+				}
+			}
+		}
+		/*
+		 * Loop control.  If we were on the positive side, and there's
+		 * still more blocks on the negative side, go there.
+		 */
+		if (i > 0 && (int)bbno - i >= 0)
+			i = -i;
+		/*
+		 * If positive, and no more negative, but there are more
+		 * positive, go there.
+		 */
+		else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1)
+			i++;
+		/*
+		 * If negative or 0 (just started), and there are positive
+		 * blocks to go, go there.  The 0 case moves to block 1.
+		 */
+		else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1)
+			i = 1 - i;
+		/*
+		 * If negative or 0 and there are more negative blocks,
+		 * go there.
+		 */
+		else if (i <= 0 && (int)bbno + i > 0)
+			i--;
+		/*
+		 * Must be done.  Return failure.
+		 */
+		else
+			break;
+	}
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
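+
+/*
+ * The loop control above searches bitmap blocks outward from bbno in
+ * the order i = 0, 1, -1, 2, -2, ... (clipped at either end of the
+ * device), so for bbno = 5 the blocks are tried as 5, 6, 4, 7, 3, ...
+ * until an extent is found or both directions are exhausted.
+ */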
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, with no position
+ * specified.  If we don't get maxlen then use prod to trim
+ * the length, if given.  The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_size(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		error;		/* error value */
+	int		i;		/* bitmap block number */
+	int		l;		/* level number (loop control) */
+	xfs_rtblock_t	n;		/* next block to be tried */
+	xfs_rtblock_t	r;		/* result block number */
+	xfs_suminfo_t	sum;		/* summary information for extents */
+
+	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+	/*
+	 * Loop over all the levels starting with maxlen.
+	 * At each level, look at all the bitmap blocks, to see if there
+	 * are extents starting there that are long enough (>= maxlen).
+	 * Note, only on the initial level can the allocation fail if
+	 * the summary says there's an extent.
+	 */
+	for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) {
+		/*
+		 * Loop over all the bitmap blocks.
+		 */
+		for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+			/*
+			 * Get the summary for this level/block.
+			 */
+			error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
+				&sum);
+			if (error) {
+				return error;
+			}
+			/*
+			 * Nothing there, on to the next block.
+			 */
+			if (!sum)
+				continue;
+			/*
+			 * Try allocating the extent.
+			 */
+			error = xfs_rtallocate_extent_block(mp, tp, i, maxlen,
+				maxlen, len, &n, rbpp, rsb, prod, &r);
+			if (error) {
+				return error;
+			}
+			/*
+			 * If it worked, return that.
+			 */
+			if (r != NULLRTBLOCK) {
+				*rtblock = r;
+				return 0;
+			}
+			/*
+			 * If the "next block to try" returned from the
+			 * allocator is beyond the next bitmap block,
+			 * skip to that bitmap block.
+			 */
+			if (XFS_BITTOBLOCK(mp, n) > i + 1)
+				i = XFS_BITTOBLOCK(mp, n) - 1;
+		}
+	}
+	/*
+	 * Didn't find any maxlen blocks.  Try smaller ones, unless
+	 * we're asking for a fixed size extent.
+	 */
+	if (minlen > --maxlen) {
+		*rtblock = NULLRTBLOCK;
+		return 0;
+	}
+	/*
+	 * Loop over sizes, from maxlen down to minlen.
+	 * This time, when we do the allocations, allow smaller ones
+	 * to succeed.
+	 */
+	for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
+		/*
+		 * Loop over all the bitmap blocks, try an allocation
+		 * starting in that block.
+		 */
+		for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+			/*
+			 * Get the summary information for this level/block.
+			 */
+			error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
+						  &sum);
+			if (error) {
+				return error;
+			}
+			/*
+			 * If nothing there, go on to next.
+			 */
+			if (!sum)
+				continue;
+			/*
+			 * Try the allocation.	Make sure the specified
+			 * minlen/maxlen are in the possible range for
+			 * this summary level.
+			 */
+			error = xfs_rtallocate_extent_block(mp, tp, i,
+					XFS_RTMAX(minlen, 1 << l),
+					XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
+					len, &n, rbpp, rsb, prod, &r);
+			if (error) {
+				return error;
+			}
+			/*
+			 * If it worked, return that extent.
+			 */
+			if (r != NULLRTBLOCK) {
+				*rtblock = r;
+				return 0;
+			}
+			/*
+			 * If the "next block to try" returned from the
+			 * allocator is beyond the next bitmap block,
+			 * skip to that bitmap block.
+			 */
+			if (XFS_BITTOBLOCK(mp, n) > i + 1)
+				i = XFS_BITTOBLOCK(mp, n) - 1;
+		}
+	}
+	/*
+	 * Got nothing, return failure.
+	 */
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
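+
+/*
+ * Summary level l counts free extents of length 2^l through
+ * 2^(l+1) - 1 rtextents, which is why the second pass above clamps
+ * the request with XFS_RTMAX(minlen, 1 << l) and
+ * XFS_RTMIN(maxlen, (1 << (l + 1)) - 1): only lengths inside that
+ * window are promised by the level's counters.
+ */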
+
+/*
+ * Mark an extent specified by start and len allocated.
+ * Updates all the summary information as well as the bitmap.
+ */
+STATIC int				/* error */
+xfs_rtallocate_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* start block to allocate */
+	xfs_extlen_t	len,		/* length to allocate */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_rtblock_t	end;		/* end of the allocated extent */
+	int		error;		/* error value */
+	xfs_rtblock_t	postblock;	/* first block allocated > end */
+	xfs_rtblock_t	preblock;	/* first block allocated < start */
+
+	end = start + len - 1;
+	/*
+	 * Assume we're allocating out of the middle of a free extent.
+	 * We need to find the beginning and end of the extent so we can
+	 * properly update the summary.
+	 */
+	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Find the next allocated block (end of free extent).
+	 */
+	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+		&postblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Decrement the summary information corresponding to the entire
+	 * (old) free extent.
+	 */
+	error = xfs_rtmodify_summary(mp, tp,
+		XFS_RTBLOCKLOG(postblock + 1 - preblock),
+		XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+	if (error) {
+		return error;
+	}
+	/*
+	 * If there are blocks not being allocated at the front of the
+	 * old extent, add summary data for them to be free.
+	 */
+	if (preblock < start) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(start - preblock),
+			XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * If there are blocks not being allocated at the end of the
+	 * old extent, add summary data for them to be free.
+	 */
+	if (postblock > end) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(postblock - end),
+			XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * Modify the bitmap to mark this extent allocated.
+	 */
+	error = xfs_rtmodify_range(mp, tp, start, len, 0);
+	return error;
+}
+
+/*
+ * Return whether there are any free extents in the size range given
+ * by low and high, for the bitmap block bbno.
+ */
+STATIC int				/* error */
+xfs_rtany_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		low,		/* low log2 extent size */
+	int		high,		/* high log2 extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	int		*stat)		/* out: any good extents here? */
+{
+	int		error;		/* error value */
+	int		log;		/* loop counter, log2 of ext. size */
+	xfs_suminfo_t	sum;		/* summary data */
+
+	/*
+	 * Loop over logs of extent sizes.  Order is irrelevant.
+	 */
+	for (log = low; log <= high; log++) {
+		/*
+		 * Get one summary datum.
+		 */
+		error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+		if (error) {
+			return error;
+		}
+		/*
+		 * If there are any, return success.
+		 */
+		if (sum) {
+			*stat = 1;
+			return 0;
+		}
+	}
+	/*
+	 * Found nothing, return failure.
+	 */
+	*stat = 0;
+	return 0;
+}
+
+/*
+ * Get a buffer for the bitmap or summary file block specified.
+ * The buffer is returned read and locked.
+ */
+STATIC int				/* error */
+xfs_rtbuf_get(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	block,		/* block number in bitmap or summary */
+	int		issum,		/* is summary not bitmap */
+	xfs_buf_t	**bpp)		/* output: buffer for the block */
+{
+	xfs_buf_t	*bp;		/* block buffer, result */
+	xfs_daddr_t	d;		/* disk addr of block */
+	int		error;		/* error value */
+	xfs_fsblock_t	fsb;		/* fs block number for block */
+	xfs_inode_t	*ip;		/* bitmap or summary inode */
+
+	ip = issum ? mp->m_rsumip : mp->m_rbmip;
+	/*
+	 * Map from the file offset (block) and inode number to the
+	 * file system block.
+	 */
+	error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block);
+	if (error) {
+		return error;
+	}
+	ASSERT(fsb != NULLFSBLOCK);
+	/*
+	 * Convert to disk address for buffer cache.
+	 */
+	d = XFS_FSB_TO_DADDR(mp, fsb);
+	/*
+	 * Read the buffer.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+				   mp->m_bsize, 0, &bp);
+	if (error) {
+		return error;
+	}
+	ASSERT(bp && !XFS_BUF_GETERROR(bp));
+	*bpp = bp;
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that the given extent (block range) is allocated already.
+ */
+STATIC int				/* error */
+xfs_rtcheck_alloc_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		*stat)		/* out: 1 for allocated, 0 for not */
+{
+	xfs_rtblock_t	new;		/* dummy for xfs_rtcheck_range */
+
+	return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat);
+}
+#endif
+
+#ifdef DEBUG
+/*
+ * Check whether the given block in the bitmap has the given value.
+ */
+STATIC int				/* 1 for matches, 0 for not */
+xfs_rtcheck_bit(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* bit (block) to check */
+	int		val)		/* 1 for free, 0 for allocated */
+{
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* pointer into the buffer */
+	/* REFERENCED */
+	int		error;		/* error value */
+	xfs_rtword_t	wdiff;		/* difference between bit & expected */
+	int		word;		/* word number in the buffer */
+	xfs_rtword_t	wval;		/* word value from buffer */
+
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	word = XFS_BITTOWORD(mp, start);
+	bit = (int)(start & (XFS_NBWORD - 1));
+	wval = bufp[word];
+	xfs_trans_brelse(tp, bp);
+	wdiff = (wval ^ -val) & ((xfs_rtword_t)1 << bit);
+	return !wdiff;
+}
+#endif	/* DEBUG */
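+
+/*
+ * The xor trick used by xfs_rtcheck_bit() (and by the range routines
+ * below): -val is all-ones for val = 1 and all-zeroes for val = 0,
+ * so (wval ^ -val) is zero at every bit position that already holds
+ * val; masking with 1 << bit and negating gives 1 exactly when the
+ * tested bit matches.
+ */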
+
+#if 0
+/*
+ * Check that the given extent (block range) is free already.
+ */
+STATIC int				/* error */
+xfs_rtcheck_free_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		*stat)		/* out: 1 for free, 0 for not */
+{
+	xfs_rtblock_t	new;		/* dummy for xfs_rtcheck_range */
+
+	return xfs_rtcheck_range(mp, tp, bno, len, 1, &new, stat);
+}
+#endif
+
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+STATIC int				/* error */
+xfs_rtcheck_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block number of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		val,		/* 1 for free, 0 for allocated */
+	xfs_rtblock_t	*new,		/* out: first block not matching */
+	int		*stat)		/* out: 1 for matches, 0 for not */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	lastbit;	/* last useful bit in word */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute starting bitmap block number
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	/*
+	 * Read the bitmap block.
+	 */
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	/*
+	 * Compute the starting word's address, and starting bit.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	/*
+	 * 0 (allocated) => all zeroes; 1 (free) => all ones.
+	 */
+	val = -val;
+	/*
+	 * If not starting on a word boundary, deal with the first
+	 * (partial) word.
+	 */
+	if (bit) {
+		/*
+		 * Compute first bit not examined.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		/*
+		 * Mask of relevant bits.
+		 */
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ val) & mask)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = XFS_RTLOBIT(wdiff) - bit;
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		}
+		i = lastbit - bit;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ val)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Mask of relevant bits.
+		 */
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ val) & mask)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * Successful, return.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*new = start + i;
+	*stat = 1;
+	return 0;
+}
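+
+/*
+ * Worked example of the first partial word above, with XFS_NBWORD
+ * being 32 (bits per bitmap word): for start bit 5 and len 10,
+ * lastbit = min(5 + 10, 32) = 15 and mask = ((1 << 10) - 1) << 5,
+ * so bits 5..14 of the word are compared and i advances by
+ * lastbit - bit = 10.
+ */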
+
+/*
+ * Copy and transform the summary file, given the old and new
+ * parameters in the mount structures.
+ */
+STATIC int				/* error */
+xfs_rtcopy_summary(
+	xfs_mount_t	*omp,		/* old file system mount point */
+	xfs_mount_t	*nmp,		/* new file system mount point */
+	xfs_trans_t	*tp)		/* transaction pointer */
+{
+	xfs_rtblock_t	bbno;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* summary buffer */
+	int		error;		/* error return value */
+	int		log;		/* summary level number (log length) */
+	xfs_suminfo_t	sum;		/* summary data */
+	xfs_fsblock_t	sumbno;		/* summary block number */
+
+	bp = NULL;
+	for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
+		for (bbno = omp->m_sb.sb_rbmblocks - 1;
+		     (xfs_srtblock_t)bbno >= 0;
+		     bbno--) {
+			error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
+				&sumbno, &sum);
+			if (error)
+				return error;
+			if (sum == 0)
+				continue;
+			error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
+				&bp, &sumbno);
+			if (error)
+				return error;
+			error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
+				&bp, &sumbno);
+			if (error)
+				return error;
+			ASSERT(sum > 0);
+		}
+	}
+	return 0;
+}
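+
+/*
+ * Each non-zero count is moved by subtracting it from the old
+ * summary file and adding it at the same (level, bitmap block)
+ * position in the new one; entries that read back as zero need no
+ * modification and are skipped.
+ */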
+
+/*
+ * Searching backward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+STATIC int				/* error */
+xfs_rtfind_back(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to look at */
+	xfs_rtblock_t	limit,		/* last block to look at */
+	xfs_rtblock_t	*rtblock)	/* out: start block found */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	firstbit;	/* first useful bit in the word */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	len;		/* length of inspected area */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	want;		/* mask for "good" values */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute and read in starting bitmap block for starting block.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	/*
+	 * Get the first word's index & point to it.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	len = start - limit + 1;
+	/*
+	 * Compute match value, based on the bit at start: if 1 (free)
+	 * then all-ones, else all-zeroes.
+	 */
+	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+	/*
+	 * If the starting position is not word-aligned, deal with the
+	 * partial word.
+	 */
+	if (bit < XFS_NBWORD - 1) {
+		/*
+		 * Calculate first (leftmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+		mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+			firstbit;
+		/*
+		 * Calculate the difference between the value there
+		 * and what we're looking for.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different.  Mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = bit - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		}
+		i = bit - firstbit + 1;
+		/*
+		 * Go on to previous block if that's where the previous word is
+		 * and we need the previous word.
+		 */
+		if (--word == -1 && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = XFS_BLOCKWMASK(mp);
+			b = &bufp[word];
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b--;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the previous one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ want)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to previous block if that's where the previous word is
+		 * and we need the previous word.
+		 */
+		if (--word == -1 && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = XFS_BLOCKWMASK(mp);
+			b = &bufp[word];
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b--;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if (len - i) {
+		/*
+		 * Calculate first (leftmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		firstbit = XFS_NBWORD - (len - i);
+		mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * No match, return that we scanned the whole area.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*rtblock = start - i + 1;
+	return 0;
+}
+
+/*
+ * Searching forward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+STATIC int				/* error */
+xfs_rtfind_forw(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to look at */
+	xfs_rtblock_t	limit,		/* last block to look at */
+	xfs_rtblock_t	*rtblock)	/* out: start block found */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	lastbit;	/* last useful bit in the word */
+	xfs_rtblock_t	len;		/* length of inspected area */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	want;		/* mask for "good" values */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute and read in starting bitmap block for starting block.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	/*
+	 * Get the first word's index & point to it.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	len = limit - start + 1;
+	/*
+	 * Compute match value, based on the bit at start: if 1 (free)
+	 * then all-ones, else all-zeroes.
+	 */
+	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+	/*
+	 * If the starting position is not word-aligned, deal with the
+	 * partial word.
+	 */
+	if (bit) {
+		/*
+		 * Calculate last (rightmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Calculate the difference between the value there
+		 * and what we're looking for.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different.  Mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = XFS_RTLOBIT(wdiff) - bit;
+			*rtblock = start + i - 1;
+			return 0;
+		}
+		i = lastbit - bit;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ want)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*rtblock = start + i - 1;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Calculate mask for all the relevant bits in this word.
+		 */
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*rtblock = start + i - 1;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * No match, return that we scanned the whole area.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*rtblock = start + i - 1;
+	return 0;
+}
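+
+/*
+ * In both find routines the want mask is all-ones when the starting
+ * block is free and all-zeroes when it is allocated; the scan stops
+ * at the first word where *b ^ want is non-zero, and XFS_RTLOBIT()
+ * (forward) or XFS_RTHIBIT() (backward) picks out the exact bit at
+ * which the run of same-state blocks ends.
+ */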
+
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+STATIC int				/* error */
+xfs_rtfree_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to free */
+	xfs_extlen_t	len,		/* length to free */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_rtblock_t	end;		/* end of the freed extent */
+	int		error;		/* error value */
+	xfs_rtblock_t	postblock;	/* first block freed > end */
+	xfs_rtblock_t	preblock;	/* first block freed < start */
+
+	end = start + len - 1;
+	/*
+	 * Modify the bitmap to mark this extent freed.
+	 */
+	error = xfs_rtmodify_range(mp, tp, start, len, 1);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Assume we're freeing out of the middle of an allocated extent.
+	 * We need to find the beginning and end of the extent so we can
+	 * properly update the summary.
+	 */
+	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Find the next allocated block (end of allocated extent).
+	 */
+	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+		&postblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * If there are blocks not being freed at the front of the
+	 * old extent, add summary data for them to be allocated.
+	 */
+	if (preblock < start) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(start - preblock),
+			XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * If there are blocks not being freed at the end of the
+	 * old extent, add summary data for them to be allocated.
+	 */
+	if (postblock > end) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(postblock - end),
+			XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * Increment the summary information corresponding to the entire
+	 * (new) free extent.
+	 */
+	error = xfs_rtmodify_summary(mp, tp,
+		XFS_RTBLOCKLOG(postblock + 1 - preblock),
+		XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+	return error;
+}
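+
+/*
+ * Coalescing example: freeing blocks 10..13 next to an existing free
+ * run 14..17 yields preblock = 10 and postblock = 17, so the old
+ * length-4 summary entry for 14..17 is decremented and a single
+ * length-8 entry for the merged extent 10..17 is added.
+ */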
+
+/*
+ * Read and return the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+STATIC int				/* error */
+xfs_rtget_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_suminfo_t	*sum)		/* out: summary info for this block */
+{
+	xfs_buf_t	*bp;		/* buffer for summary block */
+	int		error;		/* error value */
+	xfs_fsblock_t	sb;		/* summary fsblock */
+	int		so;		/* index into the summary file */
+	xfs_suminfo_t	*sp;		/* pointer to returned data */
+
+	/*
+	 * Compute entry number in the summary file.
+	 */
+	so = XFS_SUMOFFS(mp, log, bbno);
+	/*
+	 * Compute the block number in the summary file.
+	 */
+	sb = XFS_SUMOFFSTOBLOCK(mp, so);
+	/*
+	 * If we have an old buffer, and the block number matches, use that.
+	 */
+	if (rbpp && *rbpp && *rsb == sb)
+		bp = *rbpp;
+	/*
+	 * Otherwise we have to get the buffer.
+	 */
+	else {
+		/*
+		 * If there was an old one, get rid of it first.
+		 */
+		if (rbpp && *rbpp)
+			xfs_trans_brelse(tp, *rbpp);
+		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+		if (error) {
+			return error;
+		}
+		/*
+		 * Remember this buffer and block for the next call.
+		 */
+		if (rbpp) {
+			*rbpp = bp;
+			*rsb = sb;
+		}
+	}
+	/*
+	 * Point to the summary information & copy it out.
+	 */
+	sp = XFS_SUMPTR(mp, bp, so);
+	*sum = *sp;
+	/*
+	 * Drop the buffer if we're not asked to remember it.
+	 */
+	if (!rbpp)
+		xfs_trans_brelse(tp, bp);
+	return 0;
+}
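+
+/*
+ * Caching note: callers iterating over many summary entries pass a
+ * persistent rbpp/rsb pair, so consecutive lookups that land in the
+ * same summary block reuse the buffer already held rather than
+ * re-reading it; a NULL rbpp requests a one-shot read that is
+ * released before returning.
+ */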
+
+/*
+ * Set the given range of bitmap bits to the given value.
+ * Do whatever I/O and logging is required.
+ */
+STATIC int				/* error */
+xfs_rtmodify_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to modify */
+	xfs_extlen_t	len,		/* length of extent to modify */
+	int		val)		/* 1 for free, 0 for allocated */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtword_t	*first;		/* first used word in the buffer */
+	int		i;		/* current bit number rel. to start */
+	int		lastbit;	/* last useful bit in word */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute starting bitmap block number.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	/*
+	 * Read the bitmap block, and point to its data.
+	 */
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	/*
+	 * Compute the starting word's address, and starting bit.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	first = b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	/*
+	 * 0 (allocated) => all zeroes; 1 (free) => all ones.
+	 */
+	val = -val;
+	/*
+	 * If not starting on a word boundary, deal with the first
+	 * (partial) word.
+	 */
+	if (bit) {
+		/*
+		 * Compute first bit not changed and mask of relevant bits.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Set/clear the active bits.
+		 */
+		if (val)
+			*b |= mask;
+		else
+			*b &= ~mask;
+		i = lastbit - bit;
+		/*
+		 * Go on to the next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * Log the changed part of this block.
+			 * Get the next one.
+			 */
+			xfs_trans_log_buf(tp, bp,
+				(uint)((char *)first - (char *)bufp),
+				(uint)((char *)b - (char *)bufp));
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Set the word value correctly.
+		 */
+		*b = val;
+		i += XFS_NBWORD;
+		/*
+		 * Go on to the next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * Log the changed part of this block.
+			 * Get the next one.
+			 */
+			xfs_trans_log_buf(tp, bp,
+				(uint)((char *)first - (char *)bufp),
+				(uint)((char *)b - (char *)bufp));
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Compute a mask of relevant bits.
+		 */
+		bit = 0;
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Set/clear the active bits.
+		 */
+		if (val)
+			*b |= mask;
+		else
+			*b &= ~mask;
+		b++;
+	}
+	/*
+	 * Log any remaining changed bytes.
+	 */
+	if (b > first)
+		xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+			(uint)((char *)b - (char *)bufp - 1));
+	return 0;
+}
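+
+/*
+ * Mask example for the partial first word above (illustrative values):
+ * with bit == 3 and len == 7, lastbit == 10 and
+ * mask == ((1 << 7) - 1) << 3 == 0x3f8, so only bits 3 through 9 of
+ * the word are set or cleared.
+ */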
+
+/*
+ * Read and modify the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+STATIC int				/* error */
+xfs_rtmodify_summary(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	int		delta,		/* change to make to summary info */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_buf_t	*bp;		/* buffer for the summary block */
+	int		error;		/* error value */
+	xfs_fsblock_t	sb;		/* summary fsblock */
+	int		so;		/* index into the summary file */
+	xfs_suminfo_t	*sp;		/* pointer to returned data */
+
+	/*
+	 * Compute entry number in the summary file.
+	 */
+	so = XFS_SUMOFFS(mp, log, bbno);
+	/*
+	 * Compute the block number in the summary file.
+	 */
+	sb = XFS_SUMOFFSTOBLOCK(mp, so);
+	/*
+	 * If we have an old buffer, and the block number matches, use that.
+	 */
+	if (rbpp && *rbpp && *rsb == sb)
+		bp = *rbpp;
+	/*
+	 * Otherwise we have to get the buffer.
+	 */
+	else {
+		/*
+		 * If there was an old one, get rid of it first.
+		 */
+		if (rbpp && *rbpp)
+			xfs_trans_brelse(tp, *rbpp);
+		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+		if (error) {
+			return error;
+		}
+		/*
+		 * Remember this buffer and block for the next call.
+		 */
+		if (rbpp) {
+			*rbpp = bp;
+			*rsb = sb;
+		}
+	}
+	/*
+	 * Point to the summary information, modify and log it.
+	 */
+	sp = XFS_SUMPTR(mp, bp, so);
+	*sp += delta;
+	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)),
+		(uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1));
+	return 0;
+}
+
+/*
+ * Visible (exported) functions.
+ */
+
+/*
+ * Grow the realtime area of the filesystem.
+ */
+int
+xfs_growfs_rt(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_growfs_rt_t *in)		/* growfs rt input struct */
+{
+	xfs_rtblock_t	bmbno;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* temporary buffer */
+	int		cancelflags;	/* flags for xfs_trans_cancel */
+	int		error;		/* error return value */
+	xfs_inode_t	*ip;		/* bitmap inode, used as lock */
+	xfs_mount_t	*nmp;		/* new (fake) mount structure */
+	xfs_drfsbno_t	nrblocks;	/* new number of realtime blocks */
+	xfs_extlen_t	nrbmblocks;	/* new number of rt bitmap blocks */
+	xfs_drtbno_t	nrextents;	/* new number of realtime extents */
+	uint8_t		nrextslog;	/* new log2 of sb_rextents */
+	xfs_extlen_t	nrsumblocks;	/* new number of summary blocks */
+	uint		nrsumlevels;	/* new rt summary levels */
+	uint		nrsumsize;	/* new size of rt summary, bytes */
+	xfs_sb_t	*nsbp;		/* new superblock */
+	xfs_extlen_t	rbmblocks;	/* current number of rt bitmap blocks */
+	xfs_extlen_t	rsumblocks;	/* current number of rt summary blks */
+	xfs_sb_t	*sbp;		/* old superblock */
+	xfs_fsblock_t	sumbno;		/* summary block number */
+	xfs_trans_t	*tp;		/* transaction pointer */
+
+	sbp = &mp->m_sb;
+	/*
+	 * Initial error checking.
+	 */
+	if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
+	    (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
+	    (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
+		return XFS_ERROR(EINVAL);
+	/*
+	 * Read in the last block of the device, make sure it exists.
+	 */
+	error = xfs_read_buf(mp, mp->m_rtdev_targp,
+		XFS_FSB_TO_BB(mp, in->newblocks) - 1, 1, 0, &bp);
+	if (error)
+		return error;
+	ASSERT(bp);
+	xfs_buf_relse(bp);
+	/*
+	 * Calculate new parameters.  These are the final values to be reached.
+	 */
+	nrextents = nrblocks;
+	do_div(nrextents, in->extsize);
+	nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
+	nrextslog = xfs_highbit32(nrextents);
+	nrsumlevels = nrextslog + 1;
+	nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
+	nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+	nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+	/*
+	 * New summary size can't be more than half the size of
+	 * the log.  This prevents us from getting a log overflow,
+	 * since we'll log basically the whole summary file at once.
+	 */
+	if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
+		return XFS_ERROR(EINVAL);
+	/*
+	 * Get the old block counts for bitmap and summary inodes.
+	 * These can't change since other growfs callers are locked out.
+	 */
+	rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size);
+	rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size);
+	/*
+	 * Allocate space to the bitmap and summary files, as necessary.
+	 */
+	if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks,
+			mp->m_sb.sb_rbmino)))
+		return error;
+	if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks,
+			mp->m_sb.sb_rsumino)))
+		return error;
+	nmp = NULL;
+	/*
+	 * Loop over the bitmap blocks.
+	 * We will do everything one bitmap block at a time.
+	 * Skip the current block if it is exactly full.
+	 * This also deals with the case where there were no rtextents before.
+	 */
+	for (bmbno = sbp->sb_rbmblocks -
+		     ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
+	     bmbno < nrbmblocks;
+	     bmbno++) {
+		/*
+		 * Allocate a new (fake) mount/sb.
+		 */
+		nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
+		*nmp = *mp;
+		nsbp = &nmp->m_sb;
+		/*
+		 * Calculate new sb and mount fields for this round.
+		 */
+		nsbp->sb_rextsize = in->extsize;
+		nsbp->sb_rbmblocks = bmbno + 1;
+		nsbp->sb_rblocks =
+			XFS_RTMIN(nrblocks,
+				  nsbp->sb_rbmblocks * NBBY *
+				  nsbp->sb_blocksize * nsbp->sb_rextsize);
+		nsbp->sb_rextents = nsbp->sb_rblocks;
+		do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
+		nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
+		nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
+		nrsumsize =
+			(uint)sizeof(xfs_suminfo_t) * nrsumlevels *
+			nsbp->sb_rbmblocks;
+		nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+		nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+		/*
+		 * Start a transaction, get the log reservation.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
+		cancelflags = 0;
+		if ((error = xfs_trans_reserve(tp, 0,
+				XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
+			goto error_exit;
+		/*
+		 * Lock out other callers by grabbing the bitmap inode lock.
+		 */
+		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino,
+				XFS_ILOCK_EXCL, &ip)))
+			goto error_exit;
+		ASSERT(ip == mp->m_rbmip);
+		/*
+		 * Update the bitmap inode's size.
+		 */
+		mp->m_rbmip->i_d.di_size =
+			nsbp->sb_rbmblocks * nsbp->sb_blocksize;
+		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+		cancelflags |= XFS_TRANS_ABORT;
+		/*
+		 * Get the summary inode into the transaction.
+		 */
+		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino,
+				XFS_ILOCK_EXCL, &ip)))
+			goto error_exit;
+		ASSERT(ip == mp->m_rsumip);
+		/*
+		 * Update the summary inode's size.
+		 */
+		mp->m_rsumip->i_d.di_size = nmp->m_rsumsize;
+		xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE);
+		/*
+		 * Copy summary data from old to new sizes.
+		 * Do this when the real size (not block-aligned) changes.
+		 */
+		if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
+		    mp->m_rsumlevels != nmp->m_rsumlevels) {
+			error = xfs_rtcopy_summary(mp, nmp, tp);
+			if (error)
+				goto error_exit;
+		}
+		/*
+		 * Update superblock fields.
+		 */
+		if (nsbp->sb_rextsize != sbp->sb_rextsize)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
+				nsbp->sb_rextsize - sbp->sb_rextsize);
+		if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
+				nsbp->sb_rbmblocks - sbp->sb_rbmblocks);
+		if (nsbp->sb_rblocks != sbp->sb_rblocks)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
+				nsbp->sb_rblocks - sbp->sb_rblocks);
+		if (nsbp->sb_rextents != sbp->sb_rextents)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
+				nsbp->sb_rextents - sbp->sb_rextents);
+		if (nsbp->sb_rextslog != sbp->sb_rextslog)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
+				nsbp->sb_rextslog - sbp->sb_rextslog);
+		/*
+		 * Free new extent.
+		 */
+		bp = NULL;
+		error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
+			nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
+		if (error)
+			goto error_exit;
+		/*
+		 * Mark more blocks free in the superblock.
+		 */
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS,
+			nsbp->sb_rextents - sbp->sb_rextents);
+		/*
+		 * Free the fake mp structure.
+		 */
+		kmem_free(nmp, sizeof(*nmp));
+		nmp = NULL;
+		/*
+		 * Update mp values into the real mp structure.
+		 */
+		mp->m_rsumlevels = nrsumlevels;
+		mp->m_rsumsize = nrsumsize;
+		/*
+		 * Commit the transaction.
+		 */
+		xfs_trans_commit(tp, 0, NULL);
+	}
+	return 0;
+
+	/*
+	 * Error paths come here.
+	 */
+error_exit:
+	if (nmp)
+		kmem_free(nmp, sizeof(*nmp));
+	xfs_trans_cancel(tp, cancelflags);
+	return error;
+}
+
+/*
+ * Allocate an extent in the realtime subvolume, with the usual allocation
+ * parameters.	The length units are all in realtime extents, as is the
+ * result block number.
+ */
+int					/* error */
+xfs_rtallocate_extent(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to allocate */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_alloctype_t type,		/* allocation type XFS_ALLOCTYPE... */
+	int		wasdel,		/* was a delayed allocation extent */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		error;		/* error value */
+	xfs_inode_t	*ip;		/* inode for bitmap file */
+	xfs_mount_t	*mp;		/* file system mount structure */
+	xfs_rtblock_t	r;		/* result allocated block */
+	xfs_fsblock_t	sb;		/* summary file block number */
+	xfs_buf_t	*sumbp;		/* summary file block buffer */
+
+	ASSERT(minlen > 0 && minlen <= maxlen);
+	mp = tp->t_mountp;
+	/*
+	 * If prod is set then figure out what to do to minlen and maxlen.
+	 */
+	if (prod > 1) {
+		xfs_extlen_t	i;
+
+		if ((i = maxlen % prod))
+			maxlen -= i;
+		if ((i = minlen % prod))
+			minlen += prod - i;
+		if (maxlen < minlen) {
+			*rtblock = NULLRTBLOCK;
+			return 0;
+		}
+	}
+	/*
+	 * Lock out other callers by grabbing the bitmap inode lock.
+	 */
+	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, XFS_ILOCK_EXCL, &ip);
+	if (error) {
+		return error;
+	}
+	sumbp = NULL;
+	/*
+	 * Allocate by size, or near another block, or exactly at some block.
+	 */
+	switch (type) {
+	case XFS_ALLOCTYPE_ANY_AG:
+		error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
+				&sumbp, &sb, prod, &r);
+		break;
+	case XFS_ALLOCTYPE_NEAR_BNO:
+		error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
+				len, &sumbp, &sb, prod, &r);
+		break;
+	case XFS_ALLOCTYPE_THIS_BNO:
+		error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen,
+				len, &sumbp, &sb, prod, &r);
+		break;
+	default:
+		ASSERT(0);
+	}
+	if (error) {
+		return error;
+	}
+	/*
+	 * If it worked, update the superblock.
+	 */
+	if (r != NULLRTBLOCK) {
+		long	slen = (long)*len;
+
+		ASSERT(*len >= minlen && *len <= maxlen);
+		if (wasdel)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
+		else
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
+	}
+	*rtblock = r;
+	return 0;
+}
+
+/*
+ * Free an extent in the realtime subvolume.  Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int					/* error */
+xfs_rtfree_extent(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to free */
+	xfs_extlen_t	len)		/* length of extent freed */
+{
+	int		error;		/* error value */
+	xfs_inode_t	*ip;		/* bitmap file inode */
+	xfs_mount_t	*mp;		/* file system mount structure */
+	xfs_fsblock_t	sb;		/* summary file block number */
+	xfs_buf_t	*sumbp;		/* summary file block buffer */
+
+	mp = tp->t_mountp;
+	/*
+	 * Synchronize by locking the bitmap inode.
+	 */
+	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, XFS_ILOCK_EXCL, &ip);
+	if (error) {
+		return error;
+	}
+#if defined(__KERNEL__) && defined(DEBUG)
+	/*
+	 * Check to see that this whole range is currently allocated.
+	 */
+	{
+		int	stat;		/* result from checking range */
+
+		error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat);
+		if (error) {
+			return error;
+		}
+		ASSERT(stat);
+	}
+#endif
+	sumbp = NULL;
+	/*
+	 * Free the range of realtime blocks.
+	 */
+	error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Mark more blocks free in the superblock.
+	 */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+	/*
+	 * If we've now freed all the blocks, reset the file sequence
+	 * number to 0.
+	 */
+	if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+	    mp->m_sb.sb_rextents) {
+		if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
+			ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+		*(__uint64_t *)&ip->i_d.di_atime = 0;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	}
+	return 0;
+}
+
+/*
+ * Initialize realtime fields in the mount structure.
+ */
+int				/* error */
+xfs_rtmount_init(
+	xfs_mount_t	*mp)	/* file system mount structure */
+{
+	xfs_buf_t	*bp;	/* buffer for last block of subvolume */
+	xfs_daddr_t	d;	/* address of last block of subvolume */
+	int		error;	/* error return value */
+	xfs_sb_t	*sbp;	/* filesystem superblock copy in mount */
+
+	sbp = &mp->m_sb;
+	if (sbp->sb_rblocks == 0)
+		return 0;
+	if (mp->m_rtdev_targp == NULL) {
+		printk(KERN_WARNING
+		"XFS: This FS has an RT subvol - specify -o rtdev on mount\n");
+		return XFS_ERROR(ENODEV);
+	}
+	mp->m_rsumlevels = sbp->sb_rextslog + 1;
+	mp->m_rsumsize =
+		(uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels *
+		sbp->sb_rbmblocks;
+	mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize);
+	mp->m_rbmip = mp->m_rsumip = NULL;
+	/*
+	 * Check that the realtime section is an ok size.
+	 */
+	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
+		printk(KERN_WARNING "XFS: RT mount - %llu != %llu\n",
+			(unsigned long long) XFS_BB_TO_FSB(mp, d),
+			(unsigned long long) mp->m_sb.sb_rblocks);
+		return XFS_ERROR(E2BIG);
+	}
+	error = xfs_read_buf(mp, mp->m_rtdev_targp, d - 1, 1, 0, &bp);
+	if (error) {
+		printk(KERN_WARNING
+			"XFS: RT mount - xfs_read_buf returned %d\n", error);
+		if (error == ENOSPC)
+			return XFS_ERROR(E2BIG);
+		return error;
+	}
+	xfs_buf_relse(bp);
+	return 0;
+}
+
+/*
+ * Get the bitmap and summary inodes into the mount structure
+ * at mount time.
+ */
+int					/* error */
+xfs_rtmount_inodes(
+	xfs_mount_t	*mp)		/* file system mount structure */
+{
+	int		error;		/* error return value */
+	xfs_sb_t	*sbp;
+
+	sbp = &mp->m_sb;
+	if (sbp->sb_rbmino == NULLFSINO)
+		return 0;
+	error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, &mp->m_rbmip, 0);
+	if (error)
+		return error;
+	ASSERT(mp->m_rbmip != NULL);
+	ASSERT(sbp->sb_rsumino != NULLFSINO);
+	error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, &mp->m_rsumip, 0);
+	if (error) {
+		vnode_t		*rbmvp;		/* vnode for bitmap file */
+		vmap_t		vmap;		/* vmap to delete vnode */
+
+		rbmvp = XFS_ITOV(mp->m_rbmip);
+		VMAP(rbmvp, mp->m_rbmip, vmap);
+		VN_RELE(rbmvp);
+		vn_purge(rbmvp, &vmap);
+		return error;
+	}
+	ASSERT(mp->m_rsumip != NULL);
+	return 0;
+}
+
+/*
+ * Pick an extent for allocation at the start of a new realtime file.
+ * Use the sequence number stored in the atime field of the bitmap inode.
+ * Translate this to a fraction of the rtextents, and return the product
+ * of rtextents and the fraction.
+ * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
+ */
+int					/* error */
+xfs_rtpick_extent(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_extlen_t	len,		/* allocation length (rtextents) */
+	xfs_rtblock_t	*pick)		/* result rt extent */
+{
+	xfs_rtblock_t	b;		/* result block */
+	int		error;		/* error return value */
+	xfs_inode_t	*ip;		/* bitmap incore inode */
+	int		log2;		/* log of sequence number */
+	__uint64_t	resid;		/* residual after log removed */
+	__uint64_t	seq;		/* sequence number of file creation */
+	__uint64_t	*seqp;		/* pointer to seqno in inode */
+
+	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, XFS_ILOCK_EXCL, &ip);
+	if (error)
+		return error;
+	ASSERT(ip == mp->m_rbmip);
+	seqp = (__uint64_t *)&ip->i_d.di_atime;
+	if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
+		ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+		*seqp = 0;
+	}
+	seq = *seqp;
+	if ((log2 = xfs_highbit64(seq)) == -1)
+		b = 0;
+	else {
+		resid = seq - (1ULL << log2);
+		b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >>
+		    (log2 + 1);
+		if (b >= mp->m_sb.sb_rextents)
+			b = do_mod(b, mp->m_sb.sb_rextents);
+		if (b + len > mp->m_sb.sb_rextents)
+			b = mp->m_sb.sb_rextents - len;
+	}
+	*seqp = seq + 1;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	*pick = b;
+	return 0;
+}
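+
+/*
+ * Example of the sequence above (illustrative): seq == 5 gives
+ * log2 == 2 and resid == 1, so b == sb_rextents * 3 / 8; successive
+ * files thus start at fractions 0, 1/2, 1/4, 3/4, 1/8, 3/8, 5/8,
+ * 7/8, ... of the realtime area, spreading allocations out.
+ */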
+
+#ifdef DEBUG
+/*
+ * Debug code: print out the value of a range in the bitmap.
+ */
+void
+xfs_rtprint_range(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to print */
+	xfs_extlen_t	len)		/* length to print */
+{
+	xfs_extlen_t	i;		/* block number in the extent */
+
+	printk("%Ld: ", (long long)start);
+	for (i = 0; i < len; i++)
+		printk("%d", xfs_rtcheck_bit(mp, tp, start + i, 1));
+	printk("\n");
+}
+
+/*
+ * Debug code: print the summary file.
+ */
+void
+xfs_rtprint_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp)		/* transaction pointer */
+{
+	xfs_suminfo_t	c;		/* summary data */
+	xfs_rtblock_t	i;		/* bitmap block number */
+	int		l;		/* summary information level */
+	int		p;		/* flag for printed anything */
+	xfs_fsblock_t	sb;		/* summary block number */
+	xfs_buf_t	*sumbp;		/* summary block buffer */
+
+	sumbp = NULL;
+	for (l = 0; l < mp->m_rsumlevels; l++) {
+		for (p = 0, i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+			(void)xfs_rtget_summary(mp, tp, l, i, &sumbp, &sb, &c);
+			if (c) {
+				if (!p) {
+					printk("%Ld-%Ld:", 1LL << l,
+						XFS_RTMIN((1LL << l) +
+							  ((1LL << l) - 1LL),
+							 mp->m_sb.sb_rextents));
+					p = 1;
+				}
+				printk(" %Ld:%d", (long long)i, c);
+			}
+		}
+		if (p)
+			printk("\n");
+	}
+	if (sumbp)
+		xfs_trans_brelse(tp, sumbp);
+}
+#endif	/* DEBUG */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_rtalloc.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_rtalloc.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_rtalloc.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_rtalloc.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_RTALLOC_H__
+#define __XFS_RTALLOC_H__
+
+struct xfs_mount;
+struct xfs_trans;
+
+/* Min and max rt extent sizes, specified in bytes */
+#define XFS_MAX_RTEXTSIZE	(1024 * 1024 * 1024)	/* 1GB */
+#define XFS_DFL_RTEXTSIZE	(64 * 1024)		/* 64KB */
+#define XFS_MIN_RTEXTSIZE	(4 * 1024)		/* 4KB */
+
+/*
+ * Constants for bit manipulations.
+ */
+#define XFS_NBBYLOG	3		/* log2(NBBY) */
+#define XFS_WORDLOG	2		/* log2(sizeof(xfs_rtword_t)) */
+#define XFS_NBWORDLOG	(XFS_NBBYLOG + XFS_WORDLOG)
+#define XFS_NBWORD	(1 << XFS_NBWORDLOG)
+#define XFS_WORDMASK	((1 << XFS_WORDLOG) - 1)
+
+#define XFS_BLOCKSIZE(mp)	((mp)->m_sb.sb_blocksize)
+#define XFS_BLOCKMASK(mp)	((mp)->m_blockmask)
+#define XFS_BLOCKWSIZE(mp)	((mp)->m_blockwsize)
+#define XFS_BLOCKWMASK(mp)	((mp)->m_blockwmask)
+
+/*
+ * Summary and bit manipulation macros.
+ */
+#define XFS_SUMOFFS(mp,ls,bb)	((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+#define XFS_SUMOFFSTOBLOCK(mp,s)	\
+	(((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_SUMPTR(mp,bp,so)	\
+	((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \
+		(((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+
+#define XFS_BITTOBLOCK(mp,bi)	((bi) >> (mp)->m_blkbit_log)
+#define XFS_BLOCKTOBIT(mp,bb)	((bb) << (mp)->m_blkbit_log)
+#define XFS_BITTOWORD(mp,bi)	\
+	((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
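+
+/*
+ * Layout example (illustrative, assuming 4096-byte blocks and a
+ * 4-byte xfs_suminfo_t): the summary file stores one counter per
+ * (log2 extent size, bitmap block) pair, row-major by size, so with
+ * sb_rbmblocks == 100 the entry for (ls == 2, bb == 10) is index
+ * 210, which XFS_SUMOFFSTOBLOCK() places at byte 840 of summary
+ * block 0.
+ */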
+
+#define XFS_RTMIN(a,b)	((a) < (b) ? (a) : (b))
+#define XFS_RTMAX(a,b)	((a) > (b) ? (a) : (b))
+
+#define XFS_RTLOBIT(w)	xfs_lowbit32(w)
+#define XFS_RTHIBIT(w)	xfs_highbit32(w)
+
+#if XFS_BIG_FILESYSTEMS
+#define XFS_RTBLOCKLOG(b)	xfs_highbit64(b)
+#else
+#define XFS_RTBLOCKLOG(b)	xfs_highbit32(b)
+#endif
+
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_XFS_RT
+/*
+ * Function prototypes for exported functions.
+ */
+
+/*
+ * Allocate an extent in the realtime subvolume, with the usual allocation
+ * parameters.	The length units are all in realtime extents, as is the
+ * result block number.
+ */
+int					/* error */
+xfs_rtallocate_extent(
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_rtblock_t		bno,	/* starting block number to allocate */
+	xfs_extlen_t		minlen, /* minimum length to allocate */
+	xfs_extlen_t		maxlen, /* maximum length to allocate */
+	xfs_extlen_t		*len,	/* out: actual length allocated */
+	xfs_alloctype_t		type,	/* allocation type XFS_ALLOCTYPE... */
+	int			wasdel, /* was a delayed allocation extent */
+	xfs_extlen_t		prod,	/* extent product factor */
+	xfs_rtblock_t		*rtblock); /* out: start block allocated */
+
+/*
+ * Free an extent in the realtime subvolume.  Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int					/* error */
+xfs_rtfree_extent(
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_rtblock_t		bno,	/* starting block number to free */
+	xfs_extlen_t		len);	/* length of extent freed */
+
+/*
+ * Initialize realtime fields in the mount structure.
+ */
+int					/* error */
+xfs_rtmount_init(
+	struct xfs_mount	*mp);	/* file system mount structure */
+
+/*
+ * Get the bitmap and summary inodes into the mount structure
+ * at mount time.
+ */
+int					/* error */
+xfs_rtmount_inodes(
+	struct xfs_mount	*mp);	/* file system mount structure */
+
+/*
+ * Pick an extent for allocation at the start of a new realtime file.
+ * Use the sequence number stored in the atime field of the bitmap inode.
+ * Translate this to a fraction of the rtextents, and return the product
+ * of rtextents and the fraction.
+ * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
+ */
+int					/* error */
+xfs_rtpick_extent(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_extlen_t		len,	/* allocation length (rtextents) */
+	xfs_rtblock_t		*pick); /* result rt extent */
+
+/*
+ * Debug code: print out the value of a range in the bitmap.
+ */
+void
+xfs_rtprint_range(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_rtblock_t		start,	/* starting block to print */
+	xfs_extlen_t		len);	/* length to print */
+
+/*
+ * Debug code: print the summary file.
+ */
+void
+xfs_rtprint_summary(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	struct xfs_trans	*tp);	/* transaction pointer */
+
+/*
+ * Grow the realtime area of the filesystem.
+ */
+int
+xfs_growfs_rt(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	xfs_growfs_rt_t		*in);	/* user supplied growfs struct */
+
+#else
+# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb)	(ENOSYS)
+# define xfs_rtfree_extent(t,b,l)			(ENOSYS)
+# define xfs_rtpick_extent(m,t,l,rb)			(ENOSYS)
+# define xfs_growfs_rt(mp,in)				(ENOSYS)
+# define xfs_rtmount_init(mp)	(((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtmount_inodes(mp) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+#endif	/* CONFIG_XFS_RT */
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_RTALLOC_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_rw.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_rw.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_rw.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_rw.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,753 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+/*
+ * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
+ * which clears the setuid and setgid bits when a file is written.
+ */
+int
+xfs_write_clear_setuid(
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp;
+	xfs_trans_t	*tp;
+	int		error;
+
+	mp = ip->i_mount;
+	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+	if ((error = xfs_trans_reserve(tp, 0,
+				      XFS_WRITEID_LOG_RES(mp),
+				      0, 0, 0))) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	ip->i_d.di_mode &= ~ISUID;
+
+	/*
+	 * Note that we don't have to worry about mandatory
+	 * file locking being disabled here because we only
+	 * clear the ISGID bit if the Group execute bit is
+	 * on, but if it was on then mandatory locking wouldn't
+	 * have been enabled.
+	 */
+	if (ip->i_d.di_mode & (IEXEC >> 3)) {
+		ip->i_d.di_mode &= ~ISGID;
+	}
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0, NULL);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+}
+
+/*
+ * Force a shutdown of the filesystem instantly while keeping
+ * the filesystem consistent. We don't do an unmount here; just shutdown
+ * the shop, make sure that absolutely nothing persistent happens to
+ * this filesystem after this point.
+ */
+
+void
+xfs_do_force_shutdown(
+	bhv_desc_t	*bdp,
+	int		flags,
+	char		*fname,
+	int		lnnum)
+{
+	int		logerror;
+	xfs_mount_t	*mp;
+
+	mp = XFS_BHVTOM(bdp);
+	logerror = flags & XFS_LOG_IO_ERROR;
+
+	if (!(flags & XFS_FORCE_UMOUNT)) {
+		cmn_err(CE_NOTE,
+		"xfs_force_shutdown(%s,0x%x) called from line %d of file %s.  Return address = 0x%x",
+			mp->m_fsname,flags,lnnum,fname,__return_address);
+	}
+	/*
+	 * No need to duplicate efforts.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
+		return;
+
+	/*
+	 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
+	 * queue up anybody new on the log reservations, and wakes up
+	 * everybody who's sleeping on log reservations and tells
+	 * them the bad news.
+	 */
+	if (xfs_log_force_umount(mp, logerror))
+		return;
+
+	if (flags & XFS_CORRUPT_INCORE) {
+		cmn_err(CE_ALERT,
+    "Corruption of in-memory data detected.  Shutting down filesystem: %s",
+			mp->m_fsname);
+	} else if (!(flags & XFS_FORCE_UMOUNT)) {
+		if (logerror) {
+			cmn_err(CE_ALERT,
+			"Log I/O Error Detected.  Shutting down filesystem: %s",
+				mp->m_fsname);
+		} else if (!(flags & XFS_SHUTDOWN_REMOTE_REQ)) {
+			cmn_err(CE_ALERT,
+				"I/O Error Detected.  Shutting down filesystem: %s",
+				mp->m_fsname);
+		}
+	}
+	if (!(flags & XFS_FORCE_UMOUNT)) {
+		cmn_err(CE_ALERT,
+		"Please umount the filesystem, and rectify the problem(s)");
+	}
+}
+
+
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call biodone
+ * so that the proper iodone callbacks get called.
+ */
+int
+xfs_bioerror(
+	xfs_buf_t *bp)
+{
+
+#ifdef XFSERRORDEBUG
+	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+	/*
+	 * No need to wait until the buffer is unpinned.
+	 * We aren't flushing it.
+	 */
+	xfs_buftrace("XFS IOERROR", bp);
+	XFS_BUF_ERROR(bp, EIO);
+	/*
+	 * We're calling biodone, so delete B_DONE flag. Either way
+	 * we have to call the iodone callback, and calling biodone
+	 * probably is the best way since it takes care of
+	 * GRIO as well.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_UNDONE(bp);
+	XFS_BUF_STALE(bp);
+
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	xfs_biodone(bp);
+
+	return (EIO);
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the biodone call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+int
+xfs_bioerror_relse(
+	xfs_buf_t *bp)
+{
+	int64_t fl;
+
+	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
+	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
+
+	xfs_buftrace("XFS IOERRELSE", bp);
+	fl = XFS_BUF_BFLAGS(bp);
+	/*
+	 * No need to wait until the buffer is unpinned.
+	 * We aren't flushing it.
+	 *
+	 * chunkhold expects B_DONE to be set, whether
+	 * we actually finish the I/O or not. We don't want to
+	 * change that interface.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_DONE(bp);
+	XFS_BUF_STALE(bp);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	if (!(fl & XFS_B_ASYNC)) {
+		/*
+		 * Mark b_error and B_ERROR _both_.
+		 * Lots of chunkcache code assumes that.
+		 * There's no reason to mark error for
+		 * ASYNC buffers.
+		 */
+		XFS_BUF_ERROR(bp, EIO);
+		XFS_BUF_V_IODONESEMA(bp);
+	} else {
+		xfs_buf_relse(bp);
+	}
+	return (EIO);
+}
+/*
+ * Prints out an ALERT message about I/O error.
+ */
+void
+xfs_ioerror_alert(
+	char			*func,
+	struct xfs_mount	*mp,
+	xfs_buf_t		*bp,
+	xfs_daddr_t		blkno)
+{
+	cmn_err(CE_ALERT,
+ "I/O error in filesystem (\"%s\") meta-data dev 0x%x block 0x%llx\n"
+ "	 (\"%s\") error %d buf count %u",
+		(!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
+		XFS_BUF_TARGET_DEV(bp),
+		(__uint64_t)blkno,
+		func,
+		XFS_BUF_GETERROR(bp),
+		XFS_BUF_COUNT(bp));
+}
+
+/*
+ * This isn't an absolute requirement, but it is
+ * just a good idea to call xfs_read_buf instead of
+ * directly doing a read_buf call. For one, we shouldn't
+ * be doing this disk read if we are in SHUTDOWN state anyway,
+ * so this stops that from happening. Secondly, this does all
+ * the error checking stuff and the brelse if appropriate for
+ * the caller, so the code can be a little leaner.
+ */
+
+int
+xfs_read_buf(
+	struct xfs_mount *mp,
+	xfs_buftarg_t	 *target,
+	xfs_daddr_t	 blkno,
+	int		 len,
+	uint		 flags,
+	xfs_buf_t	 **bpp)
+{
+	xfs_buf_t	 *bp;
+	int		 error;
+
+	if (flags)
+		bp = xfs_buf_read_flags(target, blkno, len, flags);
+	else
+		bp = xfs_buf_read(target, blkno, len, flags);
+	if (!bp)
+		return XFS_ERROR(EIO);
+	error = XFS_BUF_GETERROR(bp);
+	if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) {
+		*bpp = bp;
+	} else {
+		*bpp = NULL;
+		if (error) {
+			xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp));
+		} else {
+			error = XFS_ERROR(EIO);
+		}
+		if (bp) {
+			XFS_BUF_UNDONE(bp);
+			XFS_BUF_UNDELAYWRITE(bp);
+			XFS_BUF_STALE(bp);
+			/*
+			 * brelse clears B_ERROR and b_error
+			 */
+			xfs_buf_relse(bp);
+		}
+	}
+	return (error);
+}
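+
+/*
+ * Typical caller pattern (sketch): on success *bpp holds a locked
+ * buffer that the caller must release with xfs_buf_relse(); on
+ * failure *bpp is NULL and any buffer has already been marked stale
+ * and released here.
+ */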
+
+/*
+ * Wrapper around bwrite() so that we can trap
+ * write errors, and act accordingly.
+ */
+int
+xfs_bwrite(
+	struct xfs_mount *mp,
+	struct xfs_buf	 *bp)
+{
+	int	error;
+
+	/*
+	 * XXXsup how does this work for quotas.
+	 */
+	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
+	XFS_BUF_SET_FSPRIVATE3(bp, mp);
+	XFS_BUF_WRITE(bp);
+
+	if ((error = XFS_bwrite(bp))) {
+		ASSERT(mp);
+		/*
+		 * Cannot put a buftrace here since if the buffer is not
+		 * B_HOLD then we will brelse() the buffer before returning
+		 * from bwrite and we could be tracing a buffer that has
+		 * been reused.
+		 */
+		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+	}
+	return (error);
+}
+
+/*
+ * xfs_inval_cached_pages()
+ * This routine is responsible for keeping direct I/O and buffered I/O
+ * somewhat coherent.  From here we make sure that we're at least
+ * temporarily holding the inode I/O lock exclusively and then call
+ * the page cache to flush and invalidate any cached pages.  If there
+ * are no cached pages this routine will be very quick.
+ */
+void
+xfs_inval_cached_pages(
+	vnode_t		*vp,
+	xfs_iocore_t	*io,
+	xfs_off_t	offset,
+	int		write,
+	int		relock)
+{
+	xfs_mount_t	*mp;
+
+	if (!VN_CACHED(vp)) {
+		return;
+	}
+
+	mp = io->io_mount;
+
+	/*
+	 * We need to get the I/O lock exclusively in order
+	 * to safely invalidate pages and mappings.
+	 */
+	if (relock) {
+		XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED);
+		XFS_ILOCK(mp, io, XFS_IOLOCK_EXCL);
+	}
+
+	/* Writing beyond EOF creates a hole that must be zeroed */
+	if (write && (offset > XFS_SIZE(mp, io))) {
+		xfs_fsize_t	isize;
+
+		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+		isize = XFS_SIZE(mp, io);
+		if (offset > isize) {
+			xfs_zero_eof(vp, io, offset, isize, offset, NULL);
+		}
+		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+	}
+
+	VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED);
+	if (relock) {
+		XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
+	}
+}
+
+
+
+spinlock_t	xfs_refcache_lock = SPIN_LOCK_UNLOCKED;
+xfs_inode_t	**xfs_refcache;
+int		xfs_refcache_size;
+int		xfs_refcache_index;
+int		xfs_refcache_busy;
+int		xfs_refcache_count;
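+
+/*
+ * The refcache behaves as a simple ring: xfs_refcache_index points
+ * at the oldest slot, each insert releases whatever inode occupies
+ * that slot, stores the new one, and advances the index modulo
+ * xfs_refcache_size.
+ */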
+
+/*
+ * Timer callback to mark SB dirty, make sure we keep purging refcache
+ */
+void
+xfs_refcache_sbdirty(struct super_block *sb)
+{
+	sb->s_dirt = 1;
+}
+
+/*
+ * Insert the given inode into the reference cache.
+ */
+void
+xfs_refcache_insert(
+	xfs_inode_t	*ip)
+{
+	vnode_t		*vp;
+	xfs_inode_t	*release_ip;
+	xfs_inode_t	**refcache;
+
+	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE));
+
+	/*
+	 * If an unmount is busy blowing entries out of the cache,
+	 * then don't bother.
+	 */
+	if (xfs_refcache_busy) {
+		return;
+	}
+
+	/*
+	 * If we tuned the refcache down to zero, don't do anything.
+	 */
+	if (!xfs_refcache_size) {
+		return;
+	}
+
+	/*
+	 * The inode is already in the refcache, so don't bother
+	 * with it.
+	 */
+	if (ip->i_refcache != NULL) {
+		return;
+	}
+
+	vp = XFS_ITOV(ip);
+	/* ASSERT(vp->v_count > 0); */
+	VN_HOLD(vp);
+
+	/*
+	 * We allocate the reference cache on use so that we don't
+	 * waste the memory on systems not being used as NFS servers.
+	 */
+	if (xfs_refcache == NULL) {
+		refcache = (xfs_inode_t **)kmem_zalloc(XFS_REFCACHE_SIZE_MAX *
+						       sizeof(xfs_inode_t *),
+						       KM_SLEEP);
+	} else {
+		refcache = NULL;
+	}
+
+	spin_lock(&xfs_refcache_lock);
+
+	/*
+	 * If we allocated memory for the refcache above and it still
+	 * needs it, then use the memory we allocated.	Otherwise we'll
+	 * free the memory below.
+	 */
+	if (refcache != NULL) {
+		if (xfs_refcache == NULL) {
+			xfs_refcache = refcache;
+			refcache = NULL;
+		}
+	}
+
+	/*
+	 * If an unmount is busy clearing out the cache, don't add new
+	 * entries to it.
+	 */
+	if (xfs_refcache_busy) {
+		spin_unlock(&xfs_refcache_lock);
+		VN_RELE(vp);
+		/*
+		 * If we allocated memory for the refcache above but someone
+		 * else beat us to using it, then free the memory now.
+		 */
+		if (refcache != NULL) {
+			kmem_free(refcache,
+				  XFS_REFCACHE_SIZE_MAX * sizeof(xfs_inode_t *));
+		}
+		return;
+	}
+	release_ip = xfs_refcache[xfs_refcache_index];
+	if (release_ip != NULL) {
+		release_ip->i_refcache = NULL;
+		xfs_refcache_count--;
+		ASSERT(xfs_refcache_count >= 0);
+	}
+	xfs_refcache[xfs_refcache_index] = ip;
+	ASSERT(ip->i_refcache == NULL);
+	ip->i_refcache = &(xfs_refcache[xfs_refcache_index]);
+	xfs_refcache_count++;
+	ASSERT(xfs_refcache_count <= xfs_refcache_size);
+	xfs_refcache_index++;
+	if (xfs_refcache_index == xfs_refcache_size) {
+		xfs_refcache_index = 0;
+	}
+	spin_unlock(&xfs_refcache_lock);
+
+	/*
+	 * Save the pointer to the inode to be released so that we can
+	 * VN_RELE it once we've dropped our inode locks in xfs_rwunlock().
+	 * The pointer may be NULL, but that's OK.
+	 */
+	ip->i_release = release_ip;
+
+	/*
+	 * If we allocated memory for the refcache above but someone
+	 * else beat us to using it, then free the memory now.
+	 */
+	if (refcache != NULL) {
+		kmem_free(refcache,
+			  XFS_REFCACHE_SIZE_MAX * sizeof(xfs_inode_t *));
+	}
+	return;
+}
+
+
+/*
+ * If the given inode is in the reference cache, purge its entry and
+ * release the reference on the vnode.
+ */
+void
+xfs_refcache_purge_ip(
+	xfs_inode_t	*ip)
+{
+	vnode_t *vp;
+	int	error;
+
+	/*
+	 * If we're not pointing to our entry in the cache, then
+	 * we must not be in the cache.
+	 */
+	if (ip->i_refcache == NULL) {
+		return;
+	}
+
+	spin_lock(&xfs_refcache_lock);
+	if (ip->i_refcache == NULL) {
+		spin_unlock(&xfs_refcache_lock);
+		return;
+	}
+
+	/*
+	 * Clear both our pointer to the cache entry and its pointer
+	 * back to us.
+	 */
+	ASSERT(*(ip->i_refcache) == ip);
+	*(ip->i_refcache) = NULL;
+	ip->i_refcache = NULL;
+	xfs_refcache_count--;
+	ASSERT(xfs_refcache_count >= 0);
+	spin_unlock(&xfs_refcache_lock);
+
+	vp = XFS_ITOV(ip);
+	/* ASSERT(vp->v_count > 1); */
+	VOP_RELEASE(vp, error);
+	VN_RELE(vp);
+
+	return;
+}
+
+
+/*
+ * This is called from the XFS unmount code to purge all entries for the
+ * given mount from the cache.	It uses the refcache busy counter to
+ * make sure that new entries are not added to the cache as we purge them.
+ */
+void
+xfs_refcache_purge_mp(
+	xfs_mount_t	*mp)
+{
+	vnode_t		*vp;
+	int		error, i;
+	xfs_inode_t	*ip;
+
+	if (xfs_refcache == NULL) {
+		return;
+	}
+
+	spin_lock(&xfs_refcache_lock);
+	/*
+	 * Bumping the busy counter keeps new entries from being added
+	 * to the cache.  We use a counter since multiple unmounts could
+	 * be in here simultaneously.
+	 */
+	xfs_refcache_busy++;
+
+	for (i = 0; i < xfs_refcache_size; i++) {
+		ip = xfs_refcache[i];
+		if ((ip != NULL) && (ip->i_mount == mp)) {
+			xfs_refcache[i] = NULL;
+			ip->i_refcache = NULL;
+			xfs_refcache_count--;
+			ASSERT(xfs_refcache_count >= 0);
+			spin_unlock(&xfs_refcache_lock);
+			vp = XFS_ITOV(ip);
+			VOP_RELEASE(vp, error);
+			VN_RELE(vp);
+			spin_lock(&xfs_refcache_lock);
+		}
+	}
+
+	xfs_refcache_busy--;
+	ASSERT(xfs_refcache_busy >= 0);
+	spin_unlock(&xfs_refcache_lock);
+}
+
+
+/*
+ * This is called from the XFS sync code to ensure that the refcache
+ * is emptied out over time.  We purge a small number of entries with
+ * each call.
+ */
+void
+xfs_refcache_purge_some(xfs_mount_t *mp)
+{
+	int		error, i;
+	xfs_inode_t	*ip;
+	int		iplist_index;
+	xfs_inode_t	**iplist;
+	int		purge_count;
+
+	if ((xfs_refcache == NULL) || (xfs_refcache_count == 0)) {
+		return;
+	}
+
+	iplist_index = 0;
+	purge_count = xfs_params.refcache_purge;
+	iplist = (xfs_inode_t **)kmem_zalloc(purge_count *
+					  sizeof(xfs_inode_t *), KM_SLEEP);
+
+	spin_lock(&xfs_refcache_lock);
+
+	/*
+	 * Store any inodes we find in the next several entries
+	 * into the iplist array to be released after dropping
+	 * the spinlock.  We always start looking from the currently
+	 * oldest place in the cache.  We move the refcache index
+	 * forward as we go so that we are sure to eventually clear
+	 * out the entire cache when the system goes idle.
+	 */
+	for (i = 0; i < purge_count; i++) {
+		ip = xfs_refcache[xfs_refcache_index];
+		if (ip != NULL) {
+			xfs_refcache[xfs_refcache_index] = NULL;
+			ip->i_refcache = NULL;
+			xfs_refcache_count--;
+			ASSERT(xfs_refcache_count >= 0);
+			iplist[iplist_index] = ip;
+			iplist_index++;
+		}
+		xfs_refcache_index++;
+		if (xfs_refcache_index == xfs_refcache_size) {
+			xfs_refcache_index = 0;
+		}
+	}
+
+	spin_unlock(&xfs_refcache_lock);
+
+	/*
+	 * If there are still entries in the refcache,
+	 * set timer to mark the SB dirty to make sure that
+	 * we hit sync even if filesystem is idle, so that we'll
+	 * purge some more later.
+	 */
+	if (xfs_refcache_count) {
+		del_timer_sync(&mp->m_sbdirty_timer);
+		mp->m_sbdirty_timer.data =
+		    (unsigned long)LINVFS_GET_IP(XFS_ITOV(mp->m_rootip))->i_sb;
+		mp->m_sbdirty_timer.expires = jiffies + 2*HZ;
+		add_timer(&mp->m_sbdirty_timer);
+	}
+
+	/*
+	 * Now drop the inodes we collected.
+	 */
+	for (i = 0; i < iplist_index; i++) {
+		VOP_RELEASE(XFS_ITOV(iplist[i]), error);
+		VN_RELE(XFS_ITOV(iplist[i]));
+	}
+
+	kmem_free(iplist, purge_count *
+			  sizeof(xfs_inode_t *));
+}
+
+/*
+ * This is called when the refcache is dynamically resized
+ * via a sysctl.
+ *
+ * If the new size is smaller than the old size, purge all
+ * entries in slots greater than the new size, and move
+ * the index if necessary.
+ *
+ * If the refcache hasn't even been allocated yet, or the
+ * new size is larger than the old size, just set the value
+ * of xfs_refcache_size.
+ */
+
+void
+xfs_refcache_resize(int xfs_refcache_new_size)
+{
+	int		i;
+	xfs_inode_t	*ip;
+	int		iplist_index = 0;
+	xfs_inode_t	**iplist;
+	int		error;
+
+	/*
+	 * If the new size is smaller than the current size,
+	 * purge entries to create smaller cache, and
+	 * reposition index if necessary.
+	 * Don't bother if no refcache yet.
+	 */
+	if (xfs_refcache && (xfs_refcache_new_size < xfs_refcache_size)) {
+
+		iplist = (xfs_inode_t **)kmem_zalloc(XFS_REFCACHE_SIZE_MAX *
+				sizeof(xfs_inode_t *), KM_SLEEP);
+
+		spin_lock(&xfs_refcache_lock);
+
+		for (i = xfs_refcache_new_size; i < xfs_refcache_size; i++) {
+			ip = xfs_refcache[i];
+			if (ip != NULL) {
+				xfs_refcache[i] = NULL;
+				ip->i_refcache = NULL;
+				xfs_refcache_count--;
+				ASSERT(xfs_refcache_count >= 0);
+				iplist[iplist_index] = ip;
+				iplist_index++;
+			}
+		}
+
+		xfs_refcache_size = xfs_refcache_new_size;
+
+		/*
+		 * Move index to beginning of cache if it's now past the end
+		 */
+		if (xfs_refcache_index >= xfs_refcache_new_size)
+			xfs_refcache_index = 0;
+
+		spin_unlock(&xfs_refcache_lock);
+
+		/*
+		 * Now drop the inodes we collected.
+		 */
+		for (i = 0; i < iplist_index; i++) {
+			VOP_RELEASE(XFS_ITOV(iplist[i]), error);
+			VN_RELE(XFS_ITOV(iplist[i]));
+		}
+
+		kmem_free(iplist, XFS_REFCACHE_SIZE_MAX *
+				  sizeof(xfs_inode_t *));
+	} else {
+		spin_lock(&xfs_refcache_lock);
+		xfs_refcache_size = xfs_refcache_new_size;
+		spin_unlock(&xfs_refcache_lock);
+	}
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_rw.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_rw.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_rw.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_rw.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_RW_H__
+#define __XFS_RW_H__
+
+struct bhv_desc;
+struct bmapval;
+struct xfs_buf;
+struct cred;
+struct uio;
+struct vnode;
+struct xfs_inode;
+struct xfs_iocore;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dio;
+struct pm;
+
+/*
+ * Maximum count of bmaps used by read and write paths.
+ */
+#define XFS_MAX_RW_NBMAPS	4
+
+/*
+ * Counts of readahead buffers to use based on physical memory size.
+ * None of these should be more than XFS_MAX_RW_NBMAPS.
+ */
+#define XFS_RW_NREADAHEAD_16MB	2
+#define XFS_RW_NREADAHEAD_32MB	3
+#define XFS_RW_NREADAHEAD_K32	4
+#define XFS_RW_NREADAHEAD_K64	4
+
+/*
+ * Maximum size of a buffer that we'll map.  Making this
+ * too big will degrade performance due to the number of
+ * pages which need to be gathered.  Making it too small
+ * will prevent us from doing large I/O's to hardware that
+ * needs it.
+ *
+ * This is currently set to 512 KB.
+ */
+#define XFS_MAX_BMAP_LEN_BB	1024
+#define XFS_MAX_BMAP_LEN_BYTES	524288
+
+/*
+ * Maximum size (in inodes) for the nfs refcache
+ */
+#define XFS_REFCACHE_SIZE_MAX	512
+
+
+/*
+ * Convert the given file system block to a disk block.
+ * We have to treat it differently based on whether the
+ * file is a real time file or not, because the bmap code
+ * does.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_DB)
+xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+#define XFS_FSB_TO_DB(ip,fsb)	xfs_fsb_to_db(ip,fsb)
+#else
+#define XFS_FSB_TO_DB(ip,fsb) \
+		(((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) ? \
+		 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
+		 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)))
+#endif
+
+#define XFS_FSB_TO_DB_IO(io,fsb) \
+		(((io)->io_flags & XFS_IOCORE_RT) ? \
+		 XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \
+		 XFS_FSB_TO_DADDR((io)->io_mount, (fsb)))
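+
+/*
+ * Note: realtime files address the realtime subvolume directly, so
+ * converting a file system block just shifts it to basic blocks,
+ * while ordinary files go through XFS_FSB_TO_DADDR() to decompose
+ * the block number into its allocation group and offset first.
+ */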
+
+/*
+ * Defines for the trace mechanisms in xfs_rw.c.
+ */
+#define XFS_RW_KTRACE_SIZE	64
+#define XFS_STRAT_KTRACE_SIZE	64
+#define XFS_STRAT_GTRACE_SIZE	512
+
+#define XFS_READ_ENTER		1
+#define XFS_WRITE_ENTER		2
+#define XFS_IOMAP_READ_ENTER	3
+#define XFS_IOMAP_WRITE_ENTER	4
+#define XFS_IOMAP_READ_MAP	5
+#define XFS_IOMAP_WRITE_MAP	6
+#define XFS_IOMAP_WRITE_NOSPACE 7
+#define XFS_ITRUNC_START	8
+#define XFS_ITRUNC_FINISH1	9
+#define XFS_ITRUNC_FINISH2	10
+#define XFS_CTRUNC1		11
+#define XFS_CTRUNC2		12
+#define XFS_CTRUNC3		13
+#define XFS_CTRUNC4		14
+#define XFS_CTRUNC5		15
+#define XFS_CTRUNC6		16
+#define XFS_BUNMAPI		17
+#define XFS_INVAL_CACHED	18
+#define XFS_DIORD_ENTER		19
+#define XFS_DIOWR_ENTER		20
+
+#if defined(XFS_ALL_TRACE)
+#define XFS_RW_TRACE
+#define XFS_STRAT_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_RW_TRACE
+#undef XFS_STRAT_TRACE
+#endif
+
+/*
+ * Prototypes for functions in xfs_rw.c.
+ */
+
+int
+xfs_write_clear_setuid(
+	struct xfs_inode	*ip);
+
+int
+xfs_bwrite(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp);
+
+void
+xfs_inval_cached_pages(
+	struct vnode		*vp,
+	struct xfs_iocore	*io,
+	xfs_off_t		offset,
+	int			write,
+	int			relock);
+
+void
+xfs_refcache_insert(
+	struct xfs_inode	*ip);
+
+void
+xfs_refcache_purge_ip(
+	struct xfs_inode	*ip);
+
+void
+xfs_refcache_purge_mp(
+	struct xfs_mount	*mp);
+
+void
+xfs_refcache_purge_some(
+	struct xfs_mount	*mp);
+
+void
+xfs_refcache_resize(
+	int			xfs_refcache_new_size);
+
+int
+xfs_bioerror(
+	struct xfs_buf		*b);
+
+/*
+ * XFS I/O core functions
+ */
+extern int xfs_bioerror_relse(struct xfs_buf *);
+
+
+/*
+ * Needed by xfs_rw.c
+ */
+int
+xfs_rwlock(
+	bhv_desc_t	*bdp,
+	vrwlock_t	write_lock);
+
+void
+xfs_rwunlock(
+	bhv_desc_t	*bdp,
+	vrwlock_t	write_lock);
+
+int
+xfs_read_buf(
+	struct xfs_mount *mp,
+	xfs_buftarg_t	 *target,
+	xfs_daddr_t	 blkno,
+	int		 len,
+	uint		 flags,
+	struct xfs_buf	 **bpp);
+
+void
+xfs_ioerror_alert(
+	char			*func,
+	struct xfs_mount	*mp,
+	xfs_buf_t		*bp,
+	xfs_daddr_t		blkno);
+
+#endif /* __XFS_RW_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_sb.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_sb.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_sb.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_sb.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SB_H__
+#define __XFS_SB_H__
+
+/*
+ * Super block
+ * Fits into a 512-byte buffer at daddr_t 0 of each allocation group.
+ * Only the first of these is ever updated except during growfs.
+ */
+
+struct xfs_buf;
+struct xfs_mount;
+
+#define XFS_SB_MAGIC		0x58465342	/* 'XFSB' */
+#define XFS_SB_VERSION_1	1		/* 5.3, 6.0.1, 6.1 */
+#define XFS_SB_VERSION_2	2		/* 6.2 - attributes */
+#define XFS_SB_VERSION_3	3		/* 6.2 - new inode version */
+#define XFS_SB_VERSION_4	4		/* 6.2+ - bitmask version */
+#define XFS_SB_VERSION_NUMBITS		0x000f
+#define XFS_SB_VERSION_ALLFBITS		0xfff0
+#define XFS_SB_VERSION_SASHFBITS	0xf000
+#define XFS_SB_VERSION_REALFBITS	0x0ff0
+#define XFS_SB_VERSION_ATTRBIT		0x0010
+#define XFS_SB_VERSION_NLINKBIT		0x0020
+#define XFS_SB_VERSION_QUOTABIT		0x0040
+#define XFS_SB_VERSION_ALIGNBIT		0x0080
+#define XFS_SB_VERSION_DALIGNBIT	0x0100
+#define XFS_SB_VERSION_SHAREDBIT	0x0200
+#define XFS_SB_VERSION_LOGV2BIT		0x0400
+#define XFS_SB_VERSION_EXTFLGBIT	0x1000
+#define XFS_SB_VERSION_DIRV2BIT		0x2000
+#define XFS_SB_VERSION_OKSASHFBITS	\
+	(XFS_SB_VERSION_EXTFLGBIT | \
+	 XFS_SB_VERSION_DIRV2BIT)
+#define XFS_SB_VERSION_OKREALFBITS	\
+	(XFS_SB_VERSION_ATTRBIT | \
+	 XFS_SB_VERSION_NLINKBIT | \
+	 XFS_SB_VERSION_QUOTABIT | \
+	 XFS_SB_VERSION_ALIGNBIT | \
+	 XFS_SB_VERSION_DALIGNBIT | \
+	 XFS_SB_VERSION_SHAREDBIT | \
+	 XFS_SB_VERSION_LOGV2BIT)
+#define XFS_SB_VERSION_OKSASHBITS	\
+	(XFS_SB_VERSION_NUMBITS | \
+	 XFS_SB_VERSION_REALFBITS | \
+	 XFS_SB_VERSION_OKSASHFBITS)
+#define XFS_SB_VERSION_OKREALBITS	\
+	(XFS_SB_VERSION_NUMBITS | \
+	 XFS_SB_VERSION_OKREALFBITS | \
+	 XFS_SB_VERSION_OKSASHFBITS)
+#define XFS_SB_VERSION_MKFS(ia,dia,extflag,dirv2,na)	\
+	(((ia) || (dia) || (extflag) || (dirv2) || (na)) ? \
+		(XFS_SB_VERSION_4 | \
+		 ((ia) ? XFS_SB_VERSION_ALIGNBIT : 0) | \
+		 ((dia) ? XFS_SB_VERSION_DALIGNBIT : 0) | \
+		 ((extflag) ? XFS_SB_VERSION_EXTFLGBIT : 0) | \
+		 ((dirv2) ? XFS_SB_VERSION_DIRV2BIT : 0) | \
+		 ((na) ? XFS_SB_VERSION_LOGV2BIT : 0)) : \
+		XFS_SB_VERSION_1)
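+/*
+ * Worked example: a filesystem made with inode alignment, the extent
+ * flag feature and v2 directories gets
+ *	XFS_SB_VERSION_4 | XFS_SB_VERSION_ALIGNBIT |
+ *	XFS_SB_VERSION_EXTFLGBIT | XFS_SB_VERSION_DIRV2BIT
+ *	  = 0x0004 | 0x0080 | 0x1000 | 0x2000 = 0x3084
+ * in sb_versionnum; masking with XFS_SB_VERSION_NUMBITS (0x000f)
+ * recovers the base version number, 4.
+ */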
+
+typedef struct xfs_sb
+{
+	__uint32_t	sb_magicnum;	/* magic number == XFS_SB_MAGIC */
+	__uint32_t	sb_blocksize;	/* logical block size, bytes */
+	xfs_drfsbno_t	sb_dblocks;	/* number of data blocks */
+	xfs_drfsbno_t	sb_rblocks;	/* number of realtime blocks */
+	xfs_drtbno_t	sb_rextents;	/* number of realtime extents */
+	uuid_t		sb_uuid;	/* file system unique id */
+	xfs_dfsbno_t	sb_logstart;	/* starting block of log if internal */
+	xfs_ino_t	sb_rootino;	/* root inode number */
+	xfs_ino_t	sb_rbmino;	/* bitmap inode for realtime extents */
+	xfs_ino_t	sb_rsumino;	/* summary inode for rt bitmap */
+	xfs_agblock_t	sb_rextsize;	/* realtime extent size, blocks */
+	xfs_agblock_t	sb_agblocks;	/* size of an allocation group */
+	xfs_agnumber_t	sb_agcount;	/* number of allocation groups */
+	xfs_extlen_t	sb_rbmblocks;	/* number of rt bitmap blocks */
+	xfs_extlen_t	sb_logblocks;	/* number of log blocks */
+	__uint16_t	sb_versionnum;	/* header version == XFS_SB_VERSION */
+	__uint16_t	sb_sectsize;	/* volume sector size, bytes */
+	__uint16_t	sb_inodesize;	/* inode size, bytes */
+	__uint16_t	sb_inopblock;	/* inodes per block */
+	char		sb_fname[12];	/* file system name */
+	__uint8_t	sb_blocklog;	/* log2 of sb_blocksize */
+	__uint8_t	sb_sectlog;	/* log2 of sb_sectsize */
+	__uint8_t	sb_inodelog;	/* log2 of sb_inodesize */
+	__uint8_t	sb_inopblog;	/* log2 of sb_inopblock */
+	__uint8_t	sb_agblklog;	/* log2 of sb_agblocks (rounded up) */
+	__uint8_t	sb_rextslog;	/* log2 of sb_rextents */
+	__uint8_t	sb_inprogress;	/* mkfs is in progress, don't mount */
+	__uint8_t	sb_imax_pct;	/* max % of fs for inode space */
+					/* statistics */
+	/*
+	 * These fields must remain contiguous.	 If you really
+	 * want to change their layout, make sure you fix the
+	 * code in xfs_trans_apply_sb_deltas().
+	 */
+	__uint64_t	sb_icount;	/* allocated inodes */
+	__uint64_t	sb_ifree;	/* free inodes */
+	__uint64_t	sb_fdblocks;	/* free data blocks */
+	__uint64_t	sb_frextents;	/* free realtime extents */
+	/*
+	 * End contiguous fields.
+	 */
+	xfs_ino_t	sb_uquotino;	/* user quota inode */
+	xfs_ino_t	sb_gquotino;	/* group quota inode */
+	__uint16_t	sb_qflags;	/* quota flags */
+	__uint8_t	sb_flags;	/* misc. flags */
+	__uint8_t	sb_shared_vn;	/* shared version number */
+	xfs_extlen_t	sb_inoalignmt;	/* inode chunk alignment, fsblocks */
+	__uint32_t	sb_unit;	/* stripe or raid unit */
+	__uint32_t	sb_width;	/* stripe or raid width */
+	__uint8_t	sb_dirblklog;	/* log2 of dir block size (fsbs) */
+	__uint8_t	sb_dummy[3];	/* padding */
+	__uint32_t	sb_logsunit;	/* stripe unit size for the log */
+} xfs_sb_t;
+
+/*
+ * Sequence number values for the fields.
+ */
+typedef enum {
+	XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
+	XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
+	XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
+	XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
+	XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
+	XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
+	XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
+	XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
+	XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
+	XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
+	XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
+	XFS_SBS_DUMMY, XFS_SBS_LOGSUNIT,
+	XFS_SBS_FIELDCOUNT
+} xfs_sb_field_t;
+
+/*
+ * Mask values, defined based on the xfs_sb_field_t values.
+ * Only define the ones we're using.
+ */
+#define XFS_SB_MVAL(x)		(1LL << XFS_SBS_ ## x)
+#define XFS_SB_UUID		XFS_SB_MVAL(UUID)
+#define XFS_SB_FNAME		XFS_SB_MVAL(FNAME)
+#define XFS_SB_ROOTINO		XFS_SB_MVAL(ROOTINO)
+#define XFS_SB_RBMINO		XFS_SB_MVAL(RBMINO)
+#define XFS_SB_RSUMINO		XFS_SB_MVAL(RSUMINO)
+#define XFS_SB_VERSIONNUM	XFS_SB_MVAL(VERSIONNUM)
+#define XFS_SB_UQUOTINO		XFS_SB_MVAL(UQUOTINO)
+#define XFS_SB_GQUOTINO		XFS_SB_MVAL(GQUOTINO)
+#define XFS_SB_QFLAGS		XFS_SB_MVAL(QFLAGS)
+#define XFS_SB_SHARED_VN	XFS_SB_MVAL(SHARED_VN)
+#define XFS_SB_UNIT		XFS_SB_MVAL(UNIT)
+#define XFS_SB_WIDTH		XFS_SB_MVAL(WIDTH)
+#define XFS_SB_NUM_BITS		((int)XFS_SBS_FIELDCOUNT)
+#define XFS_SB_ALL_BITS		((1LL << XFS_SB_NUM_BITS) - 1)
+#define XFS_SB_MOD_BITS		\
+	(XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
+	 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
+	 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH)
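+/*
+ * Example: XFS_SBS_UUID is the sixth enumerator above (value 5), so
+ * XFS_SB_UUID == (1LL << 5) == 0x20.  XFS_SB_MOD_BITS is simply the
+ * OR of the per-field masks defined here.
+ */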
+
+/*
+ * Misc. Flags - warning - these will be cleared by xfs_repair unless
+ * a feature bit is set when the flag is used.
+ */
+#define XFS_SBF_NOFLAGS		0x00	/* no flags set */
+#define XFS_SBF_READONLY	0x01	/* only read-only mounts allowed */
+
+/*
+ * define max. shared version we can interoperate with
+ */
+#define XFS_SB_MAX_SHARED_VN	0
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_NUM)
+int xfs_sb_version_num(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_NUM(sbp) xfs_sb_version_num(sbp)
+#else
+#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_GOOD_VERSION)
+int xfs_sb_good_version(xfs_sb_t *sbp);
+#define XFS_SB_GOOD_VERSION(sbp)	xfs_sb_good_version(sbp)
+#else
+#define XFS_SB_GOOD_VERSION_INT(sbp)	\
+	((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \
+	  ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \
+	 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	  !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS)
+#ifdef __KERNEL__
+#define XFS_SB_GOOD_VERSION(sbp)	\
+	(XFS_SB_GOOD_VERSION_INT(sbp) && \
+	  (sbp)->sb_shared_vn <= XFS_SB_MAX_SHARED_VN) ))
+#else
+/*
+ * The extra 2 parens here (( are there to unconfuse paren-matching
+ * editors like vi, because XFS_SB_GOOD_VERSION_INT is a partial
+ * expression and each of the two XFS_SB_GOOD_VERSION definitions
+ * supplies 2 more close parens to complete it.
+ */
+#define XFS_SB_GOOD_VERSION(sbp)	\
+	(XFS_SB_GOOD_VERSION_INT(sbp) && \
+	  (!((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \
+	   (sbp)->sb_shared_vn <= XFS_SB_MAX_SHARED_VN)) ))
+#endif /* __KERNEL__ */
+#endif
+
+#define XFS_SB_GOOD_SASH_VERSION(sbp)	\
+	((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \
+	  ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \
+	 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	  !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKSASHBITS)))
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_TONEW)
+unsigned xfs_sb_version_tonew(unsigned v);
+#define XFS_SB_VERSION_TONEW(v) xfs_sb_version_tonew(v)
+#else
+#define XFS_SB_VERSION_TONEW(v) \
+	((((v) == XFS_SB_VERSION_1) ? \
+		0 : \
+		(((v) == XFS_SB_VERSION_2) ? \
+			XFS_SB_VERSION_ATTRBIT : \
+			(XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \
+	 XFS_SB_VERSION_4)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_TOOLD)
+unsigned xfs_sb_version_toold(unsigned v);
+#define XFS_SB_VERSION_TOOLD(v) xfs_sb_version_toold(v)
+#else
+#define XFS_SB_VERSION_TOOLD(v) \
+	(((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \
+		0 : \
+		(((v) & XFS_SB_VERSION_NLINKBIT) ? \
+			XFS_SB_VERSION_3 : \
+			(((v) & XFS_SB_VERSION_ATTRBIT) ?  \
+				XFS_SB_VERSION_2 : \
+				XFS_SB_VERSION_1)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASATTR)
+int xfs_sb_version_hasattr(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASATTR(sbp)	xfs_sb_version_hasattr(sbp)
+#else
+#define XFS_SB_VERSION_HASATTR(sbp)	\
+	(((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \
+	 ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
+	 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	  ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDATTR)
+void xfs_sb_version_addattr(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDATTR(sbp)	xfs_sb_version_addattr(sbp)
+#else
+#define XFS_SB_VERSION_ADDATTR(sbp)	\
+	((sbp)->sb_versionnum = \
+	 (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \
+		XFS_SB_VERSION_2 : \
+		((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \
+			((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \
+			(XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT))))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASNLINK)
+int xfs_sb_version_hasnlink(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASNLINK(sbp)	xfs_sb_version_hasnlink(sbp)
+#else
+#define XFS_SB_VERSION_HASNLINK(sbp)	\
+	(((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
+	 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	  ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDNLINK)
+void xfs_sb_version_addnlink(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDNLINK(sbp)	xfs_sb_version_addnlink(sbp)
+#else
+#define XFS_SB_VERSION_ADDNLINK(sbp)	\
+	((sbp)->sb_versionnum = \
+	 ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \
+		XFS_SB_VERSION_3 : \
+		((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASQUOTA)
+int xfs_sb_version_hasquota(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASQUOTA(sbp)	xfs_sb_version_hasquota(sbp)
+#else
+#define XFS_SB_VERSION_HASQUOTA(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDQUOTA)
+void xfs_sb_version_addquota(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDQUOTA(sbp)	xfs_sb_version_addquota(sbp)
+#else
+#define XFS_SB_VERSION_ADDQUOTA(sbp)	\
+	((sbp)->sb_versionnum = \
+	 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \
+		((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \
+		(XFS_SB_VERSION_TONEW((sbp)->sb_versionnum) | \
+		 XFS_SB_VERSION_QUOTABIT)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASALIGN)
+int xfs_sb_version_hasalign(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASALIGN(sbp)	xfs_sb_version_hasalign(sbp)
+#else
+#define XFS_SB_VERSION_HASALIGN(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBALIGN)
+void xfs_sb_version_subalign(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_SUBALIGN(sbp)	xfs_sb_version_subalign(sbp)
+#else
+#define XFS_SB_VERSION_SUBALIGN(sbp)	\
+	((sbp)->sb_versionnum = \
+	 XFS_SB_VERSION_TOOLD((sbp)->sb_versionnum & ~XFS_SB_VERSION_ALIGNBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASDALIGN)
+int xfs_sb_version_hasdalign(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASDALIGN(sbp)	xfs_sb_version_hasdalign(sbp)
+#else
+#define XFS_SB_VERSION_HASDALIGN(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDDALIGN)
+int xfs_sb_version_adddalign(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDDALIGN(sbp)	xfs_sb_version_adddalign(sbp)
+#else
+#define XFS_SB_VERSION_ADDDALIGN(sbp)	\
+	((sbp)->sb_versionnum = \
+		((sbp)->sb_versionnum | XFS_SB_VERSION_DALIGNBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASSHARED)
+int xfs_sb_version_hasshared(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASSHARED(sbp)	xfs_sb_version_hasshared(sbp)
+#else
+#define XFS_SB_VERSION_HASSHARED(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDSHARED)
+int xfs_sb_version_addshared(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDSHARED(sbp)	xfs_sb_version_addshared(sbp)
+#else
+#define XFS_SB_VERSION_ADDSHARED(sbp)	\
+	((sbp)->sb_versionnum = \
+		((sbp)->sb_versionnum | XFS_SB_VERSION_SHAREDBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBSHARED)
+int xfs_sb_version_subshared(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_SUBSHARED(sbp)	xfs_sb_version_subshared(sbp)
+#else
+#define XFS_SB_VERSION_SUBSHARED(sbp)	\
+	((sbp)->sb_versionnum = \
+		((sbp)->sb_versionnum & ~XFS_SB_VERSION_SHAREDBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASDIRV2)
+int xfs_sb_version_hasdirv2(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASDIRV2(sbp)	xfs_sb_version_hasdirv2(sbp)
+#else
+#define XFS_SB_VERSION_HASDIRV2(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASLOGV2)
+int xfs_sb_version_haslogv2(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASLOGV2(sbp)   xfs_sb_version_haslogv2(sbp)
+#else
+#define XFS_SB_VERSION_HASLOGV2(sbp)   \
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASEXTFLGBIT)
+int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASEXTFLGBIT(sbp)	xfs_sb_version_hasextflgbit(sbp)
+#else
+#define XFS_SB_VERSION_HASEXTFLGBIT(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDEXTFLGBIT)
+int xfs_sb_version_addextflgbit(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp)	xfs_sb_version_addextflgbit(sbp)
+#else
+#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp)	\
+	((sbp)->sb_versionnum = \
+		((sbp)->sb_versionnum | XFS_SB_VERSION_EXTFLGBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBEXTFLGBIT)
+int xfs_sb_version_subextflgbit(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp)	xfs_sb_version_subextflgbit(sbp)
+#else
+#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp)	\
+	((sbp)->sb_versionnum = \
+		((sbp)->sb_versionnum & ~XFS_SB_VERSION_EXTFLGBIT))
+#endif
+
+/*
+ * end of superblock version macros
+ */
+
+#define XFS_SB_DADDR	((xfs_daddr_t)0)		/* daddr in filesystem/ag */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_BLOCK)
+xfs_agblock_t xfs_sb_block(struct xfs_mount *mp);
+#define XFS_SB_BLOCK(mp)	xfs_sb_block(mp)
+#else
+#define XFS_SB_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_HDR_BLOCK)
+xfs_agblock_t xfs_hdr_block(struct xfs_mount *mp, xfs_daddr_t d);
+#define XFS_HDR_BLOCK(mp,d)	xfs_hdr_block(mp,d)
+#else
+#define XFS_HDR_BLOCK(mp,d)	((xfs_agblock_t)(XFS_BB_TO_FSBT(mp,d)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_FSB)
+xfs_fsblock_t xfs_daddr_to_fsb(struct xfs_mount *mp, xfs_daddr_t d);
+#define XFS_DADDR_TO_FSB(mp,d)		xfs_daddr_to_fsb(mp,d)
+#else
+#define XFS_DADDR_TO_FSB(mp,d) \
+	XFS_AGB_TO_FSB(mp, XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_DADDR)
+xfs_daddr_t xfs_fsb_to_daddr(struct xfs_mount *mp, xfs_fsblock_t fsbno);
+#define XFS_FSB_TO_DADDR(mp,fsbno)	xfs_fsb_to_daddr(mp,fsbno)
+#else
+#define XFS_FSB_TO_DADDR(mp,fsbno) \
+	XFS_AGB_TO_DADDR(mp, XFS_FSB_TO_AGNO(mp,fsbno), \
+			 XFS_FSB_TO_AGBNO(mp,fsbno))
+#endif
+
+/*
+ * File system block to basic block conversions.
+ */
+#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSB(mp,bb)	\
+	(((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSBT(mp,bb)	((bb) >> (mp)->m_blkbb_log)
+#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1))
+
+/*
+ * File system block to byte conversions.
+ */
+#define XFS_FSB_TO_B(mp,fsbno)	((xfs_fsize_t)(fsbno) << \
+				 (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSB(mp,b)	\
+	((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSBT(mp,b)	(((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_FSB_OFFSET(mp,b)	((b) & (mp)->m_blockmask)
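+/*
+ * Worked example, assuming 4096-byte filesystem blocks over 512-byte
+ * basic blocks (so m_blkbb_log == 3 and m_blockmask == 0xfff):
+ *	XFS_FSB_TO_BB(mp, 10)	== 80
+ *	XFS_BB_TO_FSB(mp, 81)	== 11	(rounds up)
+ *	XFS_BB_TO_FSBT(mp, 81)	== 10	(truncates)
+ *	XFS_B_TO_FSB(mp, 5000)	== 2
+ *	XFS_B_TO_FSBT(mp, 5000) == 1
+ */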
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_SBP)
+xfs_sb_t *xfs_buf_to_sbp(struct xfs_buf *bp);
+#define XFS_BUF_TO_SBP(bp)	xfs_buf_to_sbp(bp)
+#else
+#define XFS_BUF_TO_SBP(bp)	((xfs_sb_t *)XFS_BUF_PTR(bp))
+#endif
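+/*
+ * Example (see xfs_trans_apply_sb_deltas() in this patch): after
+ *	bp = xfs_trans_getsb(tp, tp->t_mountp, 0);
+ * the superblock fields are accessed through
+ *	sbp = XFS_BUF_TO_SBP(bp);
+ */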
+
+#endif	/* __XFS_SB_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1268 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+STATIC void	xfs_trans_apply_sb_deltas(xfs_trans_t *);
+STATIC uint	xfs_trans_count_vecs(xfs_trans_t *);
+STATIC void	xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
+STATIC void	xfs_trans_uncommit(xfs_trans_t *, uint);
+STATIC void	xfs_trans_committed(xfs_trans_t *, int);
+STATIC void	xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
+STATIC void	xfs_trans_free(xfs_trans_t *);
+
+kmem_zone_t		*xfs_trans_zone;
+
+
+/*
+ * Initialize the precomputed transaction reservation values
+ * in the mount structure.
+ */
+void
+xfs_trans_init(
+	xfs_mount_t	*mp)
+{
+	xfs_trans_reservations_t	*resp;
+
+	resp = &(mp->m_reservations);
+	resp->tr_write =
+		(uint)(XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_itruncate =
+		(uint)(XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_rename =
+		(uint)(XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_link = (uint)XFS_CALC_LINK_LOG_RES(mp);
+	resp->tr_remove =
+		(uint)(XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_symlink =
+		(uint)(XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_create =
+		(uint)(XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_mkdir =
+		(uint)(XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_ifree =
+		(uint)(XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_ichange =
+		(uint)(XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_growdata = (uint)XFS_CALC_GROWDATA_LOG_RES(mp);
+	resp->tr_swrite = (uint)XFS_CALC_SWRITE_LOG_RES(mp);
+	resp->tr_writeid = (uint)XFS_CALC_WRITEID_LOG_RES(mp);
+	resp->tr_addafork =
+		(uint)(XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_attrinval = (uint)XFS_CALC_ATTRINVAL_LOG_RES(mp);
+	resp->tr_attrset =
+		(uint)(XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_attrrm =
+		(uint)(XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_clearagi = (uint)XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp);
+	resp->tr_growrtalloc = (uint)XFS_CALC_GROWRTALLOC_LOG_RES(mp);
+	resp->tr_growrtzero = (uint)XFS_CALC_GROWRTZERO_LOG_RES(mp);
+	resp->tr_growrtfree = (uint)XFS_CALC_GROWRTFREE_LOG_RES(mp);
+}
+
+/*
+ * This routine is called to allocate a transaction structure.
+ * The type parameter indicates the type of the transaction.  These
+ * are enumerated in xfs_trans.h.
+ *
+ * Dynamically allocate the transaction structure from the transaction
+ * zone, initialize it, and return it to the caller.
+ */
+xfs_trans_t *
+xfs_trans_alloc(
+	xfs_mount_t	*mp,
+	uint		type)
+{
+	xfs_check_frozen(mp, NULL, XFS_FREEZE_TRANS);
+	return (_xfs_trans_alloc(mp, type));
+}
+
+xfs_trans_t *
+_xfs_trans_alloc(
+	xfs_mount_t	*mp,
+	uint		type)
+{
+	xfs_trans_t	*tp;
+	ASSERT(xfs_trans_zone != NULL);
+	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+	tp->t_dqinfo = NULL;
+
+	/*
+	 * Initialize the transaction structure.
+	 */
+	tp->t_magic = XFS_TRANS_MAGIC;
+	tp->t_type = type;
+	tp->t_mountp = mp;
+	tp->t_items_free = XFS_LIC_NUM_SLOTS;
+	tp->t_busy_free = XFS_LBC_NUM_SLOTS;
+	XFS_LIC_INIT(&(tp->t_items));
+	XFS_LBC_INIT(&(tp->t_busy));
+
+	return (tp);
+}
+
+/*
+ * This is called to create a new transaction which will share the
+ * permanent log reservation of the given transaction.	The remaining
+ * unused block and rt extent reservations are also inherited.	This
+ * implies that the original transaction is no longer allowed to allocate
+ * blocks.  Locks and log items, however, are not inherited.  They must
+ * be added to the new transaction explicitly.
+ */
+xfs_trans_t *
+xfs_trans_dup(
+	xfs_trans_t	*tp)
+{
+	xfs_trans_t	*ntp;
+
+	ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+
+	/*
+	 * Initialize the new transaction structure.
+	 */
+	ntp->t_magic = XFS_TRANS_MAGIC;
+	ntp->t_type = tp->t_type;
+	ntp->t_mountp = tp->t_mountp;
+	ntp->t_items_free = XFS_LIC_NUM_SLOTS;
+	ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
+	XFS_LIC_INIT(&(ntp->t_items));
+	XFS_LBC_INIT(&(ntp->t_busy));
+
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+	ASSERT(!xlog_debug || tp->t_ticket != NULL);
+#else
+	ASSERT(tp->t_ticket != NULL);
+#endif
+	ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
+	ntp->t_ticket = tp->t_ticket;
+	ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
+	tp->t_blk_res = tp->t_blk_res_used;
+	ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
+	tp->t_rtx_res = tp->t_rtx_res_used;
+
+	/*
+	 * dup the dquot stuff too.
+	 */
+	if (tp->t_dqinfo)
+		xfs_trans_dup_dqinfo(tp, ntp);
+
+	atomic_inc(&tp->t_mountp->m_active_trans);
+	return ntp;
+}
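+/*
+ * Sketch of the intended calling pattern (hypothetical caller code;
+ * log_res/log_count stand for whatever reservation the caller holds):
+ *
+ *	ntp = xfs_trans_dup(tp);
+ *	error = xfs_trans_commit(tp, 0, NULL);
+ *	tp = ntp;
+ *	error = xfs_trans_reserve(tp, 0, log_res, 0,
+ *				  XFS_TRANS_PERM_LOG_RES, log_count);
+ *
+ * The old transaction commits its logged items while the duplicate
+ * carries the permanent log reservation forward, letting a long
+ * running operation span several transactions without giving up
+ * its log space.
+ */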
+
+/*
+ * This is called to reserve free disk blocks and log space for the
+ * given transaction.  This must be done before allocating any resources
+ * within the transaction.
+ *
+ * This will return ENOSPC if there are not enough blocks available.
+ * It will sleep waiting for available log space.
+ * The only valid value for the flags parameter is XFS_TRANS_PERM_LOG_RES,
+ * which is used by long running transactions.  If any one of the
+ * reservations fails then they will all be backed out.
+ *
+ * This does not do quota reservations. That typically is done by the
+ * caller afterwards.
+ */
+int
+xfs_trans_reserve(
+	xfs_trans_t	*tp,
+	uint		blocks,
+	uint		logspace,
+	uint		rtextents,
+	uint		flags,
+	uint		logcount)
+{
+	int		log_flags;
+	int		error;
+	int		rsvd;
+
+	error = 0;
+	rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+
+	/* Mark this thread as being in a transaction */
+	current->flags |= PF_FSTRANS;
+
+	/*
+	 * Attempt to reserve the needed disk blocks by decrementing
+	 * the number needed from the number available.	 This will
+	 * fail if the count would go below zero.
+	 */
+	if (blocks > 0) {
+		error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
+					  -blocks, rsvd);
+		if (error != 0) {
+			current->flags &= ~PF_FSTRANS;
+			return (XFS_ERROR(ENOSPC));
+		}
+		tp->t_blk_res += blocks;
+	}
+
+	/*
+	 * Reserve the log space needed for this transaction.
+	 */
+	if (logspace > 0) {
+		ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace));
+		ASSERT((tp->t_log_count == 0) ||
+			(tp->t_log_count == logcount));
+		if (flags & XFS_TRANS_PERM_LOG_RES) {
+			log_flags = XFS_LOG_PERM_RESERV;
+			tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+		} else {
+			ASSERT(tp->t_ticket == NULL);
+			ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
+			log_flags = 0;
+		}
+
+		error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
+					&tp->t_ticket,
+					XFS_TRANSACTION, log_flags);
+		if (error) {
+			goto undo_blocks;
+		}
+		tp->t_log_res = logspace;
+		tp->t_log_count = logcount;
+	}
+
+	/*
+	 * Attempt to reserve the needed realtime extents by decrementing
+	 * the number needed from the number available.	 This will
+	 * fail if the count would go below zero.
+	 */
+	if (rtextents > 0) {
+		error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
+					  -rtextents, rsvd);
+		if (error) {
+			error = XFS_ERROR(ENOSPC);
+			goto undo_log;
+		}
+		tp->t_rtx_res += rtextents;
+	}
+
+	return 0;
+
+	/*
+	 * Error cases jump to one of these labels to undo any
+	 * reservations which have already been performed.
+	 */
+undo_log:
+	if (logspace > 0) {
+		if (flags & XFS_TRANS_PERM_LOG_RES) {
+			log_flags = XFS_LOG_REL_PERM_RESERV;
+		} else {
+			log_flags = 0;
+		}
+		xfs_log_done(tp->t_mountp, tp->t_ticket, log_flags);
+		tp->t_ticket = NULL;
+		tp->t_log_res = 0;
+		tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
+	}
+
+undo_blocks:
+	if (blocks > 0) {
+		(void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
+					 blocks, rsvd);
+		tp->t_blk_res = 0;
+	}
+
+	current->flags &= ~PF_FSTRANS;
+
+	return (error);
+}
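+/*
+ * Typical transaction life cycle, as a sketch (error handling
+ * abbreviated; tr_create is one of the precomputed reservations
+ * filled in by xfs_trans_init() above):
+ *
+ *	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+ *	error = xfs_trans_reserve(tp, blocks,
+ *				  mp->m_reservations.tr_create, 0, 0, 0);
+ *	if (error) {
+ *		xfs_trans_cancel(tp, 0);
+ *		return error;
+ *	}
+ *	... join items, modify them, xfs_trans_mod_sb(), ...
+ *	error = xfs_trans_commit(tp, 0, NULL);
+ */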
+
+
+/*
+ * This is called to set a callback to be called when the given
+ * transaction is committed to disk.  The transaction pointer and the
+ * argument pointer will be passed to the callback routine.
+ *
+ * Only one callback can be associated with any single transaction.
+ */
+void
+xfs_trans_callback(
+	xfs_trans_t		*tp,
+	xfs_trans_callback_t	callback,
+	void			*arg)
+{
+	ASSERT(tp->t_callback == NULL);
+	tp->t_callback = callback;
+	tp->t_callarg = arg;
+}
+
+
+/*
+ * Record the indicated change to the given field for application
+ * to the file system's superblock when the transaction commits.
+ * For now, just store the change in the transaction structure.
+ *
+ * Mark the transaction structure to indicate that the superblock
+ * needs to be updated before committing.
+ */
+void
+xfs_trans_mod_sb(
+	xfs_trans_t	*tp,
+	uint		field,
+	long		delta)
+{
+
+	switch (field) {
+	case XFS_TRANS_SB_ICOUNT:
+		ASSERT(delta > 0);
+		tp->t_icount_delta += delta;
+		break;
+	case XFS_TRANS_SB_IFREE:
+		tp->t_ifree_delta += delta;
+		break;
+	case XFS_TRANS_SB_FDBLOCKS:
+		/*
+		 * Track the number of blocks allocated in the
+		 * transaction.	 Make sure it does not exceed the
+		 * number reserved.
+		 */
+		if (delta < 0) {
+			tp->t_blk_res_used += (uint)-delta;
+			ASSERT(tp->t_blk_res_used <= tp->t_blk_res);
+		}
+		tp->t_fdblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_RES_FDBLOCKS:
+		/*
+		 * The allocation has already been applied to the
+		 * in-core superblock's counter.  This should only
+		 * be applied to the on-disk superblock.
+		 */
+		ASSERT(delta < 0);
+		tp->t_res_fdblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_FREXTENTS:
+		/*
+		 * Track the number of realtime extents allocated in the
+		 * transaction.	 Make sure it does not exceed the
+		 * number reserved.
+		 */
+		if (delta < 0) {
+			tp->t_rtx_res_used += (uint)-delta;
+			ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res);
+		}
+		tp->t_frextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_RES_FREXTENTS:
+		/*
+		 * The allocation has already been applied to the
+		 * in-core superblock's counter.  This should only
+		 * be applied to the on-disk superblock.
+		 */
+		ASSERT(delta < 0);
+		tp->t_res_frextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_DBLOCKS:
+		ASSERT(delta > 0);
+		tp->t_dblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_AGCOUNT:
+		ASSERT(delta > 0);
+		tp->t_agcount_delta += delta;
+		break;
+	case XFS_TRANS_SB_IMAXPCT:
+		tp->t_imaxpct_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTSIZE:
+		tp->t_rextsize_delta += delta;
+		break;
+	case XFS_TRANS_SB_RBMBLOCKS:
+		tp->t_rbmblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_RBLOCKS:
+		tp->t_rblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTENTS:
+		tp->t_rextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTSLOG:
+		tp->t_rextslog_delta += delta;
+		break;
+	default:
+		ASSERT(0);
+		return;
+	}
+
+	tp->t_flags |= (XFS_TRANS_SB_DIRTY | XFS_TRANS_DIRTY);
+}
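+/*
+ * Example: allocating one data block inside a transaction is recorded
+ * with
+ *	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, -1L);
+ * The negative delta is charged against the blocks set aside by
+ * xfs_trans_reserve() (t_blk_res_used), and the accumulated
+ * t_fdblocks_delta reaches the superblock at commit time via
+ * xfs_trans_apply_sb_deltas() and xfs_trans_unreserve_and_mod_sb().
+ */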
+
+/*
+ * xfs_trans_apply_sb_deltas() is called from the commit code
+ * to bring the superblock buffer into the current transaction
+ * and modify it as requested by earlier calls to xfs_trans_mod_sb().
+ *
+ * For now we just look at each field allowed to change and change
+ * it if necessary.
+ */
+STATIC void
+xfs_trans_apply_sb_deltas(
+	xfs_trans_t	*tp)
+{
+	xfs_sb_t	*sbp;
+	xfs_buf_t	*bp;
+	int		whole = 0;
+
+	bp = xfs_trans_getsb(tp, tp->t_mountp, 0);
+	sbp = XFS_BUF_TO_SBP(bp);
+
+	/*
+	 * Check that superblock mods match the mods made to AGF counters.
+	 */
+	ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) ==
+	       (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta +
+		tp->t_ag_btree_delta));
+
+	if (tp->t_icount_delta != 0) {
+		INT_MOD(sbp->sb_icount, ARCH_CONVERT, tp->t_icount_delta);
+	}
+	if (tp->t_ifree_delta != 0) {
+		INT_MOD(sbp->sb_ifree, ARCH_CONVERT, tp->t_ifree_delta);
+	}
+
+	if (tp->t_fdblocks_delta != 0) {
+		INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_fdblocks_delta);
+	}
+	if (tp->t_res_fdblocks_delta != 0) {
+		INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_res_fdblocks_delta);
+	}
+
+	if (tp->t_frextents_delta != 0) {
+		INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_frextents_delta);
+	}
+	if (tp->t_dblocks_delta != 0) {
+		INT_MOD(sbp->sb_dblocks, ARCH_CONVERT, tp->t_dblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_agcount_delta != 0) {
+		INT_MOD(sbp->sb_agcount, ARCH_CONVERT, tp->t_agcount_delta);
+		whole = 1;
+	}
+	if (tp->t_imaxpct_delta != 0) {
+		INT_MOD(sbp->sb_imax_pct, ARCH_CONVERT, tp->t_imaxpct_delta);
+		whole = 1;
+	}
+	if (tp->t_rextsize_delta != 0) {
+		INT_MOD(sbp->sb_rextsize, ARCH_CONVERT, tp->t_rextsize_delta);
+		whole = 1;
+	}
+	if (tp->t_rbmblocks_delta != 0) {
+		INT_MOD(sbp->sb_rbmblocks, ARCH_CONVERT, tp->t_rbmblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_rblocks_delta != 0) {
+		INT_MOD(sbp->sb_rblocks, ARCH_CONVERT, tp->t_rblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_rextents_delta != 0) {
+		INT_MOD(sbp->sb_rextents, ARCH_CONVERT, tp->t_rextents_delta);
+		whole = 1;
+	}
+	if (tp->t_rextslog_delta != 0) {
+		INT_MOD(sbp->sb_rextslog, ARCH_CONVERT, tp->t_rextslog_delta);
+		whole = 1;
+	}
+
+	if (whole)
+		/*
+		 * Log the whole thing; the fields are discontiguous.
+		 */
+		xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_sb_t) - 1);
+	else
+		/*
+		 * Since all the modifiable fields are contiguous, we
+		 * can get away with this.
+		 */
+		xfs_trans_log_buf(tp, bp, offsetof(xfs_sb_t, sb_icount),
+				  offsetof(xfs_sb_t, sb_frextents) +
+				  sizeof(sbp->sb_frextents) - 1);
+
+	XFS_MTOVFS(tp->t_mountp)->vfs_super->s_dirt = 1;
+}
+
+/*
+ * xfs_trans_unreserve_and_mod_sb() is called to release unused
+ * reservations and apply superblock counter changes to the in-core
+ * superblock.
+ *
+ * This is done efficiently with a single call to xfs_mod_incore_sb_batch().
+ */
+void
+xfs_trans_unreserve_and_mod_sb(
+	xfs_trans_t	*tp)
+{
+	xfs_mod_sb_t	msb[14];	/* If you add cases, add entries */
+	xfs_mod_sb_t	*msbp;
+	/* REFERENCED */
+	int		error;
+	int		rsvd;
+
+	msbp = msb;
+	rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+
+	/*
+	 * Release any reserved blocks.	 Any that were allocated
+	 * will be taken back again by fdblocks_delta below.
+	 */
+	if (tp->t_blk_res > 0) {
+		msbp->msb_field = XFS_SBS_FDBLOCKS;
+		msbp->msb_delta = tp->t_blk_res;
+		msbp++;
+	}
+
+	/*
+	 * Release any reserved realtime extents.  Any that were
+	 * allocated will be taken back again by frextents_delta below.
+	 */
+	if (tp->t_rtx_res > 0) {
+		msbp->msb_field = XFS_SBS_FREXTENTS;
+		msbp->msb_delta = tp->t_rtx_res;
+		msbp++;
+	}
+
+	/*
+	 * Apply any superblock modifications to the in-core version.
+	 * The t_res_fdblocks_delta and t_res_frextents_delta fields are
+	 * explicitly NOT applied to the in-core superblock;
+	 * those deltas have already been applied there.
+	 */
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
+		if (tp->t_icount_delta != 0) {
+			msbp->msb_field = XFS_SBS_ICOUNT;
+			msbp->msb_delta = (int)tp->t_icount_delta;
+			msbp++;
+		}
+		if (tp->t_ifree_delta != 0) {
+			msbp->msb_field = XFS_SBS_IFREE;
+			msbp->msb_delta = (int)tp->t_ifree_delta;
+			msbp++;
+		}
+		if (tp->t_fdblocks_delta != 0) {
+			msbp->msb_field = XFS_SBS_FDBLOCKS;
+			msbp->msb_delta = (int)tp->t_fdblocks_delta;
+			msbp++;
+		}
+		if (tp->t_frextents_delta != 0) {
+			msbp->msb_field = XFS_SBS_FREXTENTS;
+			msbp->msb_delta = (int)tp->t_frextents_delta;
+			msbp++;
+		}
+		if (tp->t_dblocks_delta != 0) {
+			msbp->msb_field = XFS_SBS_DBLOCKS;
+			msbp->msb_delta = (int)tp->t_dblocks_delta;
+			msbp++;
+		}
+		if (tp->t_agcount_delta != 0) {
+			msbp->msb_field = XFS_SBS_AGCOUNT;
+			msbp->msb_delta = (int)tp->t_agcount_delta;
+			msbp++;
+		}
+		if (tp->t_imaxpct_delta != 0) {
+			msbp->msb_field = XFS_SBS_IMAX_PCT;
+			msbp->msb_delta = (int)tp->t_imaxpct_delta;
+			msbp++;
+		}
+		if (tp->t_rextsize_delta != 0) {
+			msbp->msb_field = XFS_SBS_REXTSIZE;
+			msbp->msb_delta = (int)tp->t_rextsize_delta;
+			msbp++;
+		}
+		if (tp->t_rbmblocks_delta != 0) {
+			msbp->msb_field = XFS_SBS_RBMBLOCKS;
+			msbp->msb_delta = (int)tp->t_rbmblocks_delta;
+			msbp++;
+		}
+		if (tp->t_rblocks_delta != 0) {
+			msbp->msb_field = XFS_SBS_RBLOCKS;
+			msbp->msb_delta = (int)tp->t_rblocks_delta;
+			msbp++;
+		}
+		if (tp->t_rextents_delta != 0) {
+			msbp->msb_field = XFS_SBS_REXTENTS;
+			msbp->msb_delta = (int)tp->t_rextents_delta;
+			msbp++;
+		}
+		if (tp->t_rextslog_delta != 0) {
+			msbp->msb_field = XFS_SBS_REXTSLOG;
+			msbp->msb_delta = (int)tp->t_rextslog_delta;
+			msbp++;
+		}
+	}
+
+	/*
+	 * If we need to change anything, do it.
+	 */
+	if (msbp > msb) {
+		error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
+			(uint)(msbp - msb), rsvd);
+		ASSERT(error == 0);
+	}
+}
+
+
+/*
+ * xfs_trans_commit
+ *
+ * Commit the given transaction to the log, synchronously or asynchronously.
+ *
+ * The XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism.  Logically, after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * It's illegal to reference the transaction structure after this call.
+ */
+ /*ARGSUSED*/
+int
+xfs_trans_commit(
+	xfs_trans_t	*tp,
+	uint		flags,
+	xfs_lsn_t	*commit_lsn_p)
+{
+	xfs_log_iovec_t		*log_vector;
+	int			nvec;
+	xfs_mount_t		*mp;
+	xfs_lsn_t		commit_lsn;
+	/* REFERENCED */
+	int			error;
+	int			log_flags;
+	int			sync;
+#define XFS_TRANS_LOGVEC_COUNT	16
+	xfs_log_iovec_t		log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+	static xfs_lsn_t	trans_lsn = 1;
+#endif
+	int			shutdown;
+
+	commit_lsn = -1;
+
+	/*
+	 * Determine whether this commit is releasing a permanent
+	 * log reservation or not.
+	 */
+	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+	} else {
+		log_flags = 0;
+	}
+	mp = tp->t_mountp;
+
+	/*
+	 * If there is nothing to be logged by the transaction,
+	 * then unlock all of the items associated with the
+	 * transaction and free the transaction structure.
+	 * Also make sure to return any reserved blocks to
+	 * the free pool.
+	 */
+shut_us_down:
+	shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0;
+	if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) {
+		xfs_trans_unreserve_and_mod_sb(tp);
+		/*
+		 * It is indeed possible for the transaction itself to
+		 * be clean while the dqinfo portion is dirty. All that
+		 * means is that we have some (non-persistent) quota
+		 * reservations that need to be unreserved.
+		 */
+		if (tp->t_dqinfo && (tp->t_flags & XFS_TRANS_DQ_DIRTY)) {
+			xfs_trans_unreserve_and_mod_dquots(tp);
+		}
+		if (tp->t_ticket) {
+			commit_lsn = xfs_log_done(mp, tp->t_ticket, log_flags);
+			if (commit_lsn == -1 && !shutdown)
+				shutdown = XFS_ERROR(EIO);
+		}
+		xfs_trans_free_items(tp, shutdown ? XFS_TRANS_ABORT : 0);
+		xfs_trans_free_busy(tp);
+		xfs_trans_free(tp);
+		XFS_STATS_INC(xfsstats.xs_trans_empty);
+		if (commit_lsn_p)
+			*commit_lsn_p = commit_lsn;
+		current->flags &= ~PF_FSTRANS;
+		return (shutdown);
+	}
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+	ASSERT(!xlog_debug || tp->t_ticket != NULL);
+#else
+	ASSERT(tp->t_ticket != NULL);
+#endif
+
+	/*
+	 * If we need to update the superblock, then do it now.
+	 */
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
+		xfs_trans_apply_sb_deltas(tp);
+	}
+	if (tp->t_flags & XFS_TRANS_DQ_DIRTY) {
+		xfs_trans_apply_dquot_deltas(tp);
+	}
+
+	/*
+	 * Ask each log item how many log_vector entries it will
+	 * need so we can figure out how many to allocate.
+	 * Try to avoid the kmem_alloc() call in the common case
+	 * by using a vector from the stack when it fits.
+	 */
+	nvec = xfs_trans_count_vecs(tp);
+
+	if (nvec == 0) {
+		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+		goto shut_us_down;
+	}
+
+
+	if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
+		log_vector = log_vector_fast;
+	} else {
+		log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec *
+						   sizeof(xfs_log_iovec_t),
+						   KM_SLEEP);
+	}
+
+	/*
+	 * Fill in the log_vector and pin the logged items, and
+	 * then write the transaction to the log.
+	 */
+	xfs_trans_fill_vecs(tp, log_vector);
+
+	/*
+	 * Ignore errors here. xfs_log_done would do the right thing.
+	 * We need to put the ticket, etc. away.
+	 */
+	error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket,
+			     &(tp->t_lsn));
+
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+	if (xlog_debug) {
+		commit_lsn = xfs_log_done(mp, tp->t_ticket,
+					  log_flags);
+	} else {
+		commit_lsn = 0;
+		tp->t_lsn = trans_lsn++;
+	}
+#else
+	/*
+	 * This is the regular case.  At this point (after the call finishes),
+	 * the transaction is committed incore and could go out to disk at
+	 * any time.  However, all the items associated with the transaction
+	 * are still locked and pinned in memory.
+	 */
+	commit_lsn = xfs_log_done(mp, tp->t_ticket, log_flags);
+#endif
+
+	if (nvec > XFS_TRANS_LOGVEC_COUNT) {
+		kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t));
+	}
+
+	if (commit_lsn_p)
+		*commit_lsn_p = commit_lsn;
+
+	/*
+	 * If we got a log write error, unpin the log items that we
+	 * had pinned, clean up, free the trans structure, and return the error.
+	 */
+	if (error || commit_lsn == -1) {
+		xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
+		current->flags &= ~PF_FSTRANS;
+		return XFS_ERROR(EIO);
+	}
+
+	/*
+	 * Once all the items of the transaction have been copied
+	 * to the in core log we can release them.  Do that here.
+	 * This will free descriptors pointing to items which were
+	 * not logged since there is nothing more to do with them.
+	 * For items which were logged, we will keep pointers to them
+	 * so they can be unpinned after the transaction commits to disk.
+	 * This will also stamp each modified meta-data item with
+	 * the commit lsn of this transaction for dependency tracking
+	 * purposes.
+	 */
+	xfs_trans_unlock_items(tp, commit_lsn);
+
+	/*
+	 * Once the transaction has committed, unused
+	 * reservations need to be released and changes to
+	 * the superblock need to be reflected in the in-core
+	 * version.  Do that now.
+	 */
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	sync = tp->t_flags & XFS_TRANS_SYNC;
+
+	/*
+	 * Tell the LM to call the transaction completion routine
+	 * when the log write with LSN commit_lsn completes (e.g.
+	 * when the transaction commit really hits the on-disk log).
+	 * After this call we cannot reference tp, because the call
+	 * can happen at any time and the call will free the transaction
+	 * structure pointed to by tp.	The only case where we call
+	 * the completion routine (xfs_trans_committed) directly is
+	 * if the log is turned off on a debug kernel or we're
+	 * running in simulation mode (the log is explicitly turned
+	 * off).
+	 */
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+	if (xlog_debug) {
+		tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed;
+		tp->t_logcb.cb_arg = tp;
+		xfs_log_notify(mp, commit_lsn, &(tp->t_logcb));
+	} else {
+		xfs_trans_committed(tp, 0);
+	}
+#else
+	tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed;
+	tp->t_logcb.cb_arg = tp;
+	xfs_log_notify(mp, commit_lsn, &(tp->t_logcb));
+#endif
+	/*
+	 * If the transaction needs to be synchronous, then force the
+	 * log out now and wait for it.
+	 */
+	if (sync) {
+		error = xfs_log_force(mp, commit_lsn,
+				      XFS_LOG_FORCE | XFS_LOG_SYNC);
+		XFS_STATS_INC(xfsstats.xs_trans_sync);
+	} else {
+		XFS_STATS_INC(xfsstats.xs_trans_async);
+	}
+
+	/* mark this thread as no longer being in a transaction */
+	current->flags &= ~PF_FSTRANS;
+
+	return (error);
+}
+
+
+/*
+ * Total up the number of log iovecs needed to commit this
+ * transaction.	 The transaction itself needs one for the
+ * transaction header.	Ask each dirty item in turn how many
+ * it needs to get the total.
+ */
+STATIC uint
+xfs_trans_count_vecs(
+	xfs_trans_t	*tp)
+{
+	int			nvecs;
+	xfs_log_item_desc_t	*lidp;
+
+	nvecs = 1;
+	lidp = xfs_trans_first_item(tp);
+	ASSERT(lidp != NULL);
+
+	/* In the non-debug case we need to bail out if we
+	 * didn't find a log item here: return zero and let trans_commit
+	 * deal with it.
+	 */
+	if (lidp == NULL)
+		return 0;
+
+	while (lidp != NULL) {
+		/*
+		 * Skip items which aren't dirty in this transaction.
+		 */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+			lidp = xfs_trans_next_item(tp, lidp);
+			continue;
+		}
+		lidp->lid_size = IOP_SIZE(lidp->lid_item);
+		nvecs += lidp->lid_size;
+		lidp = xfs_trans_next_item(tp, lidp);
+	}
+
+	return nvecs;
+}
+
+/*
+ * Called from the trans_commit code when we notice that
+ * the filesystem is in the middle of a forced shutdown.
+ */
+STATIC void
+xfs_trans_uncommit(
+	xfs_trans_t	*tp,
+	uint		flags)
+{
+	xfs_log_item_desc_t	*lidp;
+
+	for (lidp = xfs_trans_first_item(tp);
+	     lidp != NULL;
+	     lidp = xfs_trans_next_item(tp, lidp)) {
+		/*
+		 * Unpin only those items that are dirty.
+		 */
+		if (lidp->lid_flags & XFS_LID_DIRTY)
+			IOP_UNPIN_REMOVE(lidp->lid_item, tp);
+	}
+
+	xfs_trans_unreserve_and_mod_sb(tp);
+	if (tp->t_dqinfo && (tp->t_flags & XFS_TRANS_DQ_DIRTY)) {
+		xfs_trans_unreserve_and_mod_dquots(tp);
+	}
+
+	xfs_trans_free_items(tp, flags);
+	xfs_trans_free_busy(tp);
+	xfs_trans_free(tp);
+}
+
+/*
+ * Fill in the vector with pointers to data to be logged
+ * by this transaction.	 The transaction header takes
+ * the first vector, and then each dirty item takes the
+ * number of vectors it indicated it needed in xfs_trans_count_vecs().
+ *
+ * As each item fills in the entries it needs, also pin the item
+ * so that it cannot be flushed out until the log write completes.
+ */
+STATIC void
+xfs_trans_fill_vecs(
+	xfs_trans_t		*tp,
+	xfs_log_iovec_t		*log_vector)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_iovec_t		*vecp;
+	uint			nitems;
+
+	/*
+	 * Skip over the entry for the transaction header, we'll
+	 * fill that in at the end.
+	 */
+	vecp = log_vector + 1;		/* pointer arithmetic */
+
+	nitems = 0;
+	lidp = xfs_trans_first_item(tp);
+	ASSERT(lidp != NULL);
+	while (lidp != NULL) {
+		/*
+		 * Skip items which aren't dirty in this transaction.
+		 */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+			lidp = xfs_trans_next_item(tp, lidp);
+			continue;
+		}
+		/*
+		 * The item may be marked dirty and yet log nothing.
+		 * This lets an item arrange to be called back when
+		 * the transaction is committed.
+		 */
+		if (lidp->lid_size) {
+			nitems++;
+		}
+		IOP_FORMAT(lidp->lid_item, vecp);
+		vecp += lidp->lid_size;		/* pointer arithmetic */
+		IOP_PIN(lidp->lid_item);
+		lidp = xfs_trans_next_item(tp, lidp);
+	}
+
+	/*
+	 * Now that we've counted the number of items in this
+	 * transaction, fill in the transaction header.
+	 */
+	tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
+	tp->t_header.th_type = tp->t_type;
+	tp->t_header.th_num_items = nitems;
+	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
+	log_vector->i_len = sizeof(xfs_trans_header_t);
+}
+
+
+/*
+ * Unlock all of the transaction's items and free the transaction.
+ * The transaction must not have modified any of its items, because
+ * there is no way to restore them to their previous state.
+ *
+ * If the transaction has made a log reservation, make sure to release
+ * it as well.
+ */
+void
+xfs_trans_cancel(
+	xfs_trans_t		*tp,
+	int			flags)
+{
+	int			log_flags;
+#ifdef DEBUG
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_item_t		*lip;
+	int			i;
+#endif
+
+	/*
+	 * See if the caller is being too lazy to figure out if
+	 * the transaction really needs an abort.
+	 */
+	if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
+		flags &= ~XFS_TRANS_ABORT;
+	/*
+	 * See if the caller is relying on us to shut down the
+	 * filesystem.	This happens in paths where we detect
+	 * corruption and decide to give up.
+	 */
+	if ((tp->t_flags & XFS_TRANS_DIRTY) &&
+	    !XFS_FORCED_SHUTDOWN(tp->t_mountp))
+		xfs_force_shutdown(tp->t_mountp, XFS_CORRUPT_INCORE);
+#ifdef DEBUG
+	if (!(flags & XFS_TRANS_ABORT)) {
+		licp = &(tp->t_items);
+		while (licp != NULL) {
+			lidp = licp->lic_descs;
+			for (i = 0; i < licp->lic_unused; i++, lidp++) {
+				if (XFS_LIC_ISFREE(licp, i)) {
+					continue;
+				}
+
+				lip = lidp->lid_item;
+				if (!XFS_FORCED_SHUTDOWN(tp->t_mountp))
+					ASSERT(!(lip->li_type == XFS_LI_EFD));
+			}
+			licp = licp->lic_next;
+		}
+	}
+#endif
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	if (tp->t_dqinfo && (tp->t_flags & XFS_TRANS_DQ_DIRTY))
+		xfs_trans_unreserve_and_mod_dquots(tp);
+
+	if (tp->t_ticket) {
+		if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+			ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+			log_flags = XFS_LOG_REL_PERM_RESERV;
+		} else {
+			log_flags = 0;
+		}
+		xfs_log_done(tp->t_mountp, tp->t_ticket, log_flags);
+	}
+	xfs_trans_free_items(tp, flags);
+	xfs_trans_free_busy(tp);
+	xfs_trans_free(tp);
+
+	/* mark this thread as no longer being in a transaction */
+	current->flags &= ~PF_FSTRANS;
+}
+
+
+/*
+ * Free the transaction structure.  If there is more clean up
+ * to do when the structure is freed, add it here.
+ */
+STATIC void
+xfs_trans_free(
+	xfs_trans_t	*tp)
+{
+	atomic_dec(&tp->t_mountp->m_active_trans);
+	if (tp->t_dqinfo)
+		xfs_trans_free_dqinfo(tp);
+	kmem_zone_free(xfs_trans_zone, tp);
+}
+
+
+/*
+ * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
+ *
+ * This is typically called by the LM when a transaction has been fully
+ * committed to disk.  It needs to unpin the items which have
+ * been logged by the transaction and update their positions
+ * in the AIL if necessary.
+ * This also gets called when the transaction didn't get written out
+ * because of an I/O error; abortflag is set then, and XFS_LI_ABORTED
+ * gets set on each logged item.
+ *
+ * Call xfs_trans_chunk_committed() to process the items in
+ * each chunk.
+ */
+STATIC void
+xfs_trans_committed(
+	xfs_trans_t	*tp,
+	int		abortflag)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_chunk_t	*next_licp;
+	xfs_log_busy_chunk_t	*lbcp;
+	xfs_log_busy_slot_t	*lbsp;
+	int			i;
+
+	/*
+	 * Call the transaction's completion callback if there
+	 * is one.
+	 */
+	if (tp->t_callback != NULL) {
+		tp->t_callback(tp, tp->t_callarg);
+	}
+
+	/*
+	 * Special case the chunk embedded in the transaction.
+	 */
+	licp = &(tp->t_items);
+	if (!(XFS_LIC_ARE_ALL_FREE(licp))) {
+		xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
+	}
+
+	/*
+	 * Process the items in each chunk in turn.
+	 */
+	licp = licp->lic_next;
+	while (licp != NULL) {
+		ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+		xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
+		next_licp = licp->lic_next;
+		kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+		licp = next_licp;
+	}
+
+	/*
+	 * Clear all the per-AG busy list items listed in this transaction
+	 */
+	lbcp = &tp->t_busy;
+	while (lbcp != NULL) {
+		for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
+			if (!XFS_LBC_ISFREE(lbcp, i)) {
+				xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
+						     lbsp->lbc_idx);
+			}
+		}
+		lbcp = lbcp->lbc_next;
+	}
+	xfs_trans_free_busy(tp);
+
+	/*
+	 * That's it for the transaction structure.  Free it.
+	 */
+	xfs_trans_free(tp);
+}
+
+/*
+ * This is called to perform the commit processing for each
+ * item described by the given chunk.
+ *
+ * The commit processing consists of unlocking items which were
+ * held locked with the SYNC_UNLOCK attribute, calling the committed
+ * routine of each logged item, updating the item's position in the AIL
+ * if necessary, and unpinning each item.  If the committed routine
+ * returns -1, then do nothing further with the item because it
+ * may have been freed.
+ *
+ * Since items are unlocked when they are copied to the incore
+ * log, it is possible for two transactions to be completing
+ * and manipulating the same item simultaneously.  The AIL lock
+ * will protect the lsn field of each item.  The value of this
+ * field can never go backwards.
+ *
+ * We unpin the items after repositioning them in the AIL, because
+ * otherwise they could be immediately flushed and we'd have to race
+ * with the flusher trying to pull the item from the AIL as we add it.
+ */
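+/*
+ * Concrete instance of the race described above: an item last logged
+ * at lsn 100 is relogged by two transactions whose commits complete
+ * out of order, yielding item_lsn 105 first and 103 second.  The
+ * first completion moves the item to 105; the second fails the
+ * XFS_LSN_CMP test below and leaves it alone, so the item never
+ * moves backwards in the AIL.
+ */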
+STATIC void
+xfs_trans_chunk_committed(
+	xfs_log_item_chunk_t	*licp,
+	xfs_lsn_t		lsn,
+	int			aborted)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_item_t		*lip;
+	xfs_lsn_t		item_lsn;
+	struct xfs_mount	*mp;
+	int			i;
+	SPLDECL(s);
+
+	lidp = licp->lic_descs;
+	for (i = 0; i < licp->lic_unused; i++, lidp++) {
+		if (XFS_LIC_ISFREE(licp, i)) {
+			continue;
+		}
+
+		lip = lidp->lid_item;
+		if (aborted)
+			lip->li_flags |= XFS_LI_ABORTED;
+
+		if (lidp->lid_flags & XFS_LID_SYNC_UNLOCK) {
+			IOP_UNLOCK(lip);
+		}
+
+		/*
+		 * Send in the ABORTED flag to the COMMITTED routine
+		 * so that it knows whether the transaction was aborted
+		 * or not.
+		 */
+		item_lsn = IOP_COMMITTED(lip, lsn);
+
+		/*
+		 * If the committed routine returns -1, make
+		 * no more references to the item.
+		 */
+		if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
+			continue;
+		}
+
+		/*
+		 * If the returned lsn is greater than what it
+		 * contained before, update the location of the
+		 * item in the AIL.  If it is not, then do nothing.
+		 * Items can never move backwards in the AIL.
+		 *
+		 * While the new lsn should usually be greater, it
+		 * is possible that a later transaction completing
+		 * simultaneously with an earlier one using the
+		 * same item could complete first with a higher lsn.
+		 * This would cause the earlier transaction to fail
+		 * the test below.
+		 */
+		mp = lip->li_mountp;
+		AIL_LOCK(mp,s);
+		if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
+			/*
+			 * This will set the item's lsn to item_lsn
+			 * and update the position of the item in
+			 * the AIL.
+			 *
+			 * xfs_trans_update_ail() drops the AIL lock.
+			 */
+			xfs_trans_update_ail(mp, lip, item_lsn, s);
+		} else {
+			AIL_UNLOCK(mp, s);
+		}
+
+		/*
+		 * Now that we've repositioned the item in the AIL,
+		 * unpin it so it can be flushed.
+		 */
+		IOP_UNPIN(lip);
+	}
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1034 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_TRANS_H__
+#define __XFS_TRANS_H__
+
+/*
+ * This is the structure written in the log at the head of
+ * every transaction. It identifies the type and id of the
+ * transaction, and contains the number of items logged by
+ * the transaction so we know how many to expect during recovery.
+ *
+ * Do not change the below structure without redoing the code in
+ * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
+ */
+typedef struct xfs_trans_header {
+	uint		th_magic;		/* magic number */
+	uint		th_type;		/* transaction type */
+	__int32_t	th_tid;			/* transaction id (unused) */
+	uint		th_num_items;		/* num items logged by trans */
+} xfs_trans_header_t;
+
+#define XFS_TRANS_HEADER_MAGIC	0x5452414e	/* TRAN */
+
+/*
+ * Log item types.
+ */
+#define XFS_LI_5_3_BUF		0x1234	/* v1 bufs, 1-block inode buffers */
+#define XFS_LI_5_3_INODE	0x1235	/* 1-block inode buffers */
+#define XFS_LI_EFI		0x1236
+#define XFS_LI_EFD		0x1237
+#define XFS_LI_IUNLINK		0x1238
+#define XFS_LI_6_1_INODE	0x1239	/* 4K non-aligned inode bufs */
+#define XFS_LI_6_1_BUF		0x123a	/* v1, 4K inode buffers */
+#define XFS_LI_INODE		0x123b	/* aligned ino chunks, var-size ibufs */
+#define XFS_LI_BUF		0x123c	/* v2 bufs, variable sized inode bufs */
+#define XFS_LI_DQUOT		0x123d
+#define XFS_LI_QUOTAOFF		0x123e
+#define XFS_LI_RPC		0x123f	/* CXFS RPC return info */
+
+/*
+ * Transaction types.  Used to distinguish types of buffers.
+ */
+#define XFS_TRANS_SETATTR_NOT_SIZE	1
+#define XFS_TRANS_SETATTR_SIZE		2
+#define XFS_TRANS_INACTIVE		3
+#define XFS_TRANS_CREATE		4
+#define XFS_TRANS_CREATE_TRUNC		5
+#define XFS_TRANS_TRUNCATE_FILE		6
+#define XFS_TRANS_REMOVE		7
+#define XFS_TRANS_LINK			8
+#define XFS_TRANS_RENAME		9
+#define XFS_TRANS_MKDIR			10
+#define XFS_TRANS_RMDIR			11
+#define XFS_TRANS_SYMLINK		12
+#define XFS_TRANS_SET_DMATTRS		13
+#define XFS_TRANS_GROWFS		14
+#define XFS_TRANS_STRAT_WRITE		15
+#define XFS_TRANS_DIOSTRAT		16
+#define XFS_TRANS_WRITE_SYNC		17
+#define XFS_TRANS_WRITEID		18
+#define XFS_TRANS_ADDAFORK		19
+#define XFS_TRANS_ATTRINVAL		20
+#define XFS_TRANS_ATRUNCATE		21
+#define XFS_TRANS_ATTR_SET		22
+#define XFS_TRANS_ATTR_RM		23
+#define XFS_TRANS_ATTR_FLAG		24
+#define XFS_TRANS_CLEAR_AGI_BUCKET	25
+#define XFS_TRANS_QM_SBCHANGE		26
+/*
+ * Dummy entries since we use the transaction type to index into the
+ * trans_type[] in xlog_recover_print_trans_head()
+ */
+#define XFS_TRANS_DUMMY1		27
+#define XFS_TRANS_DUMMY2		28
+#define XFS_TRANS_QM_QUOTAOFF		29
+#define XFS_TRANS_QM_DQALLOC		30
+#define XFS_TRANS_QM_SETQLIM		31
+#define XFS_TRANS_QM_DQCLUSTER		32
+#define XFS_TRANS_QM_QINOCREATE		33
+#define XFS_TRANS_QM_QUOTAOFF_END	34
+#define XFS_TRANS_SB_UNIT		35
+#define XFS_TRANS_FSYNC_TS		36
+#define XFS_TRANS_GROWFSRT_ALLOC	37
+#define XFS_TRANS_GROWFSRT_ZERO		38
+#define XFS_TRANS_GROWFSRT_FREE		39
+#define XFS_TRANS_SWAPEXT		40
+/* new transaction types need to be reflected in xfs_logprint(8) */
+
+
+#ifdef __KERNEL__
+struct xfs_buf;
+struct xfs_buftarg;
+struct xfs_efd_log_item;
+struct xfs_efi_log_item;
+struct xfs_inode;
+struct xfs_item_ops;
+struct xfs_log_iovec;
+struct xfs_log_item;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot_acct;
+
+typedef struct xfs_ail_entry {
+	struct xfs_log_item	*ail_forw;	/* AIL forw pointer */
+	struct xfs_log_item	*ail_back;	/* AIL back pointer */
+} xfs_ail_entry_t;
+
+/*
+ * This structure is passed as a parameter to xfs_trans_push_ail()
+ * and is used to track the LSN that the waiting processes are
+ * waiting on to become unused.
+ */
+typedef struct xfs_ail_ticket {
+	xfs_lsn_t		at_lsn;		/* lsn waiting for */
+	struct xfs_ail_ticket	*at_forw;	/* wait list ptr */
+	struct xfs_ail_ticket	*at_back;	/* wait list ptr */
+	sv_t			at_sema;	/* wait sema */
+} xfs_ail_ticket_t;
+
+
+typedef struct xfs_log_item {
+	xfs_ail_entry_t			li_ail;		/* AIL pointers */
+	xfs_lsn_t			li_lsn;		/* last on-disk lsn */
+	struct xfs_log_item_desc	*li_desc;	/* ptr to current desc*/
+	struct xfs_mount		*li_mountp;	/* ptr to fs mount */
+	uint				li_type;	/* item type */
+	uint				li_flags;	/* misc flags */
+	struct xfs_log_item		*li_bio_list;	/* buffer item list */
+	void				(*li_cb)(struct xfs_buf *,
+						 struct xfs_log_item *);
+							/* buffer item iodone */
+							/* callback func */
+	struct xfs_item_ops		*li_ops;	/* function list */
+} xfs_log_item_t;
+
+#define XFS_LI_IN_AIL	0x1
+#define XFS_LI_ABORTED	0x2
+
+typedef struct xfs_item_ops {
+	uint (*iop_size)(xfs_log_item_t *);
+	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+	void (*iop_pin)(xfs_log_item_t *);
+	void (*iop_unpin)(xfs_log_item_t *);
+	void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
+	uint (*iop_trylock)(xfs_log_item_t *);
+	void (*iop_unlock)(xfs_log_item_t *);
+	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
+	void (*iop_push)(xfs_log_item_t *);
+	void (*iop_abort)(xfs_log_item_t *);
+	void (*iop_pushbuf)(xfs_log_item_t *);
+	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
+} xfs_item_ops_t;
+
+#define IOP_SIZE(ip)		(*(ip)->li_ops->iop_size)(ip)
+#define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
+#define IOP_PIN(ip)		(*(ip)->li_ops->iop_pin)(ip)
+#define IOP_UNPIN(ip)		(*(ip)->li_ops->iop_unpin)(ip)
+#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
+#define IOP_TRYLOCK(ip)		(*(ip)->li_ops->iop_trylock)(ip)
+#define IOP_UNLOCK(ip)		(*(ip)->li_ops->iop_unlock)(ip)
+#define IOP_COMMITTED(ip, lsn)	(*(ip)->li_ops->iop_committed)(ip, lsn)
+#define IOP_PUSH(ip)		(*(ip)->li_ops->iop_push)(ip)
+#define IOP_ABORT(ip)		(*(ip)->li_ops->iop_abort)(ip)
+#define IOP_PUSHBUF(ip)		(*(ip)->li_ops->iop_pushbuf)(ip)
+#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
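+
+/*
+ * Each log item type supplies one such vector, and every item of that
+ * type points at it through li_ops.  A minimal sketch, following the
+ * naming used by the buffer item code (illustrative only):
+ *
+ *	STATIC struct xfs_item_ops xfs_buf_item_ops = {
+ *		xfs_buf_item_size,	/* iop_size */
+ *		xfs_buf_item_format,	/* iop_format */
+ *		...
+ *	};
+ *	bip->bli_item.li_ops = &xfs_buf_item_ops;
+ */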
+
+/*
+ * Return values for the IOP_TRYLOCK() routines.
+ */
+#define XFS_ITEM_SUCCESS	0
+#define XFS_ITEM_PINNED		1
+#define XFS_ITEM_LOCKED		2
+#define XFS_ITEM_FLUSHING	3
+#define XFS_ITEM_PUSHBUF	4
+
+#endif	/* __KERNEL__ */
+
+/*
+ * This structure is used to track log items associated with
+ * a transaction.  It points to the log item and keeps some
+ * flags to track the state of the log item.  It also tracks
+ * the amount of space needed to log the item it describes
+ * once we get to commit processing (see xfs_trans_commit()).
+ */
+typedef struct xfs_log_item_desc {
+	xfs_log_item_t	*lid_item;
+	ushort		lid_size;
+	unsigned char	lid_flags;
+	unsigned char	lid_index;
+} xfs_log_item_desc_t;
+
+#define XFS_LID_DIRTY		0x1
+#define XFS_LID_PINNED		0x2
+#define XFS_LID_SYNC_UNLOCK	0x4
+
+/*
+ * This structure is used to maintain a chunk list of log_item_desc
+ * structures. The free field is a bitmask indicating which descriptors
+ * in this chunk's array are free.  The unused field is the index of
+ * the first slot that has never been used since this chunk was
+ * allocated.
+ */
+#define XFS_LIC_NUM_SLOTS	15
+typedef struct xfs_log_item_chunk {
+	struct xfs_log_item_chunk	*lic_next;
+	ushort				lic_free;
+	ushort				lic_unused;
+	xfs_log_item_desc_t		lic_descs[XFS_LIC_NUM_SLOTS];
+} xfs_log_item_chunk_t;
+
+#define XFS_LIC_MAX_SLOT	(XFS_LIC_NUM_SLOTS - 1)
+#define XFS_LIC_FREEMASK	((1 << XFS_LIC_NUM_SLOTS) - 1)
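+
+/*
+ * Example (sketch): with 15 slots, XFS_LIC_FREEMASK is 0x7fff.  A
+ * freshly initialized chunk has every free bit set; claiming slot 3
+ * clears bit 3 and releasing the slot sets it again, using the
+ * accessor macros below:
+ *
+ *	XFS_LIC_INIT(cp);		cp->lic_free == 0x7fff
+ *	XFS_LIC_CLAIM(cp, 3);		cp->lic_free == 0x7ff7
+ *	XFS_LIC_RELSE(cp, 3);		cp->lic_free == 0x7fff
+ */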
+
+
+/*
+ * Initialize the given chunk.	Set the chunk's free descriptor mask
+ * to indicate that all descriptors are free.  The caller gets to set
+ * lic_unused to the right value (0 matches all free).	The
+ * lic_descs.lid_index values are set up as each desc is allocated.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_INIT)
+void xfs_lic_init(xfs_log_item_chunk_t *cp);
+#define XFS_LIC_INIT(cp)	xfs_lic_init(cp)
+#else
+#define XFS_LIC_INIT(cp)	((cp)->lic_free = XFS_LIC_FREEMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_INIT_SLOT)
+void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot);
+#define XFS_LIC_INIT_SLOT(cp,slot)	xfs_lic_init_slot(cp, slot)
+#else
+#define XFS_LIC_INIT_SLOT(cp,slot)	\
+	((cp)->lic_descs[slot].lid_index = (unsigned char)(slot))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_VACANCY)
+int xfs_lic_vacancy(xfs_log_item_chunk_t *cp);
+#define XFS_LIC_VACANCY(cp)		xfs_lic_vacancy(cp)
+#else
+#define XFS_LIC_VACANCY(cp)		(((cp)->lic_free) & XFS_LIC_FREEMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ALL_FREE)
+void xfs_lic_all_free(xfs_log_item_chunk_t *cp);
+#define XFS_LIC_ALL_FREE(cp)		xfs_lic_all_free(cp)
+#else
+#define XFS_LIC_ALL_FREE(cp)		((cp)->lic_free = XFS_LIC_FREEMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ARE_ALL_FREE)
+int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp);
+#define XFS_LIC_ARE_ALL_FREE(cp)	xfs_lic_are_all_free(cp)
+#else
+#define XFS_LIC_ARE_ALL_FREE(cp)	(((cp)->lic_free & XFS_LIC_FREEMASK) ==\
+					XFS_LIC_FREEMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ISFREE)
+int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot);
+#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot)
+#else
+#define XFS_LIC_ISFREE(cp,slot) ((cp)->lic_free & (1 << (slot)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_CLAIM)
+void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot);
+#define XFS_LIC_CLAIM(cp,slot)		xfs_lic_claim(cp,slot)
+#else
+#define XFS_LIC_CLAIM(cp,slot)		((cp)->lic_free &= ~(1 << (slot)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_RELSE)
+void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot);
+#define XFS_LIC_RELSE(cp,slot)		xfs_lic_relse(cp,slot)
+#else
+#define XFS_LIC_RELSE(cp,slot)		((cp)->lic_free |= 1 << (slot))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_SLOT)
+xfs_log_item_desc_t *xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot);
+#define XFS_LIC_SLOT(cp,slot)		xfs_lic_slot(cp,slot)
+#else
+#define XFS_LIC_SLOT(cp,slot)		(&((cp)->lic_descs[slot]))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_DESC_TO_SLOT)
+int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp);
+#define XFS_LIC_DESC_TO_SLOT(dp)	xfs_lic_desc_to_slot(dp)
+#else
+#define XFS_LIC_DESC_TO_SLOT(dp)	((uint)((dp)->lid_index))
+#endif
+/*
+ * Calculate the address of a chunk given a descriptor pointer:
+ * dp - dp->lid_index give the address of the start of the lic_descs array.
+ * From this we subtract the offset of the lic_descs field in a chunk.
+ * All of this yields the address of the chunk, which is
+ * cast to a chunk pointer.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_DESC_TO_CHUNK)
+xfs_log_item_chunk_t *xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp);
+#define XFS_LIC_DESC_TO_CHUNK(dp)	xfs_lic_desc_to_chunk(dp)
+#else
+#define XFS_LIC_DESC_TO_CHUNK(dp)	((xfs_log_item_chunk_t*) \
+					(((xfs_caddr_t)((dp) - (dp)->lid_index)) -\
+					(xfs_caddr_t)(((xfs_log_item_chunk_t*) \
+					0)->lic_descs)))
+#endif
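+
+/*
+ * A worked example (sketch): for a descriptor sitting in slot 2,
+ * dp - dp->lid_index backs up two descriptors to &lic_descs[0], and
+ * subtracting the offset of lic_descs within the chunk (an
+ * offsetof(xfs_log_item_chunk_t, lic_descs), computed via the null
+ * pointer cast above) then backs up over the lic_next, lic_free and
+ * lic_unused fields to the start of the chunk itself.
+ */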
+
+#ifdef __KERNEL__
+/*
+ * This structure is used to maintain a list of block ranges that have been
+ * freed in the transaction.  The ranges are listed in the perag[] busy list
+ * between when they're freed and the transaction is committed to disk.
+ */
+
+typedef struct xfs_log_busy_slot {
+	xfs_agnumber_t		lbc_ag;
+	ushort			lbc_idx;	/* index in perag.busy[] */
+} xfs_log_busy_slot_t;
+
+#define XFS_LBC_NUM_SLOTS	31
+typedef struct xfs_log_busy_chunk {
+	struct xfs_log_busy_chunk	*lbc_next;
+	uint				lbc_free;	/* bitmask of free slots */
+	ushort				lbc_unused;	/* first unused */
+	xfs_log_busy_slot_t		lbc_busy[XFS_LBC_NUM_SLOTS];
+} xfs_log_busy_chunk_t;
+
+#define XFS_LBC_MAX_SLOT	(XFS_LBC_NUM_SLOTS - 1)
+#define XFS_LBC_FREEMASK	((1U << XFS_LBC_NUM_SLOTS) - 1)
+
+#define XFS_LBC_INIT(cp)	((cp)->lbc_free = XFS_LBC_FREEMASK)
+#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
+#define XFS_LBC_SLOT(cp, slot)	(&((cp)->lbc_busy[(slot)]))
+#define XFS_LBC_VACANCY(cp)	(((cp)->lbc_free) & XFS_LBC_FREEMASK)
+#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
+
+/*
+ * This is the type of function which can be given to xfs_trans_callback()
+ * to be called upon the transaction's commit to disk.
+ */
+typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
+
+/*
+ * This is the structure maintained for every active transaction.
+ */
+typedef struct xfs_trans {
+	unsigned int		t_magic;	/* magic number */
+	xfs_log_callback_t	t_logcb;	/* log callback struct */
+	struct xfs_trans	*t_forw;	/* async list pointers */
+	struct xfs_trans	*t_back;	/* async list pointers */
+	unsigned int		t_type;		/* transaction type */
+	unsigned int		t_log_res;	/* amt of log space resvd */
+	unsigned int		t_log_count;	/* count for perm log res */
+	unsigned int		t_blk_res;	/* # of blocks resvd */
+	unsigned int		t_blk_res_used; /* # of resvd blocks used */
+	unsigned int		t_rtx_res;	/* # of rt extents resvd */
+	unsigned int		t_rtx_res_used; /* # of resvd rt extents used */
+	xfs_log_ticket_t	t_ticket;	/* log mgr ticket */
+	sema_t			t_sema;		/* sema for commit completion */
+	xfs_lsn_t		t_lsn;		/* log seq num of trans commit*/
+	struct xfs_mount	*t_mountp;	/* ptr to fs mount struct */
+	struct xfs_dquot_acct	*t_dqinfo;	/* accting info for dquots */
+	xfs_trans_callback_t	t_callback;	/* transaction callback */
+	void			*t_callarg;	/* callback arg */
+	unsigned int		t_flags;	/* misc flags */
+	long			t_icount_delta; /* superblock icount change */
+	long			t_ifree_delta;	/* superblock ifree change */
+	long			t_fdblocks_delta; /* superblock fdblocks chg */
+	long			t_res_fdblocks_delta; /* on-disk only chg */
+	long			t_frextents_delta;/* superblock freextents chg*/
+	long			t_res_frextents_delta; /* on-disk only chg */
+	long			t_ag_freeblks_delta; /* debugging counter */
+	long			t_ag_flist_delta; /* debugging counter */
+	long			t_ag_btree_delta; /* debugging counter */
+	long			t_dblocks_delta;/* superblock dblocks change */
+	long			t_agcount_delta;/* superblock agcount change */
+	long			t_imaxpct_delta;/* superblock imaxpct change */
+	long			t_rextsize_delta;/* superblock rextsize chg */
+	long			t_rbmblocks_delta;/* superblock rbmblocks chg */
+	long			t_rblocks_delta;/* superblock rblocks change */
+	long			t_rextents_delta;/* superblocks rextents chg */
+	long			t_rextslog_delta;/* superblocks rextslog chg */
+	unsigned int		t_items_free;	/* log item descs free */
+	xfs_log_item_chunk_t	t_items;	/* first log item desc chunk */
+	xfs_trans_header_t	t_header;	/* header for in-log trans */
+	unsigned int		t_busy_free;	/* busy descs free */
+	xfs_log_busy_chunk_t	t_busy;		/* busy/async free blocks */
+} xfs_trans_t;
+
+#endif	/* __KERNEL__ */
+
+
+#define XFS_TRANS_MAGIC		0x5452414E	/* 'TRAN' */
+/*
+ * Values for t_flags.
+ */
+#define XFS_TRANS_DIRTY		0x01	/* something needs to be logged */
+#define XFS_TRANS_SB_DIRTY	0x02	/* superblock is modified */
+#define XFS_TRANS_PERM_LOG_RES	0x04	/* xact took a permanent log res */
+#define XFS_TRANS_SYNC		0x08	/* make commit synchronous */
+#define XFS_TRANS_DQ_DIRTY	0x10	/* at least one dquot in trx dirty */
+#define XFS_TRANS_RESERVE	0x20	/* OK to use reserved data blocks */
+
+/*
+ * Values for call flags parameter.
+ */
+#define XFS_TRANS_NOSLEEP		0x1
+#define XFS_TRANS_WAIT			0x2
+#define XFS_TRANS_RELEASE_LOG_RES	0x4
+#define XFS_TRANS_ABORT			0x8
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define XFS_TRANS_SB_ICOUNT		0x00000001
+#define XFS_TRANS_SB_IFREE		0x00000002
+#define XFS_TRANS_SB_FDBLOCKS		0x00000004
+#define XFS_TRANS_SB_RES_FDBLOCKS	0x00000008
+#define XFS_TRANS_SB_FREXTENTS		0x00000010
+#define XFS_TRANS_SB_RES_FREXTENTS	0x00000020
+#define XFS_TRANS_SB_DBLOCKS		0x00000040
+#define XFS_TRANS_SB_AGCOUNT		0x00000080
+#define XFS_TRANS_SB_IMAXPCT		0x00000100
+#define XFS_TRANS_SB_REXTSIZE		0x00000200
+#define XFS_TRANS_SB_RBMBLOCKS		0x00000400
+#define XFS_TRANS_SB_RBLOCKS		0x00000800
+#define XFS_TRANS_SB_REXTENTS		0x00001000
+#define XFS_TRANS_SB_REXTSLOG		0x00002000
+
+
+/*
+ * Various log reservation values.
+ * These are based on the size of the file system block
+ * because that is what most transactions manipulate.
+ * Each adds in an additional 128 bytes per item logged to
+ * try to account for the overhead of the transaction mechanism.
+ *
+ * Note:
+ * Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish()
+ * call.  This is because the number in the worst case is quite high
+ * and quite unusual.  In order to fix this we need to change
+ * xfs_bmap_finish() to free extents in only a single AG at a time.
+ * This will require changes to the EFI code as well, however, so that
+ * the EFI for the extents not freed is logged again in each transaction.
+ * See bug 261917.
+ */
+
+/*
+ * Per-extent log reservation for the allocation btree changes
+ * involved in freeing or allocating an extent.
+ * 2 trees * (2 blocks/level * max depth - 1) * block size
+ */
+#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
+	((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
+	((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
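+
+/*
+ * Worked example (sketch): assuming a 4K block size and
+ * XFS_AG_MAXLEVELS(mp) == 5, one extent (nx == 1) covers
+ * 2 trees * (2 * 5 - 1) == 18 blocks, so
+ * XFS_ALLOCFREE_LOG_RES(mp, 1) == 18 * 4096 == 73728 bytes and
+ * XFS_ALLOCFREE_LOG_COUNT(mp, 1) == 18 logged items.
+ */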
+
+/*
+ * Per-directory log reservation for any directory change.
+ * dir blocks: (1 btree block per level + data block + free block) * dblock size
+ * bmap btree: (levels + 2) * max depth * block size
+ * v2 directory blocks can be fragmented below the dirblksize down to the fsb
+ * size, so account for that in the DAENTER macros.
+ */
+#define XFS_DIROP_LOG_RES(mp)	\
+	(XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
+	 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
+#define XFS_DIROP_LOG_COUNT(mp) \
+	(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
+	 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_WRITE_LOG_RES(mp) \
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
+	  (2 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
+	  (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
+	 ((2 * (mp)->m_sb.sb_sectsize) + \
+	  (2 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
+	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
+
+#define XFS_WRITE_LOG_RES(mp)	((mp)->m_reservations.tr_write)
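+
+/*
+ * Usage sketch (error handling elided): the write reservation is taken
+ * as a permanent one so the transaction can roll through
+ * xfs_bmap_finish():
+ *
+ *	error = xfs_trans_reserve(tp, nres, XFS_WRITE_LOG_RES(mp), 0,
+ *				  XFS_TRANS_PERM_LOG_RES,
+ *				  XFS_WRITE_LOG_COUNT);
+ */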
+
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
+	  (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
+	 ((4 * (mp)->m_sb.sb_sectsize) + \
+	  (4 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 4) + \
+	  (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
+
+#define XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
+
+/*
+ * In renaming a file we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *	of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_RENAME_LOG_RES(mp) \
+	(MAX( \
+	 ((4 * (mp)->m_sb.sb_inodesize) + \
+	  (2 * XFS_DIROP_LOG_RES(mp)) + \
+	  (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
+	 ((3 * (mp)->m_sb.sb_sectsize) + \
+	  (3 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 3) + \
+	  (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
+
+#define XFS_RENAME_LOG_RES(mp)	((mp)->m_reservations.tr_rename)
+
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_LINK_LOG_RES(mp) \
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  (mp)->m_sb.sb_inodesize + \
+	  XFS_DIROP_LOG_RES(mp) + \
+	  (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
+	 ((mp)->m_sb.sb_sectsize + \
+	  (mp)->m_sb.sb_sectsize + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	  (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
+
+#define XFS_LINK_LOG_RES(mp)	((mp)->m_reservations.tr_link)
+
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_REMOVE_LOG_RES(mp)	\
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  (mp)->m_sb.sb_inodesize + \
+	  XFS_DIROP_LOG_RES(mp) + \
+	  (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
+	 ((2 * (mp)->m_sb.sb_sectsize) + \
+	  (2 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
+	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
+
+#define XFS_REMOVE_LOG_RES(mp)	((mp)->m_reservations.tr_remove)
+
+/*
+ * For symlink we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: 1 block
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ *    the blocks for the symlink: 1 KB
+ * Or in the first xact we allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_SYMLINK_LOG_RES(mp)		\
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  (mp)->m_sb.sb_inodesize + \
+	  XFS_FSB_TO_B(mp, 1) + \
+	  XFS_DIROP_LOG_RES(mp) + \
+	  1024 + \
+	  (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
+	 (2 * (mp)->m_sb.sb_sectsize + \
+	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
+	  XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
+
+#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
+
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ * Or in the first xact we allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+#define XFS_CALC_CREATE_LOG_RES(mp)		\
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  (mp)->m_sb.sb_inodesize + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_FSB_TO_B(mp, 1) + \
+	  XFS_DIROP_LOG_RES(mp) + \
+	  (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
+	 (3 * (mp)->m_sb.sb_sectsize + \
+	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
+	  XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
+
+#define XFS_CREATE_LOG_RES(mp)	((mp)->m_reservations.tr_create)
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+#define XFS_CALC_MKDIR_LOG_RES(mp)	XFS_CALC_CREATE_LOG_RES(mp)
+
+#define XFS_MKDIR_LOG_RES(mp)	((mp)->m_reservations.tr_mkdir)
+
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the agi hash list and counters: sector size
+ *    the inode btree entry: block size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ */
+#define XFS_CALC_IFREE_LOG_RES(mp) \
+	((mp)->m_sb.sb_inodesize + \
+	 (mp)->m_sb.sb_sectsize + \
+	 (mp)->m_sb.sb_sectsize + \
+	 XFS_FSB_TO_B((mp), 1) + \
+	 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
+	 (128 * 5))
+
+#define XFS_IFREE_LOG_RES(mp)	((mp)->m_reservations.tr_ifree)
+
+/*
+ * When only changing the inode we log the inode and possibly the
+ * superblock.  We also add a bit of slop for the transaction stuff.
+ */
+#define XFS_CALC_ICHANGE_LOG_RES(mp)	((mp)->m_sb.sb_inodesize + \
+					 (mp)->m_sb.sb_sectsize + 512)
+
+#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
+
+/*
+ * Growing the data section of the filesystem.
+ *	superblock
+ *	agi and agf
+ *	allocation btrees
+ */
+#define XFS_CALC_GROWDATA_LOG_RES(mp) \
+	((mp)->m_sb.sb_sectsize * 3 + \
+	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
+
+#define XFS_GROWDATA_LOG_RES(mp)    ((mp)->m_reservations.tr_growdata)
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ *	superblock: sector size
+ *	agf of the ag from which the extent is allocated: sector size
+ *	bmap btree for bitmap/summary inode: max depth * blocksize
+ *	bitmap/summary inode: inode size
+ *	allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
+	(2 * (mp)->m_sb.sb_sectsize + \
+	 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
+	 (mp)->m_sb.sb_inodesize + \
+	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	 (128 * \
+	  (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
+	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
+
+#define XFS_GROWRTALLOC_LOG_RES(mp)	((mp)->m_reservations.tr_growrtalloc)
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ *	one bitmap/summary block: blocksize
+ */
+#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
+	((mp)->m_sb.sb_blocksize + 128)
+
+#define XFS_GROWRTZERO_LOG_RES(mp)	((mp)->m_reservations.tr_growrtzero)
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ *	superblock: sector size
+ *	bitmap inode: inode size
+ *	summary inode: inode size
+ *	one bitmap block: blocksize
+ *	summary blocks: new summary size
+ */
+#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
+	((mp)->m_sb.sb_sectsize + \
+	 2 * (mp)->m_sb.sb_inodesize + \
+	 (mp)->m_sb.sb_blocksize + \
+	 (mp)->m_rsumsize + \
+	 (128 * 5))
+
+#define XFS_GROWRTFREE_LOG_RES(mp)	((mp)->m_reservations.tr_growrtfree)
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *	inode
+ */
+#define XFS_CALC_SWRITE_LOG_RES(mp) \
+	((mp)->m_sb.sb_inodesize + 128)
+
+#define XFS_SWRITE_LOG_RES(mp)	((mp)->m_reservations.tr_swrite)
+
+/*
+ * Logging the inode timestamps on an fsync -- same as SWRITE
+ * as long as SWRITE logs the entire inode core
+ */
+#define XFS_FSYNC_TS_LOG_RES(mp)	((mp)->m_reservations.tr_swrite)
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ *	inode
+ */
+#define XFS_CALC_WRITEID_LOG_RES(mp) \
+	((mp)->m_sb.sb_inodesize + 128)
+
+#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ *	the inode being converted: inode size
+ *	agf block and superblock (for block allocation)
+ *	the new block (directory sized)
+ *	bmap blocks for the new directory block
+ *	allocation btrees
+ */
+#define XFS_CALC_ADDAFORK_LOG_RES(mp)	\
+	((mp)->m_sb.sb_inodesize + \
+	 (mp)->m_sb.sb_sectsize * 2 + \
+	 (mp)->m_dirblksize + \
+	 (XFS_DIR_IS_V1(mp) ? 0 : \
+	    XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1))) + \
+	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	 (128 * (4 + \
+		 (XFS_DIR_IS_V1(mp) ? 0 : \
+			 XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
+		 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
+
+#define XFS_ADDAFORK_LOG_RES(mp)	((mp)->m_reservations.tr_addafork)
+
+/*
+ * Removing the attribute fork of a file
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_ATTRINVAL_LOG_RES(mp)	\
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
+	  (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
+	 ((4 * (mp)->m_sb.sb_sectsize) + \
+	  (4 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 4) + \
+	  (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
+
+#define XFS_ATTRINVAL_LOG_RES(mp)	((mp)->m_reservations.tr_attrinval)
+
+/*
+ * Setting an attribute.
+ *	the inode getting the attribute
+ *	the superblock for allocations
+ *	the agfs extents are allocated from
+ *	the attribute btree * max depth
+ *	the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime.
+ */
+#define XFS_CALC_ATTRSET_LOG_RES(mp)	\
+	((mp)->m_sb.sb_inodesize + \
+	 (mp)->m_sb.sb_sectsize + \
+	  XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
+	  (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
+
+#define XFS_ATTRSET_LOG_RES(mp, ext)	\
+	((mp)->m_reservations.tr_attrset + \
+	 (ext * (mp)->m_sb.sb_sectsize) + \
+	 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
+	 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
+
+/*
+ * Removing an attribute.
+ *    the inode: inode size
+ *    the attribute btree could join: max depth * block size
+ *    the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_ATTRRM_LOG_RES(mp)	\
+	(MAX( \
+	  ((mp)->m_sb.sb_inodesize + \
+	  XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
+	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
+	  (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
+	 ((2 * (mp)->m_sb.sb_sectsize) + \
+	  (2 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
+	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
+
+#define XFS_ATTRRM_LOG_RES(mp)	((mp)->m_reservations.tr_attrrm)
+
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
+#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
+	((mp)->m_sb.sb_sectsize + 128)
+
+#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp)  ((mp)->m_reservations.tr_clearagi)
+
+
+/*
+ * Various log count values.
+ */
+#define XFS_DEFAULT_LOG_COUNT		1
+#define XFS_DEFAULT_PERM_LOG_COUNT	2
+#define XFS_ITRUNCATE_LOG_COUNT		2
+#define XFS_CREATE_LOG_COUNT		2
+#define XFS_MKDIR_LOG_COUNT		3
+#define XFS_SYMLINK_LOG_COUNT		3
+#define XFS_REMOVE_LOG_COUNT		2
+#define XFS_LINK_LOG_COUNT		2
+#define XFS_RENAME_LOG_COUNT		2
+#define XFS_WRITE_LOG_COUNT		2
+#define XFS_ADDAFORK_LOG_COUNT		2
+#define XFS_ATTRINVAL_LOG_COUNT		1
+#define XFS_ATTRSET_LOG_COUNT		3
+#define XFS_ATTRRM_LOG_COUNT		3
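+
+/*
+ * Roughly speaking, a permanent reservation of (log_res, log_count)
+ * lets a rolling transaction commit log_count times before it has to
+ * wait for fresh log space.  E.g. truncate takes its reservation as
+ * (sketch, error handling elided):
+ *
+ *	xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+ *			  XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT);
+ */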
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer
+ * reference count values.  These determine how hard the buffer
+ * cache tries to hold onto the buffer.
+ */
+#define XFS_AGF_REF		4
+#define XFS_AGI_REF		4
+#define XFS_AGFL_REF		3
+#define XFS_INO_BTREE_REF	3
+#define XFS_ALLOC_BTREE_REF	2
+#define XFS_BMAP_BTREE_REF	2
+#define XFS_DIR_BTREE_REF	2
+#define XFS_ATTR_BTREE_REF	1
+#define XFS_INO_REF		1
+#define XFS_DQUOT_REF		1
+
+#ifdef __KERNEL__
+/*
+ * XFS transaction mechanism exported interfaces that are
+ * actually macros.
+ */
+#define xfs_trans_get_log_res(tp)	((tp)->t_log_res)
+#define xfs_trans_get_log_count(tp)	((tp)->t_log_count)
+#define xfs_trans_get_block_res(tp)	((tp)->t_blk_res)
+#define xfs_trans_set_sync(tp)		((tp)->t_flags |= XFS_TRANS_SYNC)
+
+#ifdef DEBUG
+#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (long)d)
+#define xfs_trans_agflist_delta(tp, d)	((tp)->t_ag_flist_delta += (long)d)
+#define xfs_trans_agbtree_delta(tp, d)	((tp)->t_ag_btree_delta += (long)d)
+#else
+#define xfs_trans_agblocks_delta(tp, d)
+#define xfs_trans_agflist_delta(tp, d)
+#define xfs_trans_agbtree_delta(tp, d)
+#endif
+
+/*
+ * XFS transaction mechanism exported interfaces.
+ */
+void		xfs_trans_init(struct xfs_mount *);
+xfs_trans_t	*xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t	*xfs_trans_dup(xfs_trans_t *);
+int		xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
+				  uint, uint);
+void		xfs_trans_callback(xfs_trans_t *,
+				   void (*)(xfs_trans_t *, void *), void *);
+void		xfs_trans_mod_sb(xfs_trans_t *, uint, long);
+struct xfs_buf	*xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t,
+				   int, uint);
+int		xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *,
+				   struct xfs_buftarg *, xfs_daddr_t, int, uint,
+				   struct xfs_buf **);
+struct xfs_buf	*xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
+
+void		xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bhold_until_committed(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
+void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
+int		xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
+			       xfs_ino_t , uint, struct xfs_inode **);
+void		xfs_trans_iput(xfs_trans_t *, struct xfs_inode *, uint);
+void		xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint);
+void		xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *);
+void		xfs_trans_ihold_release(xfs_trans_t *, struct xfs_inode *);
+void		xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
+void		xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
+struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
+void		xfs_efi_release(struct xfs_efi_log_item *, uint);
+void		xfs_trans_log_efi_extent(xfs_trans_t *,
+					 struct xfs_efi_log_item *,
+					 xfs_fsblock_t,
+					 xfs_extlen_t);
+struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *,
+				  struct xfs_efi_log_item *,
+				  uint);
+void		xfs_trans_log_efd_extent(xfs_trans_t *,
+					 struct xfs_efd_log_item *,
+					 xfs_fsblock_t,
+					 xfs_extlen_t);
+void		xfs_trans_log_create_rpc(xfs_trans_t *, int, xfs_ino_t);
+void		xfs_trans_log_setattr_rpc(xfs_trans_t *, int);
+int		xfs_trans_commit(xfs_trans_t *, uint flags, xfs_lsn_t *);
+void		xfs_trans_commit_async(struct xfs_mount *);
+void		xfs_trans_cancel(xfs_trans_t *, int);
+void		xfs_trans_ail_init(struct xfs_mount *);
+xfs_lsn_t	xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
+xfs_lsn_t	xfs_trans_tail_ail(struct xfs_mount *);
+void		xfs_trans_unlocked_item(struct xfs_mount *,
+					xfs_log_item_t *);
+xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
+					xfs_agnumber_t ag,
+					xfs_extlen_t idx);
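+
+/*
+ * A typical transaction runs through alloc/reserve/modify/commit.
+ * Sketch (error handling elided; xfs_ilock(), XFS_ILOCK_EXCL and
+ * XFS_ILOG_CORE come from the inode code):
+ *
+ *	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+ *	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0,
+ *				  0, XFS_DEFAULT_LOG_COUNT);
+ *	xfs_ilock(ip, XFS_ILOCK_EXCL);
+ *	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ *	... modify the inode ...
+ *	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ *	error = xfs_trans_commit(tp, 0, NULL);
+ */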
+
+/*
+ * Not necessarily exported, but used outside a single file.
+ */
+int		xfs_trans_lsn_danger(struct xfs_mount *, xfs_lsn_t);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_TRANS_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_ail.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_ail.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_ail.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_ail.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,588 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *);
+STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *);
+
+#ifdef XFSDEBUG
+STATIC void xfs_ail_check(xfs_ail_entry_t *);
+#else
+#define xfs_ail_check(a)
+#endif /* XFSDEBUG */
+
+
+/*
+ * This is called by the log manager code to determine the LSN
+ * of the tail of the log.  This is exactly the LSN of the first
+ * item in the AIL.  If the AIL is empty, then this function
+ * returns 0.
+ *
+ * We need the AIL lock in order to get a coherent read of the
+ * lsn of the first item in the AIL.
+ */
+xfs_lsn_t
+xfs_trans_tail_ail(
+	xfs_mount_t	*mp)
+{
+	xfs_lsn_t	lsn;
+	xfs_log_item_t	*lip;
+	SPLDECL(s);
+
+	AIL_LOCK(mp,s);
+	lip = xfs_ail_min(&(mp->m_ail));
+	if (lip == NULL) {
+		lsn = (xfs_lsn_t)0;
+	} else {
+		lsn = lip->li_lsn;
+	}
+	AIL_UNLOCK(mp, s);
+
+	return lsn;
+}
+
+/*
+ * xfs_trans_push_ail
+ *
+ * This routine is called to move the tail of the AIL
+ * forward.  It does this by trying to flush items in the AIL
+ * whose lsns are below the given threshold_lsn.
+ *
+ * The routine returns the lsn of the tail of the log.
+ */
+xfs_lsn_t
+xfs_trans_push_ail(
+	xfs_mount_t		*mp,
+	xfs_lsn_t		threshold_lsn)
+{
+	xfs_lsn_t		lsn;
+	xfs_log_item_t		*lip;
+	int			gen;
+	int			restarts;
+	int			lock_result;
+	int			flush_log;
+	SPLDECL(s);
+
+#define XFS_TRANS_PUSH_AIL_RESTARTS	10
+
+	AIL_LOCK(mp,s);
+	lip = xfs_trans_first_ail(mp, &gen);
+	if (lip == NULL || XFS_FORCED_SHUTDOWN(mp)) {
+		/*
+		 * Just return if the AIL is empty.
+		 */
+		AIL_UNLOCK(mp, s);
+		return (xfs_lsn_t)0;
+	}
+
+	XFS_STATS_INC(xfsstats.xs_push_ail);
+
+	/*
+	 * While the item we are looking at is below the given threshold
+	 * try to flush it out.	 Make sure to limit the number of times
+	 * we allow xfs_trans_next_ail() to restart scanning from the
+	 * beginning of the list.  We'd like not to stop until we've at least
+	 * tried to push on everything in the AIL with an LSN less than
+	 * the given threshold. However, we may give up before that if
+	 * we realize that we've been holding the AIL_LOCK for 'too long',
+	 * blocking interrupts.  Currently, 'too long' is roughly 500us.
+	 */
+	flush_log = 0;
+	restarts = 0;
+	while (((restarts < XFS_TRANS_PUSH_AIL_RESTARTS) &&
+		(XFS_LSN_CMP(lip->li_lsn, threshold_lsn) < 0))) {
+		/*
+		 * If we can lock the item without sleeping, unlock
+		 * the AIL lock and flush the item.  Then re-grab the
+		 * AIL lock so we can look for the next item on the
+		 * AIL.	 Since we unlock the AIL while we flush the
+		 * item, the next routine may start over again at the
+		 * beginning of the list if anything has changed.
+		 * That is what the generation count is for.
+		 *
+		 * If we can't lock the item, either its holder will flush
+		 * it or it is already being flushed or it is being relogged.
+		 * In any of these cases it is being taken care of and we
+		 * can just skip to the next item in the list.
+		 */
+		lock_result = IOP_TRYLOCK(lip);
+		switch (lock_result) {
+		      case XFS_ITEM_SUCCESS:
+			AIL_UNLOCK(mp, s);
+			XFS_STATS_INC(xfsstats.xs_push_ail_success);
+			IOP_PUSH(lip);
+			AIL_LOCK(mp,s);
+			break;
+
+		      case XFS_ITEM_PUSHBUF:
+			AIL_UNLOCK(mp, s);
+			XFS_STATS_INC(xfsstats.xs_push_ail_pushbuf);
+#ifdef XFSRACEDEBUG
+			delay_for_intr();
+			delay(300);
+#endif
+			ASSERT(lip);
+			ASSERT(lip->li_ops->iop_pushbuf);
+			IOP_PUSHBUF(lip);
+			AIL_LOCK(mp,s);
+			break;
+
+		      case XFS_ITEM_PINNED:
+			XFS_STATS_INC(xfsstats.xs_push_ail_pinned);
+			flush_log = 1;
+			break;
+
+		      case XFS_ITEM_LOCKED:
+			XFS_STATS_INC(xfsstats.xs_push_ail_locked);
+			break;
+
+		      case XFS_ITEM_FLUSHING:
+			XFS_STATS_INC(xfsstats.xs_push_ail_flushing);
+			break;
+
+		      default:
+			ASSERT(0);
+			break;
+		}
+
+		lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
+		if (lip == NULL) {
+			break;
+		}
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			/*
+			 * Just return if we shut down during the last try.
+			 */
+			AIL_UNLOCK(mp, s);
+			return (xfs_lsn_t)0;
+		}
+
+	}
+
+	if (flush_log) {
+		/*
+		 * If something we need to push out was pinned, then
+		 * push out the log so it will become unpinned and
+		 * move forward in the AIL.
+		 */
+		AIL_UNLOCK(mp, s);
+		XFS_STATS_INC(xfsstats.xs_push_ail_flush);
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+		AIL_LOCK(mp, s);
+	}
+
+	lip = xfs_ail_min(&(mp->m_ail));
+	if (lip == NULL) {
+		lsn = (xfs_lsn_t)0;
+	} else {
+		lsn = lip->li_lsn;
+	}
+
+	AIL_UNLOCK(mp, s);
+	return lsn;
+}	/* xfs_trans_push_ail */
+
+
+/*
+ * This is to be called when an item is unlocked that may have
+ * been in the AIL.  It will wake up the first member of the AIL
+ * wait list if this item's unlocking might allow it to progress.
+ * If the item is in the AIL, then we need to get the AIL lock
+ * while doing our checking so we don't race with someone going
+ * to sleep waiting for this event in xfs_trans_push_ail().
+ */
+void
+xfs_trans_unlocked_item(
+	xfs_mount_t	*mp,
+	xfs_log_item_t	*lip)
+{
+	xfs_log_item_t	*min_lip;
+
+	/*
+	 * If we're forcibly shutting down, we may have
+	 * unlocked log items arbitrarily. The last thing
+	 * we want to do is to move the tail of the log
+	 * over some potentially valid data.
+	 */
+	if (!(lip->li_flags & XFS_LI_IN_AIL) ||
+	    XFS_FORCED_SHUTDOWN(mp)) {
+		return;
+	}
+
+	/*
+	 * This is the one case where we can call into xfs_ail_min()
+	 * without holding the AIL lock because we only care about the
+	 * case where we are at the tail of the AIL.  If the object isn't
+	 * at the tail, it doesn't matter what result we get back.  This
+	 * is slightly racy: since we were just unlocked, we could
+	 * go to sleep between the call to xfs_ail_min and the call to
+	 * xfs_log_move_tail, have someone else lock us, commit us to disk,
+	 * move us out of the tail of the AIL, and then we wake up.  However,
+	 * the call to xfs_log_move_tail() doesn't do anything if there's
+	 * not enough free space to wake people up so we're safe calling it.
+	 */
+	min_lip = xfs_ail_min(&mp->m_ail);
+
+	if (min_lip == lip)
+		xfs_log_move_tail(mp, 1);
+}	/* xfs_trans_unlocked_item */
+
+
+/*
+ * Update the position of the item in the AIL with the new
+ * lsn.	 If it is not yet in the AIL, add it.  Otherwise, move
+ * it to its new position by removing it and re-adding it.
+ *
+ * Wake up anyone with an lsn less than the item's lsn.  If the item
+ * we move in the AIL is the minimum one, update the tail lsn in the
+ * log manager.
+ *
+ * Increment the AIL's generation count to indicate that the tree
+ * has changed.
+ *
+ * This function must be called with the AIL lock held.	 The lock
+ * is dropped before returning, so the caller must pass in the
+ * cookie returned by AIL_LOCK.
+ */
+void
+xfs_trans_update_ail(
+	xfs_mount_t	*mp,
+	xfs_log_item_t	*lip,
+	xfs_lsn_t	lsn,
+	unsigned long	s)
+{
+	xfs_ail_entry_t		*ailp;
+	xfs_log_item_t		*dlip=NULL;
+	xfs_log_item_t		*mlip;	/* ptr to minimum lip */
+
+	ailp = &(mp->m_ail);
+	mlip = xfs_ail_min(ailp);
+
+	if (lip->li_flags & XFS_LI_IN_AIL) {
+		dlip = xfs_ail_delete(ailp, lip);
+		ASSERT(dlip == lip);
+	} else {
+		lip->li_flags |= XFS_LI_IN_AIL;
+	}
+
+	lip->li_lsn = lsn;
+
+	xfs_ail_insert(ailp, lip);
+	mp->m_ail_gen++;
+
+	if (mlip == dlip) {
+		mlip = xfs_ail_min(&(mp->m_ail));
+		AIL_UNLOCK(mp, s);
+		xfs_log_move_tail(mp, mlip->li_lsn);
+	} else {
+		AIL_UNLOCK(mp, s);
+	}
+}	/* xfs_trans_update_ail */
+
+/*
+ * Delete the given item from the AIL.	It must already be in
+ * the AIL.
+ *
+ * Wake up anyone with an lsn less than the item's lsn.  If the item
+ * we delete in the AIL is the minimum one, update the tail lsn in the
+ * log manager.
+ *
+ * Clear the IN_AIL flag from the item, reset its lsn to 0, and
+ * bump the AIL's generation count to indicate that the tree
+ * has changed.
+ *
+ * This function must be called with the AIL lock held.	 The lock
+ * is dropped before returning, so the caller must pass in the
+ * cookie returned by AIL_LOCK.
+ */
+void
+xfs_trans_delete_ail(
+	xfs_mount_t	*mp,
+	xfs_log_item_t	*lip,
+	unsigned long	s)
+{
+	xfs_ail_entry_t		*ailp;
+	xfs_log_item_t		*dlip;
+	xfs_log_item_t		*mlip;
+
+	if (lip->li_flags & XFS_LI_IN_AIL) {
+		ailp = &(mp->m_ail);
+		mlip = xfs_ail_min(ailp);
+		dlip = xfs_ail_delete(ailp, lip);
+		ASSERT(dlip == lip);
+
+
+		lip->li_flags &= ~XFS_LI_IN_AIL;
+		lip->li_lsn = 0;
+		mp->m_ail_gen++;
+
+		if (mlip == dlip) {
+			mlip = xfs_ail_min(&(mp->m_ail));
+			AIL_UNLOCK(mp, s);
+			xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
+		} else {
+			AIL_UNLOCK(mp, s);
+		}
+	}
+	else {
+		/*
+		 * If the file system is not being shutdown, we are in
+		 * serious trouble if we get to this stage.
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp))
+			AIL_UNLOCK(mp, s);
+		else {
+			xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
+				"xfs_trans_delete_ail: attempting to delete a log item that is not in the AIL");
+			xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+			AIL_UNLOCK(mp, s);
+		}
+	}
+}
+
+
+
+/*
+ * Return the item in the AIL with the smallest lsn.
+ * Return the current tree generation number for use
+ * in calls to xfs_trans_next_ail().
+ */
+xfs_log_item_t *
+xfs_trans_first_ail(
+	xfs_mount_t	*mp,
+	int		*gen)
+{
+	xfs_log_item_t	*lip;
+
+	lip = xfs_ail_min(&(mp->m_ail));
+	*gen = (int)mp->m_ail_gen;
+
+	return (lip);
+}
+
+/*
+ * If the generation count of the tree has not changed since the
+ * caller last took something from the AIL, then return the element
+ * in the tree which follows the one given.  If the count has changed,
+ * then return the minimum element of the AIL and bump the restarts counter
+ * if one is given.
+ */
+xfs_log_item_t *
+xfs_trans_next_ail(
+	xfs_mount_t	*mp,
+	xfs_log_item_t	*lip,
+	int		*gen,
+	int		*restarts)
+{
+	xfs_log_item_t	*nlip;
+
+	ASSERT(mp && lip && gen);
+	if (mp->m_ail_gen == *gen) {
+		nlip = xfs_ail_next(&(mp->m_ail), lip);
+	} else {
+		nlip = xfs_ail_min(&(mp->m_ail));
+		*gen = (int)mp->m_ail_gen;
+		if (restarts != NULL) {
+			XFS_STATS_INC(xfsstats.xs_push_ail_restarts);
+			(*restarts)++;
+		}
+	}
+
+	return (nlip);
+}
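+
+/*
+ * Together these give the AIL scan pattern used above (sketch, run
+ * with the AIL lock held):
+ *
+ *	lip = xfs_trans_first_ail(mp, &gen);
+ *	while (lip != NULL && restarts < limit) {
+ *		... examine or push lip ...
+ *		lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
+ *	}
+ */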
+
+
+/*
+ * The active item list (AIL) is a doubly linked list of log
+ * items sorted by ascending lsn.  The base of the list is
+ * a forw/back pointer pair embedded in the xfs mount structure.
+ * The base is initialized with both pointers pointing to the
+ * base.  This case always needs to be distinguished, because
+ * the base has no lsn to look at.  We almost always insert
+ * at the end of the list, so on inserts we search from the
+ * end of the list to find where the new item belongs.
+ */
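+
+/*
+ * Pictorially (sketch), with items A and B where lsn(A) <= lsn(B):
+ *
+ *	empty:	base.ail_forw and base.ail_back both point at base
+ *	else:	base <-> A <-> B <-> base   (ail_forw runs left to right)
+ */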
+
+/*
+ * Initialize the doubly linked list to point only to itself.
+ */
+void
+xfs_trans_ail_init(
+	xfs_mount_t	*mp)
+{
+	mp->m_ail.ail_forw = (xfs_log_item_t*)&(mp->m_ail);
+	mp->m_ail.ail_back = (xfs_log_item_t*)&(mp->m_ail);
+}
+
+/*
+ * Insert the given log item into the AIL.
+ * We almost always insert at the end of the list, so on inserts
+ * we search from the end of the list to find where the
+ * new item belongs.
+ */
+STATIC void
+xfs_ail_insert(
+	xfs_ail_entry_t *base,
+	xfs_log_item_t	*lip)
+/* ARGSUSED */
+{
+	xfs_log_item_t	*next_lip;
+
+	/*
+	 * If the list is empty, just insert the item.
+	 */
+	if (base->ail_back == (xfs_log_item_t*)base) {
+		base->ail_forw = lip;
+		base->ail_back = lip;
+		lip->li_ail.ail_forw = (xfs_log_item_t*)base;
+		lip->li_ail.ail_back = (xfs_log_item_t*)base;
+		return;
+	}
+
+	next_lip = base->ail_back;
+	while ((next_lip != (xfs_log_item_t*)base) &&
+	       (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) {
+		next_lip = next_lip->li_ail.ail_back;
+	}
+	ASSERT((next_lip == (xfs_log_item_t*)base) ||
+	       (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
+	lip->li_ail.ail_forw = next_lip->li_ail.ail_forw;
+	lip->li_ail.ail_back = next_lip;
+	next_lip->li_ail.ail_forw = lip;
+	lip->li_ail.ail_forw->li_ail.ail_back = lip;
+
+	xfs_ail_check(base);
+	return;
+}
+
+/*
+ * Delete the given item from the AIL.	Return a pointer to the item.
+ */
+/*ARGSUSED*/
+STATIC xfs_log_item_t *
+xfs_ail_delete(
+	xfs_ail_entry_t *base,
+	xfs_log_item_t	*lip)
+{
+	lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back;
+	lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw;
+	lip->li_ail.ail_forw = NULL;
+	lip->li_ail.ail_back = NULL;
+
+	xfs_ail_check(base);
+	return lip;
+}
+
+/*
+ * Return a pointer to the first item in the AIL.
+ * If the AIL is empty, then return NULL.
+ */
+STATIC xfs_log_item_t *
+xfs_ail_min(
+	xfs_ail_entry_t *base)
+/* ARGSUSED */
+{
+	register xfs_log_item_t *forw = base->ail_forw;
+	if (forw == (xfs_log_item_t*)base) {
+		return NULL;
+	}
+	return forw;
+}
+
+/*
+ * Return a pointer to the item which follows
+ * the given item in the AIL.  If the given item
+ * is the last item in the list, then return NULL.
+ */
+STATIC xfs_log_item_t *
+xfs_ail_next(
+	xfs_ail_entry_t *base,
+	xfs_log_item_t	*lip)
+/* ARGSUSED */
+{
+	if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) {
+		return NULL;
+	}
+	return lip->li_ail.ail_forw;
+
+}
+
+#ifdef XFSDEBUG
+/*
+ * Check that the list is sorted as it should be.
+ */
+STATIC void
+xfs_ail_check(
+	xfs_ail_entry_t *base)
+{
+	xfs_log_item_t	*lip;
+	xfs_log_item_t	*prev_lip;
+
+	lip = base->ail_forw;
+	if (lip == (xfs_log_item_t*)base) {
+		/*
+		 * Make sure the pointers are correct when the list
+		 * is empty.
+		 */
+		ASSERT(base->ail_back == (xfs_log_item_t*)base);
+		return;
+	}
+
+	/*
+	 * Walk the list checking forward and backward pointers,
+	 * lsn ordering, and that every entry has the XFS_LI_IN_AIL
+	 * flag set.
+	 */
+	prev_lip = (xfs_log_item_t*)base;
+	while (lip != (xfs_log_item_t*)base) {
+		if (prev_lip != (xfs_log_item_t*)base) {
+			ASSERT(prev_lip->li_ail.ail_forw == lip);
+			ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
+		}
+		ASSERT(lip->li_ail.ail_back == prev_lip);
+		ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
+		prev_lip = lip;
+		lip = lip->li_ail.ail_forw;
+	}
+	ASSERT(lip == (xfs_log_item_t*)base);
+	ASSERT(base->ail_back == prev_lip);
+}
+#endif /* XFSDEBUG */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_buf.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_buf.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_buf.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_buf.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1085 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
+		xfs_daddr_t, int);
+STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
+		xfs_daddr_t, int);
+
+
+/*
+ * Get and lock the buffer for the caller if it is not already
+ * locked within the given transaction.	 If it is already locked
+ * within the transaction, just increment its lock recursion count
+ * and return a pointer to it.
+ *
+ * Use the fast path function xfs_trans_buf_item_match() or the buffer
+ * cache routine incore_match() to find the buffer
+ * if it is already owned by this transaction.
+ *
+ * If we don't already own the buffer, use get_buf() to get it.
+ * If it doesn't yet have an associated xfs_buf_log_item structure,
+ * then allocate one and add the item to this transaction.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * get_buf() call.
+ */
+xfs_buf_t *
+xfs_trans_get_buf(xfs_trans_t	*tp,
+		  xfs_buftarg_t	*target_dev,
+		  xfs_daddr_t	blkno,
+		  int		len,
+		  uint		flags)
+{
+	xfs_buf_t		*bp;
+	xfs_buf_log_item_t	*bip;
+
+	if (flags == 0)
+		flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
+
+	/*
+	 * Default to a normal get_buf() call if the tp is NULL.
+	 */
+	if (tp == NULL) {
+		bp = xfs_buf_get_flags(target_dev, blkno, len,
+							flags | BUF_BUSY);
+		return(bp);
+	}
+
+	/*
+	 * If we find the buffer in the cache with this transaction
+	 * pointer in its b_fsprivate2 field, then we know we already
+	 * have it locked.  In this case we just increment the lock
+	 * recursion count and return the buffer to the caller.
+	 */
+	if (tp->t_items.lic_next == NULL) {
+		bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
+	} else {
+		bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
+	}
+	if (bp != NULL) {
+		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+		if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
+			xfs_buftrace("TRANS GET RECUR SHUT", bp);
+			XFS_BUF_SUPER_STALE(bp);
+		}
+		/*
+		 * If the buffer is stale then it was binval'ed
+		 * since last read.  This doesn't matter since the
+		 * caller isn't allowed to use the data anyway.
+		 */
+		else if (XFS_BUF_ISSTALE(bp)) {
+			xfs_buftrace("TRANS GET RECUR STALE", bp);
+			ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
+		}
+		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+		bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+		ASSERT(bip != NULL);
+		ASSERT(atomic_read(&bip->bli_refcount) > 0);
+		bip->bli_recur++;
+		xfs_buftrace("TRANS GET RECUR", bp);
+		xfs_buf_item_trace("GET RECUR", bip);
+		return (bp);
+	}
+
+	/*
+	 * We always specify the BUF_BUSY flag within a transaction so
+	 * that get_buf does not try to push out a delayed write buffer
+	 * which might cause another transaction to take place (if the
+	 * buffer was delayed alloc).  Such recursive transactions can
+	 * easily deadlock with our current transaction as well as cause
+	 * us to run out of stack space.
+	 */
+	bp = xfs_buf_get_flags(target_dev, blkno, len, flags | BUF_BUSY);
+	if (bp == NULL) {
+		return NULL;
+	}
+
+	ASSERT(!XFS_BUF_GETERROR(bp));
+
+	/*
+	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
+	 * it doesn't have one yet, then allocate one and initialize it.
+	 * The checks to see if one is there are in xfs_buf_item_init().
+	 */
+	xfs_buf_item_init(bp, tp->t_mountp);
+
+	/*
+	 * Set the recursion count for the buffer within this transaction
+	 * to 0.
+	 */
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	bip->bli_recur = 0;
+
+	/*
+	 * Take a reference for this transaction on the buf item.
+	 */
+	atomic_inc(&bip->bli_refcount);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
+
+	/*
+	 * Initialize b_fsprivate2 so we can find it with incore_match()
+	 * above.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+	xfs_buftrace("TRANS GET", bp);
+	xfs_buf_item_trace("GET", bip);
+	return (bp);
+}
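+
+/*
+ * Illustrative sketch only, not part of the original patch: the
+ * typical caller pattern for xfs_trans_get_buf().  The guard macro
+ * XFS_EXAMPLE_SKETCHES and the function name are hypothetical, and
+ * the caller is assumed to already hold a reserved transaction.
+ */
+#ifdef XFS_EXAMPLE_SKETCHES
+STATIC int
+xfs_example_get_and_log(
+	xfs_trans_t	*tp,
+	xfs_buftarg_t	*target,
+	xfs_daddr_t	blkno,
+	int		len)
+{
+	xfs_buf_t	*bp;
+
+	/* Lock the buffer within tp; recursive gets just bump bli_recur. */
+	bp = xfs_trans_get_buf(tp, target, blkno, len, 0);
+	if (bp == NULL)
+		return XFS_ERROR(ENOMEM);
+
+	/* ... initialize the new buffer contents here ... */
+
+	/* Mark the whole buffer dirty so that the commit logs it. */
+	xfs_trans_log_buf(tp, bp, 0, XFS_BUF_COUNT(bp) - 1);
+	return 0;
+}
+#endif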
+
+/*
+ * Get and lock the superblock buffer of this file system for the
+ * given transaction.
+ *
+ * We don't need to use incore_match() here, because the superblock
+ * buffer is a private buffer which we keep a pointer to in the
+ * mount structure.
+ */
+xfs_buf_t *
+xfs_trans_getsb(xfs_trans_t	*tp,
+		struct xfs_mount *mp,
+		int		flags)
+{
+	xfs_buf_t		*bp;
+	xfs_buf_log_item_t	*bip;
+
+	/*
+	 * Default to just trying to lock the superblock buffer
+	 * if tp is NULL.
+	 */
+	if (tp == NULL) {
+		return (xfs_getsb(mp, flags));
+	}
+
+	/*
+	 * If the superblock buffer already has this transaction
+	 * pointer in its b_fsprivate2 field, then we know we already
+	 * have it locked.  In this case we just increment the lock
+	 * recursion count and return the buffer to the caller.
+	 */
+	bp = mp->m_sb_bp;
+	if (XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp) {
+		bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+		ASSERT(bip != NULL);
+		ASSERT(atomic_read(&bip->bli_refcount) > 0);
+		bip->bli_recur++;
+		xfs_buf_item_trace("GETSB RECUR", bip);
+		return (bp);
+	}
+
+	bp = xfs_getsb(mp, flags);
+	if (bp == NULL) {
+		return NULL;
+	}
+
+	/*
+	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
+	 * it doesn't have one yet, then allocate one and initialize it.
+	 * The checks to see if one is there are in xfs_buf_item_init().
+	 */
+	xfs_buf_item_init(bp, mp);
+
+	/*
+	 * Set the recursion count for the buffer within this transaction
+	 * to 0.
+	 */
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	bip->bli_recur = 0;
+
+	/*
+	 * Take a reference for this transaction on the buf item.
+	 */
+	atomic_inc(&bip->bli_refcount);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
+
+	/*
+	 * Initialize b_fsprivate2 so we can find it with incore_match()
+	 * above.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+	xfs_buf_item_trace("GETSB", bip);
+	return (bp);
+}
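+
+/*
+ * Illustrative sketch only, not part of the original patch: dirtying
+ * the superblock under a transaction.  XFS_EXAMPLE_SKETCHES and the
+ * function name are hypothetical; real callers compute first/last
+ * from the field being changed (cf. xfs_trans_mod_sb()).
+ */
+#ifdef XFS_EXAMPLE_SKETCHES
+STATIC void
+xfs_example_dirty_sb(
+	xfs_trans_t	*tp,
+	xfs_mount_t	*mp,
+	uint		first,
+	uint		last)
+{
+	xfs_buf_t	*bp;
+
+	/* Returns the locked superblock buffer, recursion-safe. */
+	bp = xfs_trans_getsb(tp, mp, 0);
+	if (bp == NULL)
+		return;
+
+	/* ... update the on-disk superblock image in bp here ... */
+
+	xfs_trans_log_buf(tp, bp, first, last);
+}
+#endif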
+
+#ifdef DEBUG
+dev_t	xfs_error_dev = 0;
+int	xfs_do_error;
+int	xfs_req_num;
+int	xfs_error_mod = 33;
+#endif
+
+/*
+ * Get and lock the buffer for the caller if it is not already
+ * locked within the given transaction.	 If it has not yet been
+ * read in, read it from disk. If it is already locked
+ * within the transaction and already read in, just increment its
+ * lock recursion count and return a pointer to it.
+ *
+ * Use the fast path function xfs_trans_buf_item_match() or the buffer
+ * cache routine incore_match() to find the buffer
+ * if it is already owned by this transaction.
+ *
+ * If we don't already own the buffer, use read_buf() to get it.
+ * If it doesn't yet have an associated xfs_buf_log_item structure,
+ * then allocate one and add the item to this transaction.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * read_buf() call.
+ */
+int
+xfs_trans_read_buf(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_buftarg_t	*target,
+	xfs_daddr_t	blkno,
+	int		len,
+	uint		flags,
+	xfs_buf_t	**bpp)
+{
+	xfs_buf_t		*bp;
+	xfs_buf_log_item_t	*bip;
+	int			error;
+
+	if (flags == 0)
+		flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
+
+	/*
+	 * Default to a normal get_buf() call if the tp is NULL.
+	 */
+	if (tp == NULL) {
+		bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
+		if (!bp)
+			return XFS_ERROR(ENOMEM);
+
+		if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
+			xfs_ioerror_alert("xfs_trans_read_buf", mp,
+					  bp, blkno);
+			error = XFS_BUF_GETERROR(bp);
+			xfs_buf_relse(bp);
+			return error;
+		}
+#ifdef DEBUG
+		if (xfs_do_error && (bp != NULL)) {
+			if (xfs_error_dev == target->pbr_dev) {
+				if (((xfs_req_num++) % xfs_error_mod) == 0) {
+					xfs_buf_relse(bp);
+					printk("Returning error!\n");
+					return XFS_ERROR(EIO);
+				}
+			}
+		}
+#endif
+		if (XFS_FORCED_SHUTDOWN(mp))
+			goto shutdown_abort;
+		*bpp = bp;
+		return 0;
+	}
+
+	/*
+	 * If we find the buffer in the cache with this transaction
+	 * pointer in its b_fsprivate2 field, then we know we already
+	 * have it locked.  If it is already read in we just increment
+	 * the lock recursion count and return the buffer to the caller.
+	 * If the buffer is not yet read in, then we read it in, increment
+	 * the lock recursion count, and return it to the caller.
+	 */
+	if (tp->t_items.lic_next == NULL) {
+		bp = xfs_trans_buf_item_match(tp, target, blkno, len);
+	} else {
+		bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
+	}
+	if (bp != NULL) {
+		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+		ASSERT((XFS_BUF_ISERROR(bp)) == 0);
+		if (!(XFS_BUF_ISDONE(bp))) {
+			xfs_buftrace("READ_BUF_INCORE !DONE", bp);
+			ASSERT(!XFS_BUF_ISASYNC(bp));
+			XFS_BUF_READ(bp);
+			xfsbdstrat(tp->t_mountp, bp);
+			xfs_iowait(bp);
+			if (XFS_BUF_GETERROR(bp) != 0) {
+				xfs_ioerror_alert("xfs_trans_read_buf", mp,
+						  bp, blkno);
+				error = XFS_BUF_GETERROR(bp);
+				xfs_buf_relse(bp);
+				/*
+				 * We can gracefully recover from most
+				 * read errors. Ones we can't are those
+				 * that happen after the transaction's
+				 * already dirty.
+				 */
+				if (tp->t_flags & XFS_TRANS_DIRTY)
+					xfs_force_shutdown(tp->t_mountp,
+							   XFS_METADATA_IO_ERROR);
+				return error;
+			}
+		}
+		/*
+		 * We never locked this buf ourselves, so we shouldn't
+		 * brelse it either. Just get out.
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			xfs_buftrace("READ_BUF_INCORE XFSSHUTDN", bp);
+			*bpp = NULL;
+			return XFS_ERROR(EIO);
+		}
+
+
+		bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+		bip->bli_recur++;
+
+		ASSERT(atomic_read(&bip->bli_refcount) > 0);
+		xfs_buf_item_trace("READ RECUR", bip);
+		*bpp = bp;
+		return 0;
+	}
+
+	/*
+	 * We always specify the BUF_BUSY flag within a transaction so
+	 * that get_buf does not try to push out a delayed write buffer
+	 * which might cause another transaction to take place (if the
+	 * buffer was delayed alloc).  Such recursive transactions can
+	 * easily deadlock with our current transaction as well as cause
+	 * us to run out of stack space.
+	 */
+	bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
+	if (bp == NULL) {
+		*bpp = NULL;
+		return 0;
+	}
+	if (XFS_BUF_GETERROR(bp) != 0) {
+		XFS_BUF_SUPER_STALE(bp);
+		xfs_buftrace("READ ERROR", bp);
+		error = XFS_BUF_GETERROR(bp);
+
+		xfs_ioerror_alert("xfs_trans_read_buf", mp,
+				  bp, blkno);
+		if (tp->t_flags & XFS_TRANS_DIRTY)
+			xfs_force_shutdown(tp->t_mountp, XFS_METADATA_IO_ERROR);
+		xfs_buf_relse(bp);
+		return error;
+	}
+#ifdef DEBUG
+	if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) {
+		if (xfs_error_dev == target->pbr_dev) {
+			if (((xfs_req_num++) % xfs_error_mod) == 0) {
+				xfs_force_shutdown(tp->t_mountp,
+						   XFS_METADATA_IO_ERROR);
+				xfs_buf_relse(bp);
+				printk("Returning error in trans!\n");
+				return XFS_ERROR(EIO);
+			}
+		}
+	}
+#endif
+	if (XFS_FORCED_SHUTDOWN(mp))
+		goto shutdown_abort;
+
+	/*
+	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
+	 * it doesn't have one yet, then allocate one and initialize it.
+	 * The checks to see if one is there are in xfs_buf_item_init().
+	 */
+	xfs_buf_item_init(bp, tp->t_mountp);
+
+	/*
+	 * Set the recursion count for the buffer within this transaction
+	 * to 0.
+	 */
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	bip->bli_recur = 0;
+
+	/*
+	 * Take a reference for this transaction on the buf item.
+	 */
+	atomic_inc(&bip->bli_refcount);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
+
+	/*
+	 * Initialize b_fsprivate2 so we can find it with incore_match()
+	 * above.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+	xfs_buftrace("TRANS READ", bp);
+	xfs_buf_item_trace("READ", bip);
+	*bpp = bp;
+	return 0;
+
+shutdown_abort:
+	/*
+	 * the theory here is that the buffer is good but we're
+	 * bailing out because the filesystem is being forcibly
+	 * shut down.  So we should leave the b_flags alone since
+	 * the buffer's not staled and just get out.
+	 */
+#if defined(DEBUG)
+	if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
+		cmn_err(CE_NOTE, "about to pop assert, bp == 0x%x\n", bp);
+#endif
+	ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) !=
+						(XFS_B_STALE|XFS_B_DELWRI));
+
+	xfs_buftrace("READ_BUF XFSSHUTDN", bp);
+	xfs_buf_relse(bp);
+	*bpp = NULL;
+	return XFS_ERROR(EIO);
+}
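+
+/*
+ * Illustrative sketch only, not part of the original patch: the
+ * read-modify-log pattern built on xfs_trans_read_buf().  Note that
+ * a zero return can still leave *bpp NULL, so both must be checked.
+ * XFS_EXAMPLE_SKETCHES and the function name are hypothetical.
+ */
+#ifdef XFS_EXAMPLE_SKETCHES
+STATIC int
+xfs_example_read_modify(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_buftarg_t	*target,
+	xfs_daddr_t	blkno,
+	int		len)
+{
+	xfs_buf_t	*bp;
+	int		error;
+
+	error = xfs_trans_read_buf(mp, tp, target, blkno, len, 0, &bp);
+	if (error)
+		return error;	/* I/O error; buffer already released */
+	if (bp == NULL)
+		return XFS_ERROR(ENOMEM);
+
+	/* ... modify the buffer contents here ... */
+
+	xfs_trans_log_buf(tp, bp, 0, XFS_BUF_COUNT(bp) - 1);
+	return 0;
+}
+#endif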
+
+
+/*
+ * Release the buffer bp which was previously acquired with one of the
+ * xfs_trans_... buffer allocation routines if the buffer has not
+ * been modified within this transaction.  If the buffer is modified
+ * within this transaction, do decrement the recursion count but do
+ * not release the buffer even if the count goes to 0.	If the buffer is not
+ * modified within the transaction, decrement the recursion count and
+ * release the buffer if the recursion count goes to 0.
+ *
+ * If the buffer is to be released and it was not modified before
+ * this transaction began, then free the buf_log_item associated with it.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * brelse() call.
+ */
+void
+xfs_trans_brelse(xfs_trans_t	*tp,
+		 xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+	xfs_log_item_t		*lip;
+	xfs_log_item_desc_t	*lidp;
+
+	/*
+	 * Default to a normal brelse() call if the tp is NULL.
+	 */
+	if (tp == NULL) {
+		ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
+		/*
+		 * If there's a buf log item attached to the buffer,
+		 * then let the AIL know that the buffer is being
+		 * unlocked.
+		 */
+		if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+			lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+			if (lip->li_type == XFS_LI_BUF) {
+				bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
+				xfs_trans_unlocked_item(
+						bip->bli_item.li_mountp,
+						lip);
+			}
+		}
+		xfs_buf_relse(bp);
+		return;
+	}
+
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	/*
+	 * Find the item descriptor pointing to this buffer's
+	 * log item.  It must be there.
+	 */
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
+	ASSERT(lidp != NULL);
+
+	/*
+	 * If the release is just for a recursive lock,
+	 * then decrement the count and return.
+	 */
+	if (bip->bli_recur > 0) {
+		bip->bli_recur--;
+		xfs_buf_item_trace("RELSE RECUR", bip);
+		return;
+	}
+
+	/*
+	 * If the buffer is dirty within this transaction, we can't
+	 * release it until we commit.
+	 */
+	if (lidp->lid_flags & XFS_LID_DIRTY) {
+		xfs_buf_item_trace("RELSE DIRTY", bip);
+		return;
+	}
+
+	/*
+	 * If the buffer has been invalidated, then we can't release
+	 * it until the transaction commits to disk unless it is re-dirtied
+	 * as part of this transaction.	 This prevents us from pulling
+	 * the item from the AIL before we should.
+	 */
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		xfs_buf_item_trace("RELSE STALE", bip);
+		return;
+	}
+
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	xfs_buf_item_trace("RELSE", bip);
+
+	/*
+	 * Free up the log item descriptor tracking the released item.
+	 */
+	xfs_trans_free_item(tp, lidp);
+
+	/*
+	 * Clear the hold flag in the buf log item if it is set.
+	 * We wouldn't want the next user of the buffer to
+	 * get confused.
+	 */
+	if (bip->bli_flags & XFS_BLI_HOLD) {
+		bip->bli_flags &= ~XFS_BLI_HOLD;
+	}
+
+	/*
+	 * Drop our reference to the buf log item.
+	 */
+	atomic_dec(&bip->bli_refcount);
+
+	/*
+	 * If the buf item is not tracking data in the log, then
+	 * we must free it before releasing the buffer back to the
+	 * free pool.  Before releasing the buffer to the free pool,
+	 * clear the transaction pointer in b_fsprivate2 to dissolve
+	 * its relation to this transaction.
+	 */
+	if (!xfs_buf_item_dirty(bip)) {
+/***
+		ASSERT(bp->b_pincount == 0);
+***/
+		ASSERT(atomic_read(&bip->bli_refcount) == 0);
+		ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
+		ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
+		xfs_buf_item_relse(bp);
+		bip = NULL;
+	}
+	XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+
+	/*
+	 * If we've still got a buf log item on the buffer, then
+	 * tell the AIL that the buffer is being unlocked.
+	 */
+	if (bip != NULL) {
+		xfs_trans_unlocked_item(bip->bli_item.li_mountp,
+					(xfs_log_item_t*)bip);
+	}
+
+	xfs_buf_relse(bp);
+	return;
+}
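+
+/*
+ * Illustrative sketch only, not part of the original patch: a
+ * read-only peek at a buffer inside a transaction.  Because nothing
+ * is logged, xfs_trans_brelse() drops the buffer immediately instead
+ * of holding it until commit.  XFS_EXAMPLE_SKETCHES is hypothetical.
+ */
+#ifdef XFS_EXAMPLE_SKETCHES
+STATIC int
+xfs_example_peek(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_buftarg_t	*target,
+	xfs_daddr_t	blkno,
+	int		len)
+{
+	xfs_buf_t	*bp;
+	int		error;
+
+	error = xfs_trans_read_buf(mp, tp, target, blkno, len, 0, &bp);
+	if (error || bp == NULL)
+		return error;
+
+	/* ... examine, but do not modify, the buffer contents ... */
+
+	xfs_trans_brelse(tp, bp);
+	return 0;
+}
+#endif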
+
+/*
+ * Add the locked buffer to the transaction.
+ * The buffer must be locked, and it cannot be associated with any
+ * transaction.
+ *
+ * If the buffer does not yet have a buf log item associated with it,
+ * then allocate one for it.  Then add the buf item to the transaction.
+ */
+void
+xfs_trans_bjoin(xfs_trans_t	*tp,
+		xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
+
+	/*
+	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
+	 * it doesn't have one yet, then allocate one and initialize it.
+	 * The checks to see if one is there are in xfs_buf_item_init().
+	 */
+	xfs_buf_item_init(bp, tp->t_mountp);
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+
+	/*
+	 * Take a reference for this transaction on the buf item.
+	 */
+	atomic_inc(&bip->bli_refcount);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
+
+	/*
+	 * Initialize b_fsprivate2 so we can find it with incore_match()
+	 * in xfs_trans_get_buf() and friends above.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+	xfs_buf_item_trace("BJOIN", bip);
+}
+
+/*
+ * Mark the buffer as not needing to be unlocked when the buf item's
+ * IOP_UNLOCK() routine is called.  The buffer must already be locked
+ * and associated with the given transaction.
+ */
+/* ARGSUSED */
+void
+xfs_trans_bhold(xfs_trans_t	*tp,
+		xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	bip->bli_flags |= XFS_BLI_HOLD;
+	xfs_buf_item_trace("BHOLD", bip);
+}
+
+/*
+ * This function is used to indicate that the buffer should not be
+ * unlocked until the transaction is committed to disk.	 Since we
+ * are going to keep the lock held, make the transaction synchronous
+ * so that the lock is not held too long.
+ *
+ * It uses the log item descriptor flag XFS_LID_SYNC_UNLOCK to
+ * delay the buf item's unlock call until the transaction is
+ * committed to disk or aborted.
+ */
+void
+xfs_trans_bhold_until_committed(xfs_trans_t	*tp,
+				xfs_buf_t	*bp)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
+	ASSERT(lidp != NULL);
+
+	lidp->lid_flags |= XFS_LID_SYNC_UNLOCK;
+	xfs_buf_item_trace("BHOLD UNTIL COMMIT", bip);
+
+	xfs_trans_set_sync(tp);
+}
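+
+/*
+ * Illustrative sketch only, not part of the original patch: the
+ * classic "rolling transaction" use of xfs_trans_bhold() and
+ * xfs_trans_bjoin().  The commit call between the two steps is
+ * elided; XFS_EXAMPLE_SKETCHES and the function name are
+ * hypothetical.
+ */
+#ifdef XFS_EXAMPLE_SKETCHES
+STATIC void
+xfs_example_roll(
+	xfs_trans_t	*old_tp,
+	xfs_trans_t	*new_tp,
+	xfs_buf_t	*bp)
+{
+	/* Keep bp locked when old_tp's buf item is unlocked at commit. */
+	xfs_trans_bhold(old_tp, bp);
+
+	/* ... commit old_tp here; bp stays locked but unjoined ... */
+
+	/* Rejoin the still-locked buffer to the follow-up transaction. */
+	xfs_trans_bjoin(new_tp, bp);
+}
+#endif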
+
+/*
+ * This is called to mark bytes first through last inclusive of the given
+ * buffer as needing to be logged when the transaction is committed.
+ * The buffer must already be associated with the given transaction.
+ *
+ * First and last are numbers relative to the beginning of this buffer,
+ * so the first byte in the buffer is numbered 0 regardless of the
+ * value of b_blkno.
+ */
+void
+xfs_trans_log_buf(xfs_trans_t	*tp,
+		  xfs_buf_t	*bp,
+		  uint		first,
+		  uint		last)
+{
+	xfs_buf_log_item_t	*bip;
+	xfs_log_item_desc_t	*lidp;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+	ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp)));
+	ASSERT((XFS_BUF_IODONE_FUNC(bp) == NULL) ||
+	       (XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks));
+
+	/*
+	 * Mark the buffer as needing to be written out eventually,
+	 * and set its iodone function to remove the buffer's buf log
+	 * item from the AIL and free it when the buffer is flushed
+	 * to disk.  See xfs_buf_attach_iodone() for more details
+	 * on li_cb and xfs_buf_iodone_callbacks().
+	 * If we end up aborting this transaction, we trap this buffer
+	 * inside the b_bdstrat callback so that this won't get written to
+	 * disk.
+	 */
+	XFS_BUF_DELAYWRITE(bp);
+	XFS_BUF_DONE(bp);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
+	bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone;
+
+	/*
+	 * If we invalidated the buffer within this transaction, then
+	 * cancel the invalidation now that we're dirtying the buffer
+	 * again.  There are no races with the code in xfs_buf_item_unpin(),
+	 * because we have a reference to the buffer this entire time.
+	 */
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		xfs_buf_item_trace("BLOG UNSTALE", bip);
+		bip->bli_flags &= ~XFS_BLI_STALE;
+		/* note this will have to change for page_buf interface... unstale isn't really an option RMC */
+		ASSERT(XFS_BUF_ISSTALE(bp));
+		XFS_BUF_UNSTALE(bp);
+		bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
+	}
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+	bip->bli_flags |= XFS_BLI_LOGGED;
+	xfs_buf_item_log(bip, first, last);
+	xfs_buf_item_trace("BLOG", bip);
+}
+
+
+/*
+ * This is called to invalidate a buffer that is being used within
+ * a transaction.  Typically this is because the blocks in the
+ * buffer are being freed, so we need to prevent it from being
+ * written out when we're done.	 Allowing it to be written again
+ * might overwrite data in the free blocks if they are reallocated
+ * to a file.
+ *
+ * We prevent the buffer from being written out by clearing the
+ * B_DELWRI flag.  We can't always
+ * get rid of the buf log item at this point, though, because
+ * the buffer may still be pinned by another transaction.  If that
+ * is the case, then we'll wait until the buffer is committed to
+ * disk for the last time (we can tell by the ref count) and
+ * free it in xfs_buf_item_unpin().  Until it is cleaned up we
+ * will keep the buffer locked so that the buffer and buf log item
+ * are not reused.
+ */
+void
+xfs_trans_binval(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
+	ASSERT(lidp != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		/*
+		 * If the buffer is already invalidated, then
+		 * just return.
+		 */
+		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
+		ASSERT(XFS_BUF_ISSTALE(bp));
+		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
+		ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
+		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
+		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
+		xfs_buftrace("XFS_BINVAL RECUR", bp);
+		xfs_buf_item_trace("BINVAL RECUR", bip);
+		return;
+	}
+
+	/*
+	 * Clear the dirty bit in the buffer and set the STALE flag
+	 * in the buf log item.	 The STALE flag will be used in
+	 * xfs_buf_item_unpin() to determine if it should clean up
+	 * when the last reference to the buf item is given up.
+	 * We set the XFS_BLI_CANCEL flag in the buf log format structure
+	 * and log the buf item.  This will be used at recovery time
+	 * to determine that copies of the buffer in the log before
+	 * this should not be replayed.
+	 * We mark the item descriptor and the transaction dirty so
+	 * that we'll hold the buffer until after the commit.
+	 *
+	 * Since we're invalidating the buffer, we also clear the state
+	 * about which parts of the buffer have been logged.  We also
+	 * clear the flag indicating that this is an inode buffer since
+	 * the data in the buffer will no longer be valid.
+	 *
+	 * We set the stale bit in the buffer as well since we're getting
+	 * rid of it.
+	 */
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_STALE(bp);
+	bip->bli_flags |= XFS_BLI_STALE;
+	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
+	bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
+	bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
+	bzero((char *)(bip->bli_format.blf_data_map),
+	      (bip->bli_format.blf_map_size * sizeof(uint)));
+	lidp->lid_flags |= XFS_LID_DIRTY;
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	xfs_buftrace("XFS_BINVAL", bp);
+	xfs_buf_item_trace("BINVAL", bip);
+}
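+
+/*
+ * Illustrative sketch only, not part of the original patch:
+ * invalidating a buffer whose blocks are being returned to the free
+ * list, e.g. when a btree block is freed.  XFS_EXAMPLE_SKETCHES and
+ * the function name are hypothetical.
+ */
+#ifdef XFS_EXAMPLE_SKETCHES
+STATIC int
+xfs_example_free_block(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_buftarg_t	*target,
+	xfs_daddr_t	blkno,
+	int		len)
+{
+	xfs_buf_t	*bp;
+	int		error;
+
+	error = xfs_trans_read_buf(mp, tp, target, blkno, len, 0, &bp);
+	if (error || bp == NULL)
+		return error;
+
+	/* ... mark the blocks free in the allocator here ... */
+
+	/* Make sure the stale contents never hit the disk again. */
+	xfs_trans_binval(tp, bp);
+	return 0;
+}
+#endif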
+
+/*
+ * This call is used to indicate that the buffer contains on-disk
+ * inodes which must be handled specially during recovery.  They
+ * require special handling because only the di_next_unlinked from
+ * the inodes in the buffer should be recovered.  The rest of the
+ * data in the buffer is logged via the inodes themselves.
+ *
+ * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
+ * format structure so that we'll know what to do at recovery time.
+ */
+/* ARGSUSED */
+void
+xfs_trans_inode_buf(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
+}
+
+
+/*
+ * Mark the buffer as being one which contains newly allocated
+ * inodes.  We need to make sure that even if this buffer is
+ * relogged as an 'inode buf' we still recover all of the inode
+ * images in the face of a crash.  This works in coordination with
+ * xfs_buf_item_committed() to ensure that the buffer remains in the
+ * AIL at its original location even after it has been relogged.
+ */
+/* ARGSUSED */
+void
+xfs_trans_inode_alloc_buf(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
+
+	bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+}
+
+
+/*
+ * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of
+ * dquots. However, unlike in inode buffer recovery, dquot buffers get
+ * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag).
+ * The only thing that makes dquot buffers different from regular
+ * buffers is that we must not replay dquot bufs when recovering
+ * if a _corresponding_ quotaoff has happened. We also have to distinguish
+ * between usr dquot bufs and grp dquot bufs, because usr and grp quotas
+ * can be turned off independently.
+ */
+/* ARGSUSED */
+void
+xfs_trans_dquot_buf(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp,
+	uint		type)
+{
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+	ASSERT(type == XFS_BLI_UDQUOT_BUF ||
+	       type == XFS_BLI_GDQUOT_BUF);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_format.blf_flags |= type;
+}
+
+/*
+ * Check to see if a buffer matching the given parameters is already
+ * a part of the given transaction.  Only check the first, embedded
+ * chunk, since we don't want to spend all day scanning large transactions.
+ */
+STATIC xfs_buf_t *
+xfs_trans_buf_item_match(
+	xfs_trans_t	*tp,
+	xfs_buftarg_t	*target,
+	xfs_daddr_t	blkno,
+	int		len)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_desc_t	*lidp;
+	xfs_buf_log_item_t	*blip;
+	xfs_buf_t		*bp;
+	int			i;
+
+	bp = NULL;
+	len = BBTOB(len);
+	licp = &tp->t_items;
+	if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+		for (i = 0; i < licp->lic_unused; i++) {
+			/*
+			 * Skip unoccupied slots.
+			 */
+			if (XFS_LIC_ISFREE(licp, i)) {
+				continue;
+			}
+
+			lidp = XFS_LIC_SLOT(licp, i);
+			blip = (xfs_buf_log_item_t *)lidp->lid_item;
+			if (blip->bli_item.li_type != XFS_LI_BUF) {
+				continue;
+			}
+
+			bp = blip->bli_buf;
+			if ((XFS_BUF_TARGET_DEV(bp) == target->pbr_dev) &&
+			    (XFS_BUF_ADDR(bp) == blkno) &&
+			    (XFS_BUF_COUNT(bp) == len)) {
+				/*
+				 * We found it.	 Break out and
+				 * return the pointer to the buffer.
+				 */
+				break;
+			} else {
+				bp = NULL;
+			}
+		}
+	}
+	return bp;
+}
+
+/*
+ * Check to see if a buffer matching the given parameters is already
+ * a part of the given transaction.  Check all the chunks, we
+ * want to be thorough.
+ */
+STATIC xfs_buf_t *
+xfs_trans_buf_item_match_all(
+	xfs_trans_t	*tp,
+	xfs_buftarg_t	*target,
+	xfs_daddr_t	blkno,
+	int		len)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_desc_t	*lidp;
+	xfs_buf_log_item_t	*blip;
+	xfs_buf_t		*bp;
+	int			i;
+
+	bp = NULL;
+	len = BBTOB(len);
+	for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
+		if (XFS_LIC_ARE_ALL_FREE(licp)) {
+			ASSERT(licp == &tp->t_items);
+			ASSERT(licp->lic_next == NULL);
+			return NULL;
+		}
+		for (i = 0; i < licp->lic_unused; i++) {
+			/*
+			 * Skip unoccupied slots.
+			 */
+			if (XFS_LIC_ISFREE(licp, i)) {
+				continue;
+			}
+
+			lidp = XFS_LIC_SLOT(licp, i);
+			blip = (xfs_buf_log_item_t *)lidp->lid_item;
+			if (blip->bli_item.li_type != XFS_LI_BUF) {
+				continue;
+			}
+
+			bp = blip->bli_buf;
+			if ((XFS_BUF_TARGET_DEV(bp) == target->pbr_dev) &&
+			    (XFS_BUF_ADDR(bp) == blkno) &&
+			    (XFS_BUF_COUNT(bp) == len)) {
+				/*
+				 * We found it.	 Break out and
+				 * return the pointer to the buffer.
+				 */
+				return bp;
+			}
+		}
+	}
+	return NULL;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_dquot.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_dquot.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_dquot.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_dquot.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,852 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_quota_priv.h>
+
+
+/*
+ * Add the locked dquot to the transaction.
+ * The dquot must be locked, and it cannot be associated with any
+ * transaction.
+ */
+void
+xfs_trans_dqjoin(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp)
+{
+	xfs_dq_logitem_t    *lp;
+
+	ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp));
+	lp = &dqp->q_logitem;
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp));
+
+	/*
+	 * Initialize i_transp so we can later determine if this dquot is
+	 * associated with this transaction.
+	 */
+	dqp->q_transp = tp;
+}
+
+
+/*
+ * This is called to mark the dquot as needing
+ * to be logged when the transaction is committed.  The dquot must
+ * already be associated with the given transaction.
+ * Note that it marks the entire transaction as dirty. In the ordinary
+ * case, this gets called via xfs_trans_commit, after the transaction
+ * is already dirty. However, there's nothing to stop this from getting
+ * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY
+ * flag.
+ */
+void
+xfs_trans_log_dquot(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp)
+{
+	xfs_log_item_desc_t	*lidp;
+
+	ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+}
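+
+/*
+ * Illustrative sketch only, not part of the original patch: directly
+ * modifying and logging a dquot, as a setqlim-style caller would.
+ * XFS_EXAMPLE_SKETCHES and the function name are hypothetical; the
+ * dquot lock is released when the transaction commits.
+ */
+#ifdef XFS_EXAMPLE_SKETCHES
+STATIC void
+xfs_example_log_dquot(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp)
+{
+	xfs_dqlock(dqp);
+	xfs_trans_dqjoin(tp, dqp);
+
+	/* ... update limits or timers in dqp->q_core here ... */
+
+	/* Dirties both the log item descriptor and the transaction. */
+	xfs_trans_log_dquot(tp, dqp);
+}
+#endif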
+
+/*
+ * Carry forward whatever is left of the quota blk reservation to
+ * the spanking new transaction.
+ */
+void
+xfs_trans_dup_dqinfo(
+	xfs_trans_t	*otp,
+	xfs_trans_t	*ntp)
+{
+	xfs_dqtrx_t	*oq, *nq;
+	int		i,j;
+	xfs_dqtrx_t	*oqa, *nqa;
+
+	xfs_trans_alloc_dqinfo(ntp);
+	oqa = otp->t_dqinfo->dqa_usrdquots;
+	nqa = ntp->t_dqinfo->dqa_usrdquots;
+
+	/*
+	 * Because the quota blk reservation is carried forward,
+	 * it is also necessary to carry forward the DQ_DIRTY flag.
+	 */
+	if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
+		ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
+
+	for (j = 0; j < 2; j++) {
+		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			if (oqa[i].qt_dquot == NULL)
+				break;
+			oq = &oqa[i];
+			nq = &nqa[i];
+
+			nq->qt_dquot = oq->qt_dquot;
+			nq->qt_bcount_delta = nq->qt_icount_delta = 0;
+			nq->qt_rtbcount_delta = 0;
+
+			/*
+			 * Transfer whatever is left of the reservations.
+			 */
+			nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
+			oq->qt_blk_res = oq->qt_blk_res_used;
+
+			nq->qt_rtblk_res = oq->qt_rtblk_res -
+				oq->qt_rtblk_res_used;
+			oq->qt_rtblk_res = oq->qt_rtblk_res_used;
+
+			nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used;
+			oq->qt_ino_res = oq->qt_ino_res_used;
+
+		}
+		oqa = otp->t_dqinfo->dqa_grpdquots;
+		nqa = ntp->t_dqinfo->dqa_grpdquots;
+	}
+}
+
+/*
+ * Wrapper around xfs_trans_mod_dquot() to account for both user and
+ * group quotas.
+ */
+void
+xfs_trans_mod_dquot_byino(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		field,
+	long		delta)
+{
+	ASSERT(tp);
+
+	if (tp->t_dqinfo == NULL)
+		xfs_trans_alloc_dqinfo(tp);
+
+	if (XFS_IS_UQUOTA_ON(tp->t_mountp) && ip->i_udquot) {
+		(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
+	}
+	if (XFS_IS_GQUOTA_ON(tp->t_mountp) && ip->i_gdquot) {
+		(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+	}
+}
+
+STATIC xfs_dqtrx_t *
+xfs_trans_get_dqtrx(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp)
+{
+	int		i;
+	xfs_dqtrx_t	*qa;
+
+	for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+		qa = XFS_QM_DQP_TO_DQACCT(tp, dqp);
+
+		if (qa[i].qt_dquot == NULL ||
+		    qa[i].qt_dquot == dqp) {
+			return (&qa[i]);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Make the changes in the transaction structure.
+ * The moral equivalent to xfs_trans_mod_sb().
+ * We don't touch any fields in the dquot, so we don't care
+ * if it's locked or not (most of the time it won't be).
+ */
+void
+xfs_trans_mod_dquot(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp,
+	uint		field,
+	long		delta)
+{
+	xfs_dqtrx_t	*qtrx;
+
+	ASSERT(tp);
+	qtrx = NULL;
+
+	if (tp->t_dqinfo == NULL)
+		xfs_trans_alloc_dqinfo(tp);
+	/*
+	 * Find either the first free slot or the slot that belongs
+	 * to this dquot.
+	 */
+	qtrx = xfs_trans_get_dqtrx(tp, dqp);
+	ASSERT(qtrx);
+	if (qtrx->qt_dquot == NULL)
+		qtrx->qt_dquot = dqp;
+
+	switch (field) {
+
+		/*
+		 * regular disk blk reservation
+		 */
+	      case XFS_TRANS_DQ_RES_BLKS:
+		qtrx->qt_blk_res += (ulong)delta;
+		break;
+
+		/*
+		 * inode reservation
+		 */
+	      case XFS_TRANS_DQ_RES_INOS:
+		qtrx->qt_ino_res += (ulong)delta;
+		break;
+
+		/*
+		 * disk blocks used.
+		 */
+	      case XFS_TRANS_DQ_BCOUNT:
+		if (qtrx->qt_blk_res && delta > 0) {
+			qtrx->qt_blk_res_used += (ulong)delta;
+			ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
+		}
+		qtrx->qt_bcount_delta += delta;
+		break;
+
+	      case XFS_TRANS_DQ_DELBCOUNT:
+		qtrx->qt_delbcnt_delta += delta;
+		break;
+
+		/*
+		 * Inode Count
+		 */
+	      case XFS_TRANS_DQ_ICOUNT:
+		if (qtrx->qt_ino_res && delta > 0) {
+			qtrx->qt_ino_res_used += (ulong)delta;
+			ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+		}
+		qtrx->qt_icount_delta += delta;
+		break;
+
+		/*
+		 * rtblk reservation
+		 */
+	      case XFS_TRANS_DQ_RES_RTBLKS:
+		qtrx->qt_rtblk_res += (ulong)delta;
+		break;
+
+		/*
+		 * rtblk count
+		 */
+	      case XFS_TRANS_DQ_RTBCOUNT:
+		if (qtrx->qt_rtblk_res && delta > 0) {
+			qtrx->qt_rtblk_res_used += (ulong)delta;
+			ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
+		}
+		qtrx->qt_rtbcount_delta += delta;
+		break;
+
+	      case XFS_TRANS_DQ_DELRTBCOUNT:
+		qtrx->qt_delrtb_delta += delta;
+		break;
+
+	      default:
+		ASSERT(0);
+	}
+	tp->t_flags |= XFS_TRANS_DQ_DIRTY;
+}
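+
+/*
+ * Illustrative sketch only, not part of the original patch: how a
+ * caller might accumulate quota deltas once an allocation succeeds.
+ * The block reservation itself was taken earlier via
+ * xfs_trans_dqresv(); XFS_EXAMPLE_SKETCHES and the function name are
+ * hypothetical.
+ */
+#ifdef XFS_EXAMPLE_SKETCHES
+STATIC void
+xfs_example_account(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp,
+	long		nblks)
+{
+	/* Count the blocks actually allocated against the reservation. */
+	xfs_trans_mod_dquot(tp, dqp, XFS_TRANS_DQ_BCOUNT, nblks);
+
+	/* And one freshly allocated inode. */
+	xfs_trans_mod_dquot(tp, dqp, XFS_TRANS_DQ_ICOUNT, 1L);
+}
+#endif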
+
+
+/*
+ * Given an array of dqtrx structures, lock all the dquots associated
+ * and join them to the transaction, provided they have been modified.
+ * We know that the highest number of dquots of one type (usr OR grp)
+ * involved in a transaction is 2, and that usr and grp combined is 3.
+ * So, we don't attempt to make this very generic.
+ */
+STATIC void
+xfs_trans_dqlockedjoin(
+	xfs_trans_t	*tp,
+	xfs_dqtrx_t	*q)
+{
+	ASSERT(q[0].qt_dquot != NULL);
+	if (q[1].qt_dquot == NULL) {
+		xfs_dqlock(q[0].qt_dquot);
+		xfs_trans_dqjoin(tp, q[0].qt_dquot);
+	} else {
+		ASSERT(XFS_QM_TRANS_MAXDQS == 2);
+		xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
+		xfs_trans_dqjoin(tp, q[0].qt_dquot);
+		xfs_trans_dqjoin(tp, q[1].qt_dquot);
+	}
+}
+
+
+/*
+ * Called by xfs_trans_commit() and similar in spirit to
+ * xfs_trans_apply_sb_deltas().
+ * Go through all the dquots belonging to this transaction and modify
+ * the INCORE dquots to reflect the actual usages.
+ * Unreserve just the reservations done by this transaction.
+ * The dquot is still left locked at exit.
+ */
+void
+xfs_trans_apply_dquot_deltas(
+	xfs_trans_t		*tp)
+{
+	int			i, j;
+	xfs_dquot_t		*dqp;
+	xfs_dqtrx_t		*qtrx, *qa;
+	xfs_disk_dquot_t	*d;
+	long			totalbdelta;
+	long			totalrtbdelta;
+
+	ASSERT(tp->t_dqinfo);
+	qa = tp->t_dqinfo->dqa_usrdquots;
+	for (j = 0; j < 2; j++) {
+		if (qa[0].qt_dquot == NULL) {
+			qa = tp->t_dqinfo->dqa_grpdquots;
+			continue;
+		}
+
+		/*
+		 * Lock all of the dquots and join them to the transaction.
+		 */
+		xfs_trans_dqlockedjoin(tp, qa);
+
+		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			qtrx = &qa[i];
+			/*
+			 * The array of dquots is filled
+			 * sequentially, not sparsely.
+			 */
+			if ((dqp = qtrx->qt_dquot) == NULL)
+				break;
+
+			ASSERT(XFS_DQ_IS_LOCKED(dqp));
+			ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+
+			/*
+			 * adjust the actual number of blocks used
+			 */
+			d = &dqp->q_core;
+
+			/*
+			 * The issue here is that sometimes we intentionally
+			 * don't make a blkquota reservation, to be fair to users
+			 * (when the amount is small). On the other hand,
+			 * delayed allocs do make reservations, but that's
+			 * outside of a transaction, so we have no
+			 * idea how much was really reserved.
+			 * So, here we've accumulated delayed allocation blks and
+			 * non-delay blks. The assumption is that the
+			 * delayed ones are always reserved (outside of a
+			 * transaction), and the others may or may not have
+			 * quota reservations.
+			 */
+			totalbdelta = qtrx->qt_bcount_delta +
+				qtrx->qt_delbcnt_delta;
+			totalrtbdelta = qtrx->qt_rtbcount_delta +
+				qtrx->qt_delrtb_delta;
+#ifdef QUOTADEBUG
+			if (totalbdelta < 0)
+				ASSERT(INT_GET(d->d_bcount, ARCH_CONVERT) >=
+				       (xfs_qcnt_t) -totalbdelta);
+
+			if (totalrtbdelta < 0)
+				ASSERT(INT_GET(d->d_rtbcount, ARCH_CONVERT) >=
+				       (xfs_qcnt_t) -totalrtbdelta);
+
+			if (qtrx->qt_icount_delta < 0)
+				ASSERT(INT_GET(d->d_icount, ARCH_CONVERT) >=
+				       (xfs_qcnt_t) -qtrx->qt_icount_delta);
+#endif
+			if (totalbdelta)
+				INT_MOD(d->d_bcount, ARCH_CONVERT, (xfs_qcnt_t)totalbdelta);
+
+			if (qtrx->qt_icount_delta)
+				INT_MOD(d->d_icount, ARCH_CONVERT, (xfs_qcnt_t)qtrx->qt_icount_delta);
+
+			if (totalrtbdelta)
+				INT_MOD(d->d_rtbcount, ARCH_CONVERT, (xfs_qcnt_t)totalrtbdelta);
+
+			/*
+			 * Start/reset the timer(s) if needed.
+			 */
+			xfs_qm_adjust_dqtimers(tp->t_mountp, d);
+
+			dqp->dq_flags |= XFS_DQ_DIRTY;
+			/*
+			 * add this to the list of items to get logged
+			 */
+			xfs_trans_log_dquot(tp, dqp);
+			/*
+			 * Take off what's left of the original reservation.
+			 * In case of delayed allocations, there's no
+			 * reservation that a transaction structure knows of.
+			 */
+			if (qtrx->qt_blk_res != 0) {
+				if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
+					if (qtrx->qt_blk_res >
+					    qtrx->qt_blk_res_used)
+						dqp->q_res_bcount -= (xfs_qcnt_t)
+							(qtrx->qt_blk_res -
+							 qtrx->qt_blk_res_used);
+					else
+						dqp->q_res_bcount -= (xfs_qcnt_t)
+							(qtrx->qt_blk_res_used -
+							 qtrx->qt_blk_res);
+				}
+			} else {
+				/*
+				 * These blks were never reserved, either inside
+				 * a transaction or outside one (in a delayed
+				 * allocation). Also, this isn't always a
+				 * negative number since we sometimes
+				 * deliberately skip quota reservations.
+				 */
+				if (qtrx->qt_bcount_delta) {
+					dqp->q_res_bcount +=
+					      (xfs_qcnt_t)qtrx->qt_bcount_delta;
+				}
+			}
+			/*
+			 * Adjust the RT reservation.
+			 */
+			if (qtrx->qt_rtblk_res != 0) {
+				if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
+					if (qtrx->qt_rtblk_res >
+					    qtrx->qt_rtblk_res_used)
+					       dqp->q_res_rtbcount -= (xfs_qcnt_t)
+						       (qtrx->qt_rtblk_res -
+							qtrx->qt_rtblk_res_used);
+					else
+					       dqp->q_res_rtbcount -= (xfs_qcnt_t)
+						       (qtrx->qt_rtblk_res_used -
+							qtrx->qt_rtblk_res);
+				}
+			} else {
+				if (qtrx->qt_rtbcount_delta)
+					dqp->q_res_rtbcount +=
+					    (xfs_qcnt_t)qtrx->qt_rtbcount_delta;
+			}
+
+			/*
+			 * Adjust the inode reservation.
+			 */
+			if (qtrx->qt_ino_res != 0) {
+				ASSERT(qtrx->qt_ino_res >=
+				       qtrx->qt_ino_res_used);
+				if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
+					dqp->q_res_icount -= (xfs_qcnt_t)
+						(qtrx->qt_ino_res -
+						 qtrx->qt_ino_res_used);
+			} else {
+				if (qtrx->qt_icount_delta)
+					dqp->q_res_icount +=
+					    (xfs_qcnt_t)qtrx->qt_icount_delta;
+			}
+
+
+#ifdef QUOTADEBUG
+			if (qtrx->qt_rtblk_res != 0)
+				printk("RT res %d for 0x%p\n",
+				      (int) qtrx->qt_rtblk_res,
+				      dqp);
+#endif
+			ASSERT(dqp->q_res_bcount >= INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
+			ASSERT(dqp->q_res_icount >= INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
+			ASSERT(dqp->q_res_rtbcount >= INT_GET(dqp->q_core.d_rtbcount, ARCH_CONVERT));
+		}
+		/*
+		 * Do the group quotas next
+		 */
+		qa = tp->t_dqinfo->dqa_grpdquots;
+	}
+}
+
+/*
+ * Release the reservations, and adjust the dquots accordingly.
+ * This is called only when the transaction is being aborted. If by
+ * any chance we have done dquot modifications incore (ie. deltas) already,
+ * we simply throw those away, since that's the expected behavior
+ * when a transaction is curtailed without a commit.
+ */
+void
+xfs_trans_unreserve_and_mod_dquots(
+	xfs_trans_t	*tp)
+{
+	int			i, j;
+	xfs_dquot_t		*dqp;
+	xfs_dqtrx_t		*qtrx, *qa;
+	boolean_t		locked;
+
+	ASSERT(tp->t_dqinfo);
+	qa = tp->t_dqinfo->dqa_usrdquots;
+
+	for (j = 0; j < 2; j++) {
+		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			qtrx = &qa[i];
+			/*
+			 * We assume that the array of dquots is filled
+			 * sequentially, not sparsely.
+			 */
+			if ((dqp = qtrx->qt_dquot) == NULL)
+				break;
+			/*
+			 * Unreserve the original reservation. We don't care
+			 * about the number of blocks used field, or deltas.
+			 * Also we don't bother to zero the fields.
+			 */
+			locked = B_FALSE;
+			if (qtrx->qt_blk_res) {
+				xfs_dqlock(dqp);
+				locked = B_TRUE;
+				dqp->q_res_bcount -=
+					(xfs_qcnt_t)qtrx->qt_blk_res;
+			}
+			if (qtrx->qt_ino_res) {
+				if (!locked) {
+					xfs_dqlock(dqp);
+					locked = B_TRUE;
+				}
+				dqp->q_res_icount -=
+					(xfs_qcnt_t)qtrx->qt_ino_res;
+			}
+
+			if (qtrx->qt_rtblk_res) {
+				if (!locked) {
+					xfs_dqlock(dqp);
+					locked = B_TRUE;
+				}
+				dqp->q_res_rtbcount -=
+					(xfs_qcnt_t)qtrx->qt_rtblk_res;
+			}
+			if (locked)
+				xfs_dqunlock(dqp);
+
+		}
+		qa = tp->t_dqinfo->dqa_grpdquots;
+	}
+}
+
+/*
+ * This reserves disk blocks and inodes against a dquot.
+ * Flags indicate if the dquot is to be locked here and also
+ * if the blk reservation is for RT or regular blocks.
+ * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check.
+ * Returns EDQUOT if quota is exceeded.
+ */
+STATIC int
+xfs_trans_dqresv(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp,
+	long		nblks,
+	long		ninos,
+	uint		flags)
+{
+	int		error;
+	xfs_qcnt_t	hardlimit;
+	xfs_qcnt_t	softlimit;
+	time_t		btimer;
+	xfs_qcnt_t	*resbcountp;
+
+	if (! (flags & XFS_QMOPT_DQLOCK)) {
+		xfs_dqlock(dqp);
+	}
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	if (flags & XFS_TRANS_DQ_RES_BLKS) {
+		hardlimit = INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT);
+		softlimit = INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT);
+		btimer = INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT);
+		resbcountp = &dqp->q_res_bcount;
+	} else {
+		ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
+		hardlimit = INT_GET(dqp->q_core.d_rtb_hardlimit, ARCH_CONVERT);
+		softlimit = INT_GET(dqp->q_core.d_rtb_softlimit, ARCH_CONVERT);
+		btimer = INT_GET(dqp->q_core.d_rtbtimer, ARCH_CONVERT);
+		resbcountp = &dqp->q_res_rtbcount;
+	}
+	error = 0;
+
+	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
+	    !INT_ISZERO(dqp->q_core.d_id, ARCH_CONVERT) &&
+	    XFS_IS_QUOTA_ENFORCED(dqp->q_mount)) {
+#ifdef QUOTADEBUG
+		printk("BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?\n",
+			nblks, *resbcountp, hardlimit);
+#endif
+		if (nblks > 0) {
+			/*
+			 * dquot is locked already. See if we'd go over the
+			 * hardlimit or exceed the timelimit if we allocate
+			 * nblks.
+			 */
+			if (hardlimit > 0ULL &&
+			     (hardlimit <= nblks + *resbcountp)) {
+				error = EDQUOT;
+				goto error_return;
+			}
+
+			if (softlimit > 0ULL &&
+			     (softlimit <= nblks + *resbcountp)) {
+				/*
+				 * If the timer has expired or the warning
+				 * limit has been reached, return EDQUOT.
+				 */
+				if ((btimer != 0 && CURRENT_TIME > btimer) ||
+				    (!INT_ISZERO(dqp->q_core.d_bwarns, ARCH_CONVERT) &&
+				     INT_GET(dqp->q_core.d_bwarns, ARCH_CONVERT) >=
+				     XFS_QI_BWARNLIMIT(dqp->q_mount))) {
+					error = EDQUOT;
+					goto error_return;
+				}
+			}
+		}
+		if (ninos > 0) {
+			if (INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT) > 0ULL &&
+			    INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >=
+			    INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT)) {
+				error = EDQUOT;
+				goto error_return;
+			} else if (INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT) > 0ULL &&
+				   INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >=
+				   INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT)) {
+				/*
+				 * If the timer has expired or the warning
+				 * limit has been reached, return EDQUOT.
+				 */
+				if ((!INT_ISZERO(dqp->q_core.d_itimer, ARCH_CONVERT) &&
+				     CURRENT_TIME > INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT)) ||
+				    (!INT_ISZERO(dqp->q_core.d_iwarns, ARCH_CONVERT) &&
+				     INT_GET(dqp->q_core.d_iwarns, ARCH_CONVERT) >=
+				     XFS_QI_IWARNLIMIT(dqp->q_mount))) {
+					error = EDQUOT;
+					goto error_return;
+				}
+			}
+		}
+	}
+
+	/*
+	 * Change the reservation, but not the actual usage.
+	 * Note that q_res_bcount = q_core.d_bcount + resv
+	 */
+	(*resbcountp) += (xfs_qcnt_t)nblks;
+	if (ninos != 0)
+		dqp->q_res_icount += (xfs_qcnt_t)ninos;
+
+	/*
+	 * note the reservation amt in the trans struct too,
+	 * so that the transaction knows how much was reserved by
+	 * it against this particular dquot.
+	 * We don't do this when we are reserving for a delayed allocation,
+	 * because we don't have the luxury of a transaction envelope then.
+	 */
+	if (tp) {
+		ASSERT(tp->t_dqinfo);
+		ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+		if (nblks != 0)
+			xfs_trans_mod_dquot(tp, dqp,
+					    flags & XFS_QMOPT_RESBLK_MASK,
+					    nblks);
+		if (ninos != 0)
+			xfs_trans_mod_dquot(tp, dqp,
+					    XFS_TRANS_DQ_RES_INOS,
+					    ninos);
+	}
+	ASSERT(dqp->q_res_bcount >= INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
+	ASSERT(dqp->q_res_rtbcount >= INT_GET(dqp->q_core.d_rtbcount, ARCH_CONVERT));
+	ASSERT(dqp->q_res_icount >= INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
+
+error_return:
+	if (! (flags & XFS_QMOPT_DQLOCK)) {
+		xfs_dqunlock(dqp);
+	}
+	return (error);
+}
+
+
+/*
+ * Given a dquot(s), make disk block and/or inode reservations against them.
+ * The fact that this does the reservation against both the usr and
+ * grp quotas is important, because this follows a both-or-nothing
+ * approach.
+ *
+ * flags = XFS_QMOPT_DQLOCK indicates whether the dquot(s) need to be locked.
+ *	   XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
+ *	   XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
+ *	   XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
+ * dquots are unlocked on return, if they were not locked by caller.
+ */
+int
+xfs_trans_reserve_quota_bydquots(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*udqp,
+	xfs_dquot_t	*gdqp,
+	long		nblks,
+	long		ninos,
+	uint		flags)
+{
+	int		resvd;
+
+	if (tp && tp->t_dqinfo == NULL)
+		xfs_trans_alloc_dqinfo(tp);
+
+	ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+	resvd = 0;
+
+	if (udqp) {
+		if (xfs_trans_dqresv(tp, udqp, nblks, ninos, flags))
+			return (EDQUOT);
+		resvd = 1;
+	}
+
+	if (gdqp) {
+		if (xfs_trans_dqresv(tp, gdqp, nblks, ninos, flags)) {
+			/*
+			 * can't do it, so backout previous reservation
+			 */
+			if (resvd) {
+				xfs_trans_dqresv(tp, udqp,  -nblks, -ninos,
+						 flags);
+			}
+			return (EDQUOT);
+		}
+	}
+
+	/*
+	 * Didn't change anything critical, so there's no need to log.
+	 */
+	return (0);
+}
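+
+/*
+ * Illustrative sketch only, not part of the original patch:
+ * reserving regular disk blocks against both of an inode's dquots
+ * before an allocation.  No manual backout is needed on failure
+ * because the routine above already undoes the usr reservation.
+ * XFS_EXAMPLE_SKETCHES and the function name are hypothetical.
+ */
+#ifdef XFS_EXAMPLE_SKETCHES
+STATIC int
+xfs_example_reserve(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*udqp,
+	xfs_dquot_t	*gdqp,
+	long		nblks)
+{
+	/* Both-or-nothing: EDQUOT here means neither dquot was charged. */
+	return xfs_trans_reserve_quota_bydquots(tp, udqp, gdqp,
+						nblks, 0L,
+						XFS_TRANS_DQ_RES_BLKS);
+}
+#endif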
+
+
+/*
+ * Lock the dquot and change the reservation if we can.
+ * This doesn't change the actual usage, just the reservation.
+ * The inode sent in is locked.
+ *
+ * Returns 0 on success, EDQUOT or other errors otherwise
+ */
+int
+xfs_trans_reserve_quota_nblks(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	long		nblks,
+	long		ninos,
+	uint		type)
+{
+	int error;
+
+#ifdef QUOTADEBUG
+	if (ip->i_udquot)
+		ASSERT(! XFS_DQ_IS_LOCKED(ip->i_udquot));
+	if (ip->i_gdquot)
+		ASSERT(! XFS_DQ_IS_LOCKED(ip->i_gdquot));
+#endif
+
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
+	ASSERT((type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_RTBLKS ||
+	       (type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_BLKS);
+
+	/*
+	 * Reserve nblks against these dquots, with trans as the mediator.
+	 */
+	error = xfs_trans_reserve_quota_bydquots(tp,
+						 ip->i_udquot, ip->i_gdquot,
+						 nblks, ninos,
+						 type);
+	return (error);
+}
+
+/*
+ * This routine is called to allocate a quotaoff log item.
+ */
+xfs_qoff_logitem_t *
+xfs_trans_get_qoff_item(
+	xfs_trans_t		*tp,
+	xfs_qoff_logitem_t	*startqoff,
+	uint			flags)
+{
+	xfs_qoff_logitem_t	*q;
+
+	ASSERT(tp != NULL);
+
+	q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
+	ASSERT(q != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)q);
+
+	return (q);
+}
+
+
+/*
+ * This is called to mark the quotaoff logitem as needing
+ * to be logged when the transaction is committed.  The logitem must
+ * already be associated with the given transaction.
+ */
+void
+xfs_trans_log_quotaoff_item(
+	xfs_trans_t		*tp,
+	xfs_qoff_logitem_t	*qlp)
+{
+	xfs_log_item_desc_t	*lidp;
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp);
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+}
+
+void
+xfs_trans_alloc_dqinfo(
+	xfs_trans_t	*tp)
+{
+	(tp)->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
+}
+
+void
+xfs_trans_free_dqinfo(
+	xfs_trans_t	*tp)
+{
+	kmem_zone_free(xfs_Gqm->qm_dqtrxzone, (tp)->t_dqinfo);
+	(tp)->t_dqinfo = NULL;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_extfree.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_extfree.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_extfree.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_extfree.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+/*
+ * This routine is called to allocate an "extent free intention"
+ * log item that will hold nextents worth of extents.  The
+ * caller must use all nextents extents, because we are not
+ * flexible about this at all.
+ */
+xfs_efi_log_item_t *
+xfs_trans_get_efi(xfs_trans_t	*tp,
+		  uint		nextents)
+{
+	xfs_efi_log_item_t	*efip;
+
+	ASSERT(tp != NULL);
+	ASSERT(nextents > 0);
+
+	efip = xfs_efi_init(tp->t_mountp, nextents);
+	ASSERT(efip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip);
+
+	return (efip);
+}
+
+/*
+ * This routine is called to indicate that the described
+ * extent is to be logged as needing to be freed.  It should
+ * be called once for each extent to be freed.
+ */
+void
+xfs_trans_log_efi_extent(xfs_trans_t		*tp,
+			 xfs_efi_log_item_t	*efip,
+			 xfs_fsblock_t		start_block,
+			 xfs_extlen_t		ext_len)
+{
+	xfs_log_item_desc_t	*lidp;
+	uint			next_extent;
+	xfs_extent_t		*extp;
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip);
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+
+	next_extent = efip->efi_next_extent;
+	ASSERT(next_extent < efip->efi_format.efi_nextents);
+	extp = &(efip->efi_format.efi_extents[next_extent]);
+	extp->ext_start = start_block;
+	extp->ext_len = ext_len;
+	efip->efi_next_extent++;
+}
+
+
+/*
+ * This routine is called to allocate an "extent free done"
+ * log item that will hold nextents worth of extents.  The
+ * caller must use all nextents extents, because we are not
+ * flexible about this at all.
+ */
+xfs_efd_log_item_t *
+xfs_trans_get_efd(xfs_trans_t		*tp,
+		  xfs_efi_log_item_t	*efip,
+		  uint			nextents)
+{
+	xfs_efd_log_item_t	*efdp;
+
+	ASSERT(tp != NULL);
+	ASSERT(nextents > 0);
+
+	efdp = xfs_efd_init(tp->t_mountp, efip, nextents);
+	ASSERT(efdp != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp);
+
+	return (efdp);
+}
+
+/*
+ * This routine is called to indicate that the described
+ * extent is to be logged as having been freed.	 It should
+ * be called once for each extent freed.
+ */
+void
+xfs_trans_log_efd_extent(xfs_trans_t		*tp,
+			 xfs_efd_log_item_t	*efdp,
+			 xfs_fsblock_t		start_block,
+			 xfs_extlen_t		ext_len)
+{
+	xfs_log_item_desc_t	*lidp;
+	uint			next_extent;
+	xfs_extent_t		*extp;
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp);
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+
+	next_extent = efdp->efd_next_extent;
+	ASSERT(next_extent < efdp->efd_format.efd_nextents);
+	extp = &(efdp->efd_format.efd_extents[next_extent]);
+	extp->ext_start = start_block;
+	extp->ext_len = ext_len;
+	efdp->efd_next_extent++;
+}
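+
+/*
+ * Illustrative sketch only, not part of the original patch: the
+ * intent/done pairing for freeing one extent across two
+ * transactions, in the style of xfs_bmap_finish() callers.  The
+ * commit between the two steps is elided; XFS_EXAMPLE_SKETCHES and
+ * the function name are hypothetical.
+ */
+#ifdef XFS_EXAMPLE_SKETCHES
+STATIC void
+xfs_example_efi_efd(
+	xfs_trans_t		*intent_tp,
+	xfs_trans_t		*done_tp,
+	xfs_fsblock_t		start_block,
+	xfs_extlen_t		ext_len)
+{
+	xfs_efi_log_item_t	*efip;
+	xfs_efd_log_item_t	*efdp;
+
+	/* Transaction 1: log the intention to free the extent. */
+	efip = xfs_trans_get_efi(intent_tp, 1);
+	xfs_trans_log_efi_extent(intent_tp, efip, start_block, ext_len);
+
+	/* ... commit intent_tp, then actually free the extent ... */
+
+	/* Transaction 2: log that the free has been done. */
+	efdp = xfs_trans_get_efd(done_tp, efip, 1);
+	xfs_trans_log_efd_extent(done_tp, efdp, start_block, ext_len);
+}
+#endif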
+
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_inode.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_inode.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_inode.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_inode.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+#ifdef XFS_TRANS_DEBUG
+STATIC void
+xfs_trans_inode_broot_debug(
+	xfs_inode_t	*ip);
+#else
+#define xfs_trans_inode_broot_debug(ip)
+#endif
+
+
+/*
+ * Get and lock the inode for the caller if it is not already
+ * locked within the given transaction.	 If it is already locked
+ * within the transaction, just increment its lock recursion count
+ * and return a pointer to it.
+ *
+ * For an inode to be locked in a transaction, the inode lock, as
+ * opposed to the io lock, must be taken exclusively.  This ensures
+ * that the inode can be involved in only 1 transaction at a time.
+ * Lock recursion is handled on the io lock, but only for lock modes
+ * of equal or lesser strength.	 That is, you can recur on the io lock
+ * held EXCL with a SHARED request but not vice versa.	Also, if
+ * the inode is already a part of the transaction then you cannot
+ * go from not holding the io lock to having it EXCL or SHARED.
+ *
+ * Use the inode cache routine xfs_inode_incore() to find the inode
+ * if it is already owned by this transaction.
+ *
+ * If we don't already own the inode, use xfs_iget() to get it.
+ * Since the inode log item structure is embedded in the incore
+ * inode structure and is initialized when the inode is brought
+ * into memory, there is nothing to do with it here.
+ *
+ * If the given transaction pointer is NULL, just call xfs_iget().
+ * This simplifies code which must handle both cases.
+ */
+int
+xfs_trans_iget(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	uint		lock_flags,
+	xfs_inode_t	**ipp)
+{
+	int			error;
+	xfs_inode_t		*ip;
+	xfs_inode_log_item_t	*iip;
+
+	/*
+	 * If the transaction pointer is NULL, just call the normal
+	 * xfs_iget().
+	 */
+	if (tp == NULL) {
+		return (xfs_iget(mp, NULL, ino, lock_flags, ipp, 0));
+	}
+
+	/*
+	 * If we find the inode in core with this transaction
+	 * pointer in its i_transp field, then we know we already
+	 * have it locked.  In this case we just increment the lock
+	 * recursion count and return the inode to the caller.
+	 * Assert that the inode is already locked in the mode requested
+	 * by the caller.  We cannot do lock promotions yet, so
+	 * die if someone gets this wrong.
+	 */
+	if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
+		/*
+		 * Make sure that the inode lock is held EXCL and
+		 * that the io lock is never upgraded when the inode
+		 * is already a part of the transaction.
+		 */
+		ASSERT(ip->i_itemp != NULL);
+		ASSERT(lock_flags & XFS_ILOCK_EXCL);
+		ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
+		       ismrlocked(&ip->i_iolock, MR_UPDATE));
+		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
+		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
+		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
+		       ismrlocked(&ip->i_iolock, (MR_UPDATE | MR_ACCESS)));
+		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
+		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
+
+		if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
+			ip->i_itemp->ili_iolock_recur++;
+		}
+		if (lock_flags & XFS_ILOCK_EXCL) {
+			ip->i_itemp->ili_ilock_recur++;
+		}
+		*ipp = ip;
+		return 0;
+	}
+
+	ASSERT(lock_flags & XFS_ILOCK_EXCL);
+	error = xfs_iget(tp->t_mountp, tp, ino, lock_flags, &ip, 0);
+	if (error) {
+		return error;
+	}
+	ASSERT(ip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	if (ip->i_itemp == NULL)
+		xfs_inode_item_init(ip, mp);
+	iip = ip->i_itemp;
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip));
+
+	xfs_trans_inode_broot_debug(ip);
+
+	/*
+	 * If the IO lock has been acquired, mark that in
+	 * the inode log item so we'll know to unlock it
+	 * when the transaction commits.
+	 */
+	ASSERT(iip->ili_flags == 0);
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
+	}
+
+	/*
+	 * Initialize i_transp so we can find it with xfs_inode_incore()
+	 * above.
+	 */
+	ip->i_transp = tp;
+
+	*ipp = ip;
+	return 0;
+}
+
+
+/*
+ * Release the inode ip which was previously acquired with xfs_trans_iget()
+ * or added with xfs_trans_ijoin(). This will decrement the lock
+ * recursion count of the inode item.  If the count goes to less than 0,
+ * the inode will be unlocked and disassociated from the transaction.
+ *
+ * If the inode has been modified within the transaction, it will not be
+ * unlocked until the transaction commits.
+ */
+void
+xfs_trans_iput(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		lock_flags)
+{
+	xfs_inode_log_item_t	*iip;
+	xfs_log_item_desc_t	*lidp;
+
+	/*
+	 * If the transaction pointer is NULL, just call xfs_iput().
+	 */
+	if (tp == NULL) {
+		xfs_iput(ip, lock_flags);
+		return;
+	}
+
+	ASSERT(ip->i_transp == tp);
+	iip = ip->i_itemp;
+	ASSERT(iip != NULL);
+
+	/*
+	 * Find the item descriptor pointing to this inode's
+	 * log item.  It must be there.
+	 */
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)iip);
+	ASSERT(lidp != NULL);
+	ASSERT(lidp->lid_item == (xfs_log_item_t*)iip);
+
+	/*
+	 * Be consistent about the bookkeeping for the inode's
+	 * io lock, but it doesn't mean much really.
+	 */
+	ASSERT((iip->ili_flags & XFS_ILI_IOLOCKED_ANY) != XFS_ILI_IOLOCKED_ANY);
+	if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
+		ASSERT(iip->ili_flags & XFS_ILI_IOLOCKED_ANY);
+		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
+		       (iip->ili_flags & XFS_ILI_IOLOCKED_EXCL));
+		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
+		       (iip->ili_flags &
+			(XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)));
+		if (iip->ili_iolock_recur > 0) {
+			iip->ili_iolock_recur--;
+		}
+	}
+
+	/*
+	 * If the release is just for a recursive lock on the inode lock,
+	 * then decrement the count and return.	 We can assert that
+	 * the caller is dropping an EXCL lock on the inode, because
+	 * inode must be locked EXCL within transactions.
+	 */
+	ASSERT(lock_flags & XFS_ILOCK_EXCL);
+	if (iip->ili_ilock_recur > 0) {
+		iip->ili_ilock_recur--;
+		return;
+	}
+	ASSERT(iip->ili_iolock_recur == 0);
+
+	/*
+	 * If the inode was dirtied within this transaction, it cannot
+	 * be released until the transaction commits.
+	 */
+	if (lidp->lid_flags & XFS_LID_DIRTY) {
+		return;
+	}
+
+	xfs_trans_free_item(tp, lidp);
+
+	/*
+	 * Clear the hold and iolocked flags in the inode log item.
+	 * We wouldn't want the next user of the inode to
+	 * get confused.  Assert that if the iolocked flag is set
+	 * in the item then we are unlocking it in the call to xfs_iput()
+	 * below.
+	 */
+	ASSERT((!(iip->ili_flags & XFS_ILI_IOLOCKED_ANY)) ||
+	       (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)));
+	if (iip->ili_flags & (XFS_ILI_HOLD | XFS_ILI_IOLOCKED_ANY)) {
+		iip->ili_flags &= ~(XFS_ILI_HOLD | XFS_ILI_IOLOCKED_ANY);
+	}
+
+	/*
+	 * Unlike xfs_brelse() the inode log item cannot be
+	 * freed, because it is embedded within the inode.
+	 * All we have to do is release the inode.
+	 */
+	xfs_iput(ip, lock_flags);
+	return;
+}
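+
+/*
+ * Illustrative sketch (not part of the original code): typical pairing
+ * of xfs_trans_iget() and xfs_trans_iput() within a transaction, with
+ * error handling elided:
+ *
+ *	error = xfs_trans_iget(mp, tp, ino, XFS_ILOCK_EXCL, &ip);
+ *	... examine or modify the inode ...
+ *	xfs_trans_iput(tp, ip, XFS_ILOCK_EXCL);
+ *
+ * If the inode was dirtied via xfs_trans_log_inode(), the final
+ * xfs_trans_iput() leaves it locked until the transaction commits.
+ */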
+
+
+/*
+ * Add the locked inode to the transaction.
+ * The inode must be locked, and it cannot be associated with any
+ * transaction.	 The caller must specify the locks already held
+ * on the inode.
+ */
+void
+xfs_trans_ijoin(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		lock_flags)
+{
+	xfs_inode_log_item_t	*iip;
+
+	ASSERT(ip->i_transp == NULL);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+	ASSERT(lock_flags & XFS_ILOCK_EXCL);
+	if (ip->i_itemp == NULL)
+		xfs_inode_item_init(ip, ip->i_mount);
+	iip = ip->i_itemp;
+	ASSERT(iip->ili_flags == 0);
+	ASSERT(iip->ili_ilock_recur == 0);
+	ASSERT(iip->ili_iolock_recur == 0);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip));
+
+	xfs_trans_inode_broot_debug(ip);
+
+	/*
+	 * If the IO lock is already held, mark that in the inode log item.
+	 */
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
+	}
+
+	/*
+	 * Initialize i_transp so we can find it with xfs_inode_incore()
+	 * in xfs_trans_iget() above.
+	 */
+	ip->i_transp = tp;
+}
+
+
+
+/*
+ * Mark the inode as not needing to be unlocked when the inode item's
+ * IOP_UNLOCK() routine is called.  The inode must already be locked
+ * and associated with the given transaction.
+ */
+/*ARGSUSED*/
+void
+xfs_trans_ihold(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	ASSERT(ip->i_transp == tp);
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+
+	ip->i_itemp->ili_flags |= XFS_ILI_HOLD;
+}
+
+/*
+ * Cancel the previous inode hold request made on this inode
+ * for this transaction.
+ */
+/*ARGSUSED*/
+void
+xfs_trans_ihold_release(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	ASSERT(ip->i_transp == tp);
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+	ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
+
+	ip->i_itemp->ili_flags &= ~XFS_ILI_HOLD;
+}
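+
+/*
+ * Illustrative sketch (not part of the original code): holding an
+ * inode across a commit.  The hold keeps the inode locked when the
+ * transaction commits, so the caller unlocks it explicitly afterwards:
+ *
+ *	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ *	xfs_trans_ihold(tp, ip);
+ *	error = xfs_trans_commit(tp, 0, NULL);
+ *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ */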
+
+
+/*
+ * This is called to mark the fields indicated in fieldmask as needing
+ * to be logged when the transaction is committed.  The inode must
+ * already be associated with the given transaction.
+ *
+ * The values for fieldmask are defined in xfs_inode_item.h.  We always
+ * log all of the core inode if any of it has changed, and we always log
+ * all of the inline data/extents/b-tree root if any of them has changed.
+ */
+void
+xfs_trans_log_inode(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		flags)
+{
+	xfs_log_item_desc_t	*lidp;
+
+	ASSERT(ip->i_transp == tp);
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp));
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * Always OR in the bits from the ili_last_fields field.
+	 * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
+	 * routines in the eventual clearing of the ilf_fields bits.
+	 * See the big comment in xfs_iflush() for an explanation of
+	 * this coordination mechanism.
+	 */
+	flags |= ip->i_itemp->ili_last_fields;
+	ip->i_itemp->ili_format.ilf_fields |= flags;
+}
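+
+/*
+ * Illustrative sketch (not part of the original code): the canonical
+ * join-modify-log sequence for an inode the caller has already locked:
+ *
+ *	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ *	ip->i_d.di_nlink++;
+ *	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ */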
+
+#ifdef XFS_TRANS_DEBUG
+/*
+ * Keep track of the state of the inode btree root to make sure we
+ * log it properly.
+ */
+STATIC void
+xfs_trans_inode_broot_debug(
+	xfs_inode_t	*ip)
+{
+	xfs_inode_log_item_t	*iip;
+
+	ASSERT(ip->i_itemp != NULL);
+	iip = ip->i_itemp;
+	if (iip->ili_root_size != 0) {
+		ASSERT(iip->ili_orig_root != NULL);
+		kmem_free(iip->ili_orig_root, iip->ili_root_size);
+		iip->ili_root_size = 0;
+		iip->ili_orig_root = NULL;
+	}
+	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		ASSERT((ip->i_df.if_broot != NULL) &&
+		       (ip->i_df.if_broot_bytes > 0));
+		iip->ili_root_size = ip->i_df.if_broot_bytes;
+		iip->ili_orig_root =
+			(char*)kmem_alloc(iip->ili_root_size, KM_SLEEP);
+		bcopy((char*)(ip->i_df.if_broot), iip->ili_orig_root,
+		      iip->ili_root_size);
+	}
+}
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_item.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_item.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_item.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_item.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,557 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+STATIC int	xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
+					int, int, xfs_lsn_t);
+
+/*
+ * This is called to add the given log item to the transaction's
+ * list of log items.  It must find a free log item descriptor
+ * or allocate a new one and add the item to that descriptor.
+ * The function returns a pointer to the item descriptor used to point
+ * to the new item.  The log item will now point to its new descriptor
+ * with its li_desc field.
+ */
+xfs_log_item_desc_t *
+xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_item_chunk_t	*licp;
+	int			i=0;
+
+	/*
+	 * If there are no free descriptors, allocate a new chunk
+	 * of them and put it at the front of the chunk list.
+	 */
+	if (tp->t_items_free == 0) {
+		licp = (xfs_log_item_chunk_t*)
+		       kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP);
+		ASSERT(licp != NULL);
+		/*
+		 * Initialize the chunk, and then
+		 * claim the first slot in the newly allocated chunk.
+		 */
+		XFS_LIC_INIT(licp);
+		XFS_LIC_CLAIM(licp, 0);
+		licp->lic_unused = 1;
+		XFS_LIC_INIT_SLOT(licp, 0);
+		lidp = XFS_LIC_SLOT(licp, 0);
+
+		/*
+		 * Link in the new chunk and update the free count.
+		 */
+		licp->lic_next = tp->t_items.lic_next;
+		tp->t_items.lic_next = licp;
+		tp->t_items_free = XFS_LIC_NUM_SLOTS - 1;
+
+		/*
+		 * Initialize the descriptor and the generic portion
+		 * of the log item.
+		 *
+		 * Point the new slot at this item and return it.
+		 * Also point the log item at its currently active
+		 * descriptor and set the item's mount pointer.
+		 */
+		lidp->lid_item = lip;
+		lidp->lid_flags = 0;
+		lidp->lid_size = 0;
+		lip->li_desc = lidp;
+		lip->li_mountp = tp->t_mountp;
+		return (lidp);
+	}
+
+	/*
+	 * Find the free descriptor. It is somewhere in the chunklist
+	 * of descriptors.
+	 */
+	licp = &tp->t_items;
+	while (licp != NULL) {
+		if (XFS_LIC_VACANCY(licp)) {
+			if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
+				i = licp->lic_unused;
+				ASSERT(XFS_LIC_ISFREE(licp, i));
+				break;
+			}
+			for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
+				if (XFS_LIC_ISFREE(licp, i))
+					break;
+			}
+			ASSERT(i <= XFS_LIC_MAX_SLOT);
+			break;
+		}
+		licp = licp->lic_next;
+	}
+	ASSERT(licp != NULL);
+	/*
+	 * If we find a free descriptor, claim it,
+	 * initialize it, and return it.
+	 */
+	XFS_LIC_CLAIM(licp, i);
+	if (licp->lic_unused <= i) {
+		licp->lic_unused = i + 1;
+		XFS_LIC_INIT_SLOT(licp, i);
+	}
+	lidp = XFS_LIC_SLOT(licp, i);
+	tp->t_items_free--;
+	lidp->lid_item = lip;
+	lidp->lid_flags = 0;
+	lidp->lid_size = 0;
+	lip->li_desc = lidp;
+	lip->li_mountp = tp->t_mountp;
+	return (lidp);
+}
+
+/*
+ * Free the given descriptor.
+ *
+ * This requires setting the bit in the chunk's free mask corresponding
+ * to the given slot.
+ */
+void
+xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
+{
+	uint			slot;
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_chunk_t	**licpp;
+
+	slot = XFS_LIC_DESC_TO_SLOT(lidp);
+	licp = XFS_LIC_DESC_TO_CHUNK(lidp);
+	XFS_LIC_RELSE(licp, slot);
+	lidp->lid_item->li_desc = NULL;
+	tp->t_items_free++;
+
+	/*
+	 * If there are no more used items in the chunk and this is not
+	 * the chunk embedded in the transaction structure, then free
+	 * the chunk. First pull it from the chunk list and then
+	 * free it back to the heap.  We didn't bother with a doubly
+	 * linked list here because the lists should be very short
+	 * and this is not a performance path.	It's better to save
+	 * the memory of the extra pointer.
+	 *
+	 * Also decrement the transaction structure's count of free items
+	 * by the number in a chunk since we are freeing an empty chunk.
+	 */
+	if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) {
+		licpp = &(tp->t_items.lic_next);
+		while (*licpp != licp) {
+			ASSERT(*licpp != NULL);
+			licpp = &((*licpp)->lic_next);
+		}
+		*licpp = licp->lic_next;
+		kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+		tp->t_items_free -= XFS_LIC_NUM_SLOTS;
+	}
+}
+
+/*
+ * This is called to find the descriptor corresponding to the given
+ * log item.  It returns a pointer to the descriptor.
+ * The log item MUST have a corresponding descriptor in the given
+ * transaction.  This routine does not return NULL; it asserts
+ * if the descriptor is missing.
+ *
+ * The descriptor pointer is kept in the log item's li_desc field.
+ * Just return it.
+ */
+/*ARGSUSED*/
+xfs_log_item_desc_t *
+xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip)
+{
+	ASSERT(lip->li_desc != NULL);
+
+	return (lip->li_desc);
+}
+
+
+/*
+ * Return a pointer to the first descriptor in the chunk list.
+ * If there are none (which should never happen), it logs a
+ * warning and returns NULL.
+ *
+ * The first descriptor must be in either the first or second chunk.
+ * This is because the only chunk allowed to be empty is the first.
+ * All others are freed when they become empty.
+ *
+ * At some point this and xfs_trans_next_item() should be optimized
+ * to quickly look at the mask to determine if there is anything to
+ * look at.
+ */
+xfs_log_item_desc_t *
+xfs_trans_first_item(xfs_trans_t *tp)
+{
+	xfs_log_item_chunk_t	*licp;
+	int			i;
+
+	licp = &tp->t_items;
+	/*
+	 * If it's not in the first chunk, skip to the second.
+	 */
+	if (XFS_LIC_ARE_ALL_FREE(licp)) {
+		licp = licp->lic_next;
+	}
+
+	/*
+	 * Return the first non-free descriptor in the chunk.
+	 */
+	ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+	for (i = 0; i < licp->lic_unused; i++) {
+		if (XFS_LIC_ISFREE(licp, i)) {
+			continue;
+		}
+
+		return (XFS_LIC_SLOT(licp, i));
+	}
+	cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
+	return(NULL);
+}
+
+
+/*
+ * Given a descriptor, return the next descriptor in the chunk list.
+ * This returns NULL if there are no more used descriptors in the list.
+ *
+ * We do this by first locating the chunk in which the descriptor resides,
+ * and then scanning forward in the chunk and the list for the next
+ * used descriptor.
+ */
+/*ARGSUSED*/
+xfs_log_item_desc_t *
+xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
+{
+	xfs_log_item_chunk_t	*licp;
+	int			i;
+
+	licp = XFS_LIC_DESC_TO_CHUNK(lidp);
+
+	/*
+	 * First search the rest of the chunk. The for loop keeps us
+	 * from referencing things beyond the end of the chunk.
+	 */
+	for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) {
+		if (XFS_LIC_ISFREE(licp, i)) {
+			continue;
+		}
+
+		return (XFS_LIC_SLOT(licp, i));
+	}
+
+	/*
+	 * Now search the next chunk.  It must be there, because the
+	 * next chunk would have been freed if it were empty.
+	 * If there is no next chunk, return NULL.
+	 */
+	if (licp->lic_next == NULL) {
+		return (NULL);
+	}
+
+	licp = licp->lic_next;
+	ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+	for (i = 0; i < licp->lic_unused; i++) {
+		if (XFS_LIC_ISFREE(licp, i)) {
+			continue;
+		}
+
+		return (XFS_LIC_SLOT(licp, i));
+	}
+	ASSERT(0);
+	/* NOTREACHED */
+	return NULL; /* keep gcc quiet */
+}
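+
+/*
+ * Illustrative sketch (not part of the original code): walking every
+ * log item attached to a transaction:
+ *
+ *	for (lidp = xfs_trans_first_item(tp);
+ *	     lidp != NULL;
+ *	     lidp = xfs_trans_next_item(tp, lidp)) {
+ *		... operate on lidp->lid_item ...
+ *	}
+ */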
+
+/*
+ * This is called to unlock all of the items of a transaction and to free
+ * all the descriptors of that transaction.
+ *
+ * It walks the list of descriptors and unlocks each item.  It frees
+ * each chunk except that embedded in the transaction as it goes along.
+ */
+void
+xfs_trans_free_items(
+	xfs_trans_t	*tp,
+	int		flags)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_chunk_t	*next_licp;
+	int			abort;
+
+	abort = flags & XFS_TRANS_ABORT;
+	licp = &tp->t_items;
+	/*
+	 * Special case the embedded chunk so we don't free it below.
+	 */
+	if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		XFS_LIC_ALL_FREE(licp);
+		licp->lic_unused = 0;
+	}
+	licp = licp->lic_next;
+
+	/*
+	 * Unlock each item in each chunk and free the chunks.
+	 */
+	while (licp != NULL) {
+		ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		next_licp = licp->lic_next;
+		kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+		licp = next_licp;
+	}
+
+	/*
+	 * Reset the transaction structure's free item count.
+	 */
+	tp->t_items_free = XFS_LIC_NUM_SLOTS;
+	tp->t_items.lic_next = NULL;
+}
+
+
+
+/*
+ * This is called to unlock the items associated with a transaction.
+ * Items which were not logged should be freed.
+ * Those which were logged must still be tracked so they can be unpinned
+ * when the transaction commits.
+ */
+void
+xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_chunk_t	*next_licp;
+	xfs_log_item_chunk_t	**licpp;
+	int			freed;
+
+	freed = 0;
+	licp = &tp->t_items;
+
+	/*
+	 * Special case the embedded chunk so we don't free it.
+	 */
+	if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+		freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
+	}
+	licpp = &(tp->t_items.lic_next);
+	licp = licp->lic_next;
+
+	/*
+	 * Unlock each item in each chunk, free non-dirty descriptors,
+	 * and free empty chunks.
+	 */
+	while (licp != NULL) {
+		ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+		freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
+		next_licp = licp->lic_next;
+		if (XFS_LIC_ARE_ALL_FREE(licp)) {
+			*licpp = next_licp;
+			kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+			freed -= XFS_LIC_NUM_SLOTS;
+		} else {
+			licpp = &(licp->lic_next);
+		}
+		ASSERT(*licpp == next_licp);
+		licp = next_licp;
+	}
+
+	/*
+	 * Fix the free descriptor count in the transaction.
+	 */
+	tp->t_items_free += freed;
+}
+
+/*
+ * Unlock each item pointed to by a descriptor in the given chunk.
+ * Stamp the commit lsn into each item if necessary.
+ * Free descriptors pointing to items which are not dirty if freeing_chunk
+ * is zero. If freeing_chunk is non-zero, then we need to unlock all
+ * items in the chunk including those with XFS_LID_SYNC_UNLOCK set.
+ * Return the number of descriptors freed.
+ */
+STATIC int
+xfs_trans_unlock_chunk(
+	xfs_log_item_chunk_t	*licp,
+	int			freeing_chunk,
+	int			abort,
+	xfs_lsn_t		commit_lsn)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_item_t		*lip;
+	int			i;
+	int			freed;
+
+	freed = 0;
+	lidp = licp->lic_descs;
+	for (i = 0; i < licp->lic_unused; i++, lidp++) {
+		if (XFS_LIC_ISFREE(licp, i)) {
+			continue;
+		}
+		lip = lidp->lid_item;
+		lip->li_desc = NULL;
+
+		if (commit_lsn != NULLCOMMITLSN)
+			IOP_COMMITTING(lip, commit_lsn);
+
+		/* XXXsup */
+		if (abort)
+			lip->li_flags |= XFS_LI_ABORTED;
+
+		/* if (abort) {
+			IOP_ABORT(lip);
+		} else */
+		if (!(lidp->lid_flags & XFS_LID_SYNC_UNLOCK) ||
+			   freeing_chunk || abort) {
+			IOP_UNLOCK(lip);
+		}
+
+		/*
+		 * Free the descriptor if the item is not dirty
+		 * within this transaction and the caller is not
+		 * going to just free the entire thing regardless.
+		 */
+		if (!(freeing_chunk) &&
+		    (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
+			XFS_LIC_RELSE(licp, i);
+			freed++;
+		}
+	}
+
+	return (freed);
+}
+
+
+/*
+ * This is called to add the given busy item to the transaction's
+ * list of busy items.	It must find a free busy item descriptor
+ * or allocate a new one and add the item to that descriptor.
+ * The function returns a pointer to the busy slot that now records
+ * the given allocation group and index.
+ */
+xfs_log_busy_slot_t *
+xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
+{
+	xfs_log_busy_chunk_t	*lbcp;
+	xfs_log_busy_slot_t	*lbsp;
+	int			i=0;
+
+	/*
+	 * If there are no free descriptors, allocate a new chunk
+	 * of them and put it at the front of the chunk list.
+	 */
+	if (tp->t_busy_free == 0) {
+		lbcp = (xfs_log_busy_chunk_t*)
+		       kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
+		ASSERT(lbcp != NULL);
+		/*
+		 * Initialize the chunk, and then
+		 * claim the first slot in the newly allocated chunk.
+		 */
+		XFS_LBC_INIT(lbcp);
+		XFS_LBC_CLAIM(lbcp, 0);
+		lbcp->lbc_unused = 1;
+		lbsp = XFS_LBC_SLOT(lbcp, 0);
+
+		/*
+		 * Link in the new chunk and update the free count.
+		 */
+		lbcp->lbc_next = tp->t_busy.lbc_next;
+		tp->t_busy.lbc_next = lbcp;
+		tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
+
+		/*
+		 * Initialize the busy slot with the given allocation
+		 * group and index, and return it.
+		 */
+		lbsp->lbc_ag = ag;
+		lbsp->lbc_idx = idx;
+		return (lbsp);
+	}
+
+	/*
+	 * Find the free descriptor. It is somewhere in the chunklist
+	 * of descriptors.
+	 */
+	lbcp = &tp->t_busy;
+	while (lbcp != NULL) {
+		if (XFS_LBC_VACANCY(lbcp)) {
+			if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
+				i = lbcp->lbc_unused;
+				break;
+			} else {
+				/* out-of-order vacancy */
+				printk("OOO vacancy lbcp 0x%p\n", lbcp);
+				ASSERT(0);
+			}
+		}
+		lbcp = lbcp->lbc_next;
+	}
+	ASSERT(lbcp != NULL);
+	/*
+	 * If we find a free descriptor, claim it,
+	 * initialize it, and return it.
+	 */
+	XFS_LBC_CLAIM(lbcp, i);
+	if (lbcp->lbc_unused <= i) {
+		lbcp->lbc_unused = i + 1;
+	}
+	lbsp = XFS_LBC_SLOT(lbcp, i);
+	tp->t_busy_free--;
+	lbsp->lbc_ag = ag;
+	lbsp->lbc_idx = idx;
+	return (lbsp);
+}
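+
+/*
+ * Illustrative sketch (not part of the original code): a freed extent
+ * is recorded as busy so it is not reused before the freeing
+ * transaction is on disk:
+ *
+ *	(void) xfs_trans_add_busy(tp, agno, idx);
+ *	... once the commit is complete, the tracking list is torn
+ *	    down with xfs_trans_free_busy(tp) ...
+ */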
+
+
+/*
+ * xfs_trans_free_busy
+ * Free all of the busy lists from a transaction
+ */
+void
+xfs_trans_free_busy(xfs_trans_t *tp)
+{
+	xfs_log_busy_chunk_t	*lbcp;
+	xfs_log_busy_chunk_t	*lbcq;
+
+	lbcp = tp->t_busy.lbc_next;
+	while (lbcp != NULL) {
+		lbcq = lbcp->lbc_next;
+		kmem_free(lbcp, sizeof(xfs_log_busy_chunk_t));
+		lbcp = lbcq;
+	}
+
+	XFS_LBC_INIT(&tp->t_busy);
+	tp->t_busy.lbc_unused = 0;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_priv.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_priv.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_priv.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_priv.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_TRANS_PRIV_H__
+#define __XFS_TRANS_PRIV_H__
+
+struct xfs_log_item;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * From xfs_trans_item.c
+ */
+struct xfs_log_item_desc	*xfs_trans_add_item(struct xfs_trans *,
+					    struct xfs_log_item *);
+void				xfs_trans_free_item(struct xfs_trans *,
+					    struct xfs_log_item_desc *);
+struct xfs_log_item_desc	*xfs_trans_find_item(struct xfs_trans *,
+					     struct xfs_log_item *);
+struct xfs_log_item_desc	*xfs_trans_first_item(struct xfs_trans *);
+struct xfs_log_item_desc	*xfs_trans_next_item(struct xfs_trans *,
+					     struct xfs_log_item_desc *);
+void				xfs_trans_free_items(struct xfs_trans *, int);
+void				xfs_trans_unlock_items(struct xfs_trans *,
+							xfs_lsn_t);
+void				xfs_trans_free_busy(xfs_trans_t *tp);
+xfs_log_busy_slot_t		*xfs_trans_add_busy(xfs_trans_t *tp,
+						    xfs_agnumber_t ag,
+						    xfs_extlen_t idx);
+
+/*
+ * From xfs_trans_ail.c
+ */
+void			xfs_trans_update_ail(struct xfs_mount *,
+				     struct xfs_log_item *, xfs_lsn_t,
+				     unsigned long);
+void			xfs_trans_delete_ail(struct xfs_mount *,
+				     struct xfs_log_item *, unsigned long);
+struct xfs_log_item	*xfs_trans_first_ail(struct xfs_mount *, int *);
+struct xfs_log_item	*xfs_trans_next_ail(struct xfs_mount *,
+				     struct xfs_log_item *, int *, int *);
+
+
+#endif	/* __XFS_TRANS_PRIV_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_space.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_space.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_trans_space.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_trans_space.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_TRANS_SPACE_H__
+#define __XFS_TRANS_SPACE_H__
+
+/*
+ * Components of space reservations.
+ */
+#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)	\
+		(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
+#define XFS_EXTENTADD_SPACE_RES(mp,w)	(XFS_BM_MAXLEVELS(mp,w) - 1)
+#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
+	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+	  XFS_EXTENTADD_SPACE_RES(mp,w))
+#define XFS_DAENTER_1B(mp,w)	((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1)
+#define XFS_DAENTER_DBS(mp,w)	\
+	(XFS_DA_NODE_MAXDEPTH + \
+	 ((XFS_DIR_IS_V2(mp) && (w) == XFS_DATA_FORK) ? 2 : 0))
+#define XFS_DAENTER_BLOCKS(mp,w)	\
+	(XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
+#define XFS_DAENTER_BMAP1B(mp,w)	\
+	XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)
+#define XFS_DAENTER_BMAPS(mp,w)		\
+	(XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))
+#define XFS_DAENTER_SPACE_RES(mp,w)	\
+	(XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
+#define XFS_DAREMOVE_SPACE_RES(mp,w)	XFS_DAENTER_BMAPS(mp,w)
+#define XFS_DIRENTER_MAX_SPLIT(mp,nl)	\
+	(((mp)->m_sb.sb_blocksize == 512 && \
+	  XFS_DIR_IS_V1(mp) && \
+	  (nl) >= XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN) ? 2 : 1)
+#define XFS_DIRENTER_SPACE_RES(mp,nl)	\
+	(XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
+	 XFS_DIRENTER_MAX_SPLIT(mp,nl))
+#define XFS_DIRREMOVE_SPACE_RES(mp)	\
+	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
+#define XFS_IALLOC_SPACE_RES(mp)	\
+	(XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1)
+
+/*
+ * Space reservation values for various transactions.
+ */
+#define XFS_ADDAFORK_SPACE_RES(mp)	\
+	((mp)->m_dirblkfsbs + \
+	 (XFS_DIR_IS_V1(mp) ? 0 : XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)))
+#define XFS_ATTRRM_SPACE_RES(mp)	\
+	XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
+/* This macro is not used - see inline code in xfs_attr_set */
+#define XFS_ATTRSET_SPACE_RES(mp, v)	\
+	(XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
+#define XFS_CREATE_SPACE_RES(mp,nl)	\
+	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_DIOSTRAT_SPACE_RES(mp, v)	\
+	(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
+#define XFS_GROWFS_SPACE_RES(mp)	\
+	(2 * XFS_AG_MAXLEVELS(mp))
+#define XFS_GROWFSRT_SPACE_RES(mp,b)	\
+	((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
+#define XFS_LINK_SPACE_RES(mp,nl)	\
+	XFS_DIRENTER_SPACE_RES(mp,nl)
+#define XFS_MKDIR_SPACE_RES(mp,nl)	\
+	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_QM_DQALLOC_SPACE_RES(mp)	\
+	(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
+	 XFS_DQUOT_CLUSTER_SIZE_FSB)
+#define XFS_QM_QINOCREATE_SPACE_RES(mp) \
+	XFS_IALLOC_SPACE_RES(mp)
+#define XFS_REMOVE_SPACE_RES(mp)	\
+	XFS_DIRREMOVE_SPACE_RES(mp)
+#define XFS_RENAME_SPACE_RES(mp,nl)	\
+	(XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_SYMLINK_SPACE_RES(mp,nl,b)	\
+	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
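+
+/*
+ * Worked example (illustrative, with assumed geometry): if
+ * XFS_IALLOC_BLOCKS(mp) == 64, XFS_IN_MAXLEVELS(mp) == 3 and
+ * XFS_DIRENTER_SPACE_RES(mp,nl) evaluates to 10, then
+ *
+ *	XFS_CREATE_SPACE_RES(mp,nl)
+ *		= XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)
+ *		= (64 + 3 - 1) + 10
+ *		= 76 blocks reserved for the create transaction.
+ */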
+
+#endif	/* __XFS_TRANS_SPACE_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_types.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_types.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_types.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_types.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_TYPES_H__
+#define __XFS_TYPES_H__
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+
+/*
+ * POSIX Extensions
+ */
+typedef unsigned char		uchar_t;
+typedef unsigned short		ushort_t;
+typedef unsigned int		uint_t;
+typedef unsigned long		ulong_t;
+
+/*
+ * Additional type declarations for XFS
+ */
+typedef signed char		__int8_t;
+typedef unsigned char		__uint8_t;
+typedef signed short int	__int16_t;
+typedef unsigned short int	__uint16_t;
+typedef signed int		__int32_t;
+typedef unsigned int		__uint32_t;
+typedef signed long long int	__int64_t;
+typedef unsigned long long int	__uint64_t;
+
+typedef enum { B_FALSE, B_TRUE } boolean_t;
+
+
+typedef __int64_t		prid_t;		/* project ID */
+typedef __uint32_t		inst_t;		/* an instruction */
+
+typedef __u64			xfs_off_t;
+typedef __u64			xfs_ino_t;	/* <inode> type */
+typedef __s64			xfs_daddr_t;	/* <disk address> type */
+typedef char *			xfs_caddr_t;	/* <core address> type */
+typedef __u32			xfs_dev_t;
+
+typedef struct timespec		timespec_t;
+
+typedef struct {
+	unsigned char	__u_bits[16];
+} uuid_t;
+
+/* __psint_t is the same size as a pointer */
+#if (BITS_PER_LONG == 32)
+typedef __int32_t __psint_t;
+typedef __uint32_t __psunsigned_t;
+#elif (BITS_PER_LONG == 64)
+typedef __int64_t __psint_t;
+typedef __uint64_t __psunsigned_t;
+#else
+#error BITS_PER_LONG must be 32 or 64
+#endif
+
+#endif	/* __KERNEL__ */
+
+/*
+ * Some types are conditional based on the selected configuration.
+ * Set XFS_BIG_FILESYSTEMS=1 or 0 depending on the desired configuration.
+ * XFS_BIG_FILESYSTEMS needs daddr_t to be 64 bits.
+ *
+ * On Linux right now we are limited to 2^32 512-byte blocks in a
+ * filesystem.  Once this limit is changed, setting this to 1
+ * will allow XFS to go larger.  With XFS_BIG_FILESYSTEMS set to 0,
+ * a 4K-block filesystem could still theoretically be 16 Tbytes
+ * (2^32 blocks of 4K each), so on an ia32 box the 32-bit page
+ * index will then be the limiting factor.
+ */
+
+#ifndef XFS_BIG_FILESYSTEMS
+#define XFS_BIG_FILESYSTEMS	0
+#endif
+
+typedef __uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
+typedef __uint32_t	xfs_extlen_t;	/* extent length in blocks */
+typedef __uint32_t	xfs_agnumber_t; /* allocation group number */
+typedef __int32_t	xfs_extnum_t;	/* # of extents in a file */
+typedef __int16_t	xfs_aextnum_t;	/* # extents in an attribute fork */
+typedef __int64_t	xfs_fsize_t;	/* bytes in a file */
+typedef __uint64_t	xfs_ufsize_t;	/* unsigned bytes in a file */
+
+typedef __int32_t	xfs_suminfo_t;	/* type of bitmap summary info */
+typedef __int32_t	xfs_rtword_t;	/* word type for bitmap manipulations */
+
+typedef __int64_t	xfs_lsn_t;	/* log sequence number */
+typedef __int32_t	xfs_tid_t;	/* transaction identifier */
+
+typedef __uint32_t	xfs_dablk_t;	/* dir/attr block number (in file) */
+typedef __uint32_t	xfs_dahash_t;	/* dir/attr hash value */
+
+typedef __uint16_t	xfs_prid_t;	/* prid_t truncated to 16bits in XFS */
+
+/*
+ * These types are 64 bits on disk but are either 32 or 64 bits in memory.
+ * Disk based types:
+ */
+typedef __uint64_t	xfs_dfsbno_t;	/* blockno in filesystem (agno|agbno) */
+typedef __uint64_t	xfs_drfsbno_t;	/* blockno in filesystem (raw) */
+typedef __uint64_t	xfs_drtbno_t;	/* extent (block) in realtime area */
+typedef __uint64_t	xfs_dfiloff_t;	/* block number in a file */
+typedef __uint64_t	xfs_dfilblks_t; /* number of blocks in a file */
+
+/*
+ * Memory based types are conditional.
+ */
+#if XFS_BIG_FILESYSTEMS
+typedef __uint64_t	xfs_fsblock_t;	/* blockno in filesystem (agno|agbno) */
+typedef __uint64_t	xfs_rfsblock_t; /* blockno in filesystem (raw) */
+typedef __uint64_t	xfs_rtblock_t;	/* extent (block) in realtime area */
+typedef __int64_t	xfs_srtblock_t; /* signed version of xfs_rtblock_t */
+#else
+typedef __uint32_t	xfs_fsblock_t;	/* blockno in filesystem (agno|agbno) */
+typedef __uint32_t	xfs_rfsblock_t; /* blockno in filesystem (raw) */
+typedef __uint32_t	xfs_rtblock_t;	/* extent (block) in realtime area */
+typedef __int32_t	xfs_srtblock_t; /* signed version of xfs_rtblock_t */
+#endif
+typedef __uint64_t	xfs_fileoff_t;	/* block number in a file */
+typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */
+typedef __uint64_t	xfs_filblks_t;	/* number of blocks in a file */
+
+typedef __uint8_t	xfs_arch_t;	/* architecture of an xfs fs */
+
+/*
+ * Null values for the types.
+ */
+#define NULLDFSBNO	((xfs_dfsbno_t)-1)
+#define NULLDRFSBNO	((xfs_drfsbno_t)-1)
+#define NULLDRTBNO	((xfs_drtbno_t)-1)
+#define NULLDFILOFF	((xfs_dfiloff_t)-1)
+
+#define NULLFSBLOCK	((xfs_fsblock_t)-1)
+#define NULLRFSBLOCK	((xfs_rfsblock_t)-1)
+#define NULLRTBLOCK	((xfs_rtblock_t)-1)
+#define NULLFILEOFF	((xfs_fileoff_t)-1)
+
+#define NULLAGBLOCK	((xfs_agblock_t)-1)
+#define NULLAGNUMBER	((xfs_agnumber_t)-1)
+#define NULLEXTNUM	((xfs_extnum_t)-1)
+
+#define NULLCOMMITLSN	((xfs_lsn_t)-1)
+
+/*
+ * Max values for extlen, extnum, aextnum.
+ */
+#define MAXEXTLEN	((xfs_extlen_t)0x001fffff)	/* 21 bits */
+#define MAXEXTNUM	((xfs_extnum_t)0x7fffffff)	/* signed int */
+#define MAXAEXTNUM	((xfs_aextnum_t)0x7fff)		/* signed short */
+
+/*
+ * MAXNAMELEN is the length (including the terminating null) of
+ * the longest permissible file (component) name.
+ */
+#define MAXNAMELEN	256
+
+typedef enum {
+	XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
+} xfs_lookup_t;
+
+typedef enum {
+	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
+	XFS_BTNUM_MAX
+} xfs_btnum_t;
+
+
+#if defined(CONFIG_PROC_FS) && defined(__KERNEL__) && !defined(XFS_STATS_OFF)
+/*
+ * XFS global statistics
+ */
+struct xfsstats {
+# define XFSSTAT_END_EXTENT_ALLOC	4
+	__uint32_t		xs_allocx;
+	__uint32_t		xs_allocb;
+	__uint32_t		xs_freex;
+	__uint32_t		xs_freeb;
+# define XFSSTAT_END_ALLOC_BTREE	(XFSSTAT_END_EXTENT_ALLOC+4)
+	__uint32_t		xs_abt_lookup;
+	__uint32_t		xs_abt_compare;
+	__uint32_t		xs_abt_insrec;
+	__uint32_t		xs_abt_delrec;
+# define XFSSTAT_END_BLOCK_MAPPING	(XFSSTAT_END_ALLOC_BTREE+7)
+	__uint32_t		xs_blk_mapr;
+	__uint32_t		xs_blk_mapw;
+	__uint32_t		xs_blk_unmap;
+	__uint32_t		xs_add_exlist;
+	__uint32_t		xs_del_exlist;
+	__uint32_t		xs_look_exlist;
+	__uint32_t		xs_cmp_exlist;
+# define XFSSTAT_END_BLOCK_MAP_BTREE	(XFSSTAT_END_BLOCK_MAPPING+4)
+	__uint32_t		xs_bmbt_lookup;
+	__uint32_t		xs_bmbt_compare;
+	__uint32_t		xs_bmbt_insrec;
+	__uint32_t		xs_bmbt_delrec;
+# define XFSSTAT_END_DIRECTORY_OPS	(XFSSTAT_END_BLOCK_MAP_BTREE+4)
+	__uint32_t		xs_dir_lookup;
+	__uint32_t		xs_dir_create;
+	__uint32_t		xs_dir_remove;
+	__uint32_t		xs_dir_getdents;
+# define XFSSTAT_END_TRANSACTIONS	(XFSSTAT_END_DIRECTORY_OPS+3)
+	__uint32_t		xs_trans_sync;
+	__uint32_t		xs_trans_async;
+	__uint32_t		xs_trans_empty;
+# define XFSSTAT_END_INODE_OPS		(XFSSTAT_END_TRANSACTIONS+7)
+	__uint32_t		xs_ig_attempts;
+	__uint32_t		xs_ig_found;
+	__uint32_t		xs_ig_frecycle;
+	__uint32_t		xs_ig_missed;
+	__uint32_t		xs_ig_dup;
+	__uint32_t		xs_ig_reclaims;
+	__uint32_t		xs_ig_attrchg;
+# define XFSSTAT_END_LOG_OPS		(XFSSTAT_END_INODE_OPS+5)
+	__uint32_t		xs_log_writes;
+	__uint32_t		xs_log_blocks;
+	__uint32_t		xs_log_noiclogs;
+	__uint32_t		xs_log_force;
+	__uint32_t		xs_log_force_sleep;
+# define XFSSTAT_END_TAIL_PUSHING	(XFSSTAT_END_LOG_OPS+10)
+	__uint32_t		xs_try_logspace;
+	__uint32_t		xs_sleep_logspace;
+	__uint32_t		xs_push_ail;
+	__uint32_t		xs_push_ail_success;
+	__uint32_t		xs_push_ail_pushbuf;
+	__uint32_t		xs_push_ail_pinned;
+	__uint32_t		xs_push_ail_locked;
+	__uint32_t		xs_push_ail_flushing;
+	__uint32_t		xs_push_ail_restarts;
+	__uint32_t		xs_push_ail_flush;
+# define XFSSTAT_END_WRITE_CONVERT	(XFSSTAT_END_TAIL_PUSHING+2)
+	__uint32_t		xs_xstrat_quick;
+	__uint32_t		xs_xstrat_split;
+# define XFSSTAT_END_READ_WRITE_OPS	(XFSSTAT_END_WRITE_CONVERT+2)
+	__uint32_t		xs_write_calls;
+	__uint32_t		xs_read_calls;
+# define XFSSTAT_END_ATTRIBUTE_OPS	(XFSSTAT_END_READ_WRITE_OPS+4)
+	__uint32_t		xs_attr_get;
+	__uint32_t		xs_attr_set;
+	__uint32_t		xs_attr_remove;
+	__uint32_t		xs_attr_list;
+# define XFSSTAT_END_QUOTA_OPS		(XFSSTAT_END_ATTRIBUTE_OPS+8)
+	__uint32_t		xs_qm_dqreclaims;
+	__uint32_t		xs_qm_dqreclaim_misses;
+	__uint32_t		xs_qm_dquot_dups;
+	__uint32_t		xs_qm_dqcachemisses;
+	__uint32_t		xs_qm_dqcachehits;
+	__uint32_t		xs_qm_dqwants;
+	__uint32_t		xs_qm_dqshake_reclaims;
+	__uint32_t		xs_qm_dqinact_reclaims;
+# define XFSSTAT_END_INODE_CLUSTER	(XFSSTAT_END_QUOTA_OPS+3)
+	__uint32_t		xs_iflush_count;
+	__uint32_t		xs_icluster_flushcnt;
+	__uint32_t		xs_icluster_flushinode;
+# define XFSSTAT_END_VNODE_OPS		(XFSSTAT_END_INODE_CLUSTER+8)
+	__uint32_t		vn_active;	/* # vnodes not on free lists */
+	__uint32_t		vn_alloc;	/* # times vn_alloc called */
+	__uint32_t		vn_get;		/* # times vn_get called */
+	__uint32_t		vn_hold;	/* # times vn_hold called */
+	__uint32_t		vn_rele;	/* # times vn_rele called */
+	__uint32_t		vn_reclaim;	/* # times vn_reclaim called */
+	__uint32_t		vn_remove;	/* # times vn_remove called */
+	__uint32_t		vn_free;	/* # times vn_free called */
+/* Extra precision counters */
+	__uint64_t		xs_xstrat_bytes;
+	__uint64_t		xs_write_bytes;
+	__uint64_t		xs_read_bytes;
+};
+
+extern struct xfsstats xfsstats;
+
+# define XFS_STATS_INC(count)		( (count)++ )
+# define XFS_STATS_DEC(count)		( (count)-- )
+# define XFS_STATS_ADD(count, inc)	( (count) += (inc) )
+#else	/* !CONFIG_PROC_FS */
+# define XFS_STATS_INC(count)
+# define XFS_STATS_DEC(count)
+# define XFS_STATS_ADD(count, inc)
+#endif	/* !CONFIG_PROC_FS */
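+
+/*
+ * Example usage (illustrative): the macros compile away entirely when
+ * procfs support is not configured, so callers need no #ifdefs:
+ *
+ *	XFS_STATS_INC(xfsstats.xs_allocx);
+ *	XFS_STATS_ADD(xfsstats.xs_allocb, nblocks);
+ */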
+
+
+
+/* juggle IRIX device numbers - still used in ondisk structures */
+
+#ifndef __KERNEL__
+#define MKDEV(major, minor)	makedev(major, minor)
+#endif
+
+#define IRIX_DEV_BITSMAJOR	14
+#define IRIX_DEV_BITSMINOR	18
+#define IRIX_DEV_MAXMAJ		0x1ff
+#define IRIX_DEV_MAXMIN		0x3ffff
+#define IRIX_DEV_MAJOR(dev)	((int)(((unsigned)(dev)>>IRIX_DEV_BITSMINOR) \
+				    & IRIX_DEV_MAXMAJ))
+#define IRIX_DEV_MINOR(dev)	((int)((dev)&IRIX_DEV_MAXMIN))
+#define IRIX_MKDEV(major,minor) ((xfs_dev_t)(((major)<<IRIX_DEV_BITSMINOR) \
+				    | (minor&IRIX_DEV_MAXMIN)))
+
+#define IRIX_DEV_TO_KDEVT(dev)	MKDEV(IRIX_DEV_MAJOR(dev),IRIX_DEV_MINOR(dev))
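+
+/*
+ * Worked example (illustrative): packing and unpacking an IRIX device
+ * number with major 8, minor 5:
+ *
+ *	IRIX_MKDEV(8, 5)			== (8 << 18) | 5
+ *	IRIX_DEV_MAJOR(IRIX_MKDEV(8, 5))	== 8
+ *	IRIX_DEV_MINOR(IRIX_MKDEV(8, 5))	== 5
+ */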
+
+#endif	/* __XFS_TYPES_H__ */
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_utils.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_utils.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_utils.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_utils.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+#ifdef CONFIG_PROC_FS
+struct xfsstats xfsstats;
+#endif
+
+/*
+ * xfs_get_dir_entry is used to get a reference to an inode given
+ * its parent directory entry.  It does not lock the child inode,
+ * but it does take an extra reference on the child's vnode, which
+ * the caller is responsible for dropping.
+ */
+int
+xfs_get_dir_entry(
+	struct dentry		*dentry,
+	xfs_inode_t		**ipp)
+{
+	vnode_t			*vp;
+	bhv_desc_t		*bdp;
+
+	ASSERT(dentry->d_inode);
+
+	vp = LINVFS_GET_VP(dentry->d_inode);
+	bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops);
+	if (!bdp) {
+		*ipp = NULL;
+		return XFS_ERROR(ENOENT);
+	}
+	VN_HOLD(vp);
+	*ipp = XFS_BHVTOI(bdp);
+	return 0;
+}
+
+/*
+ * Wrapper around xfs_dir_lookup.
+ *
+ * If DLF_IGET is set, then this routine will also return the inode.
+ * Note that the inode will not be locked. Note, however, that the
+ * vnode will have an additional reference in this case.
+ */
+int
+xfs_dir_lookup_int(
+	bhv_desc_t		*dir_bdp,
+	int			flags,
+	struct dentry		*dentry,
+	xfs_ino_t		*inum,
+	xfs_inode_t		**ipp)
+{
+	vnode_t		*dir_vp;
+	xfs_inode_t	*dp;
+	char		*name = (char *) dentry->d_name.name;
+	int		name_len = dentry->d_name.len;
+	int		error;
+	int		do_iget;
+	uint		lock_mode;
+	bhv_desc_t	*bdp;
+
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+	vn_trace_entry(dir_vp, "xfs_dir_lookup_int",
+		       (inst_t *)__return_address);
+
+	do_iget = flags & DLF_IGET;
+	error = 0;
+
+	if (flags & DLF_LOCK_SHARED) {
+		lock_mode = XFS_ILOCK_SHARED;
+	} else {
+		lock_mode = XFS_ILOCK_EXCL;
+	}
+
+	dp = XFS_BHVTOI(dir_bdp);
+	bdp = NULL;
+
+	/*
+	 * If all else fails, call the directory code.
+	 */
+
+	error = XFS_DIR_LOOKUP(dp->i_mount, NULL, dp, name, name_len, inum);
+	if (!error && do_iget) {
+		/*
+		 * Unlock the directory. We do this because we can't
+		 * hold the directory lock while doing the vn_get()
+		 * in xfs_iget().  Doing so could cause us to hold
+		 * a lock while waiting for the inode to finish
+		 * being inactive while it's waiting for a log
+		 * reservation in the inactive routine.
+		 */
+		xfs_iunlock(dp, lock_mode);
+
+		if (bdp) {
+			VN_RELE(BHV_TO_VNODE(bdp));
+			bdp = NULL;
+		}
+
+		error = xfs_iget(dp->i_mount, NULL, *inum, 0, ipp, 0);
+
+		xfs_ilock(dp, lock_mode);
+
+		if (error) {
+			*ipp = NULL;
+			return error;
+		}
+
+		if ((*ipp)->i_d.di_mode == 0) {
+			/*
+			 * The inode has been freed.  Something is
+			 * wrong so just get out of here.
+			 */
+			xfs_iunlock(dp, lock_mode);
+			xfs_iput_new(*ipp, 0);
+			*ipp = NULL;
+			xfs_ilock(dp, lock_mode);
+			error = XFS_ERROR(ENOENT);
+		} else {
+			bdp = XFS_ITOBHV(*ipp);
+			bdp = NULL;
+		}
+	}
+	if (bdp) {
+		/* The only time we should get here is if the dir_lookup
+		 * failed.
+		 */
+		ASSERT(error);
+		xfs_iunlock(dp, lock_mode);
+		VN_RELE(BHV_TO_VNODE(bdp));
+		xfs_ilock(dp, lock_mode);
+	}
+	return error;
+}
+
+/*
+ * Allocates a new inode from disk and returns a pointer to the
+ * incore copy. This routine will internally commit the current
+ * transaction and allocate a new one if the Space Manager needed
+ * to do an allocation to replenish the inode free-list.
+ *
+ * This routine is designed to be called from xfs_create and
+ * xfs_create_dir.
+ *
+ */
+int
+xfs_dir_ialloc(
+	xfs_trans_t	**tpp,		/* input: current transaction;
+					   output: may be a new transaction. */
+	xfs_inode_t	*dp,		/* directory within which to
+					   allocate the inode. */
+	mode_t		mode,
+	nlink_t		nlink,
+	dev_t		rdev,
+	cred_t		*credp,
+	prid_t		prid,		/* project id */
+	int		okalloc,	/* ok to allocate new space */
+	xfs_inode_t	**ipp,		/* pointer to inode; it will be
+					   locked. */
+	int		*committed)
+
+{
+	xfs_trans_t	*tp;
+	xfs_trans_t	*ntp;
+	xfs_inode_t	*ip;
+	xfs_buf_t	*ialloc_context = NULL;
+	boolean_t	call_again = B_FALSE;
+	int		code;
+	uint		log_res;
+	uint		log_count;
+	void		*dqinfo;
+	uint		tflags;
+
+	tp = *tpp;
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+	/*
+	 * xfs_ialloc will return a pointer to an incore inode if
+	 * the Space Manager has an available inode on the free
+	 * list. Otherwise, it will do an allocation and replenish
+	 * the freelist.  Since we can only do one allocation per
+	 * transaction without deadlocks, we will need to commit the
+	 * current transaction and start a new one.  We will then
+	 * need to call xfs_ialloc again to get the inode.
+	 *
+	 * If xfs_ialloc did an allocation to replenish the freelist,
+	 * it returns the bp containing the head of the freelist as
+	 * ialloc_context. We will hold a lock on it across the
+	 * transaction commit so that no other process can steal
+	 * the inode(s) that we've just allocated.
+	 */
+	code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc,
+			  &ialloc_context, &call_again, &ip);
+
+	/*
+	 * Return an error if we were unable to allocate a new inode.
+	 * This should only happen if we run out of space on disk or
+	 * encounter a disk error.
+	 */
+	if (code) {
+		*ipp = NULL;
+		return code;
+	}
+	if (!call_again && (ip == NULL)) {
+		*ipp = NULL;
+		return XFS_ERROR(ENOSPC);
+	}
+
+	/*
+	 * If call_again is set, then we were unable to get an
+	 * inode in one operation.  We need to commit the current
+	 * transaction and call xfs_ialloc() again.  It is guaranteed
+	 * to succeed the second time.
+	 */
+	if (call_again) {
+
+		/*
+		 * Normally, xfs_trans_commit releases all the locks.
+		 * We call bhold to hang on to the ialloc_context across
+		 * the commit.	Holding this buffer prevents any other
+		 * processes from doing any allocations in this
+		 * allocation group.
+		 */
+		xfs_trans_bhold(tp, ialloc_context);
+		/*
+		 * Save the log reservation and count so we can
+		 * use them in the next transaction.
+		 */
+		log_res = xfs_trans_get_log_res(tp);
+		log_count = xfs_trans_get_log_count(tp);
+
+		/*
+		 * We want the quota changes to be associated with the next
+		 * transaction, NOT this one. So, detach the dqinfo from this
+		 * and attach it to the next transaction.
+		 */
+		dqinfo = NULL;
+		tflags = 0;
+		if (tp->t_dqinfo) {
+			dqinfo = (void *)tp->t_dqinfo;
+			tp->t_dqinfo = NULL;
+			tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
+			tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
+		}
+
+		ntp = xfs_trans_dup(tp);
+		code = xfs_trans_commit(tp, 0, NULL);
+		tp = ntp;
+		if (committed != NULL) {
+			*committed = 1;
+		}
+		/*
+		 * If we get an error during the commit processing,
+		 * release the buffer that is still held and return
+		 * to the caller.
+		 */
+		if (code) {
+			xfs_buf_relse(ialloc_context);
+			if (dqinfo) {
+				tp->t_dqinfo = dqinfo;
+				xfs_trans_free_dqinfo(tp);
+			}
+			*tpp = ntp;
+			*ipp = NULL;
+			return code;
+		}
+		code = xfs_trans_reserve(tp, 0, log_res, 0,
+					 XFS_TRANS_PERM_LOG_RES, log_count);
+		/*
+		 * Re-attach the quota info that we detached from prev trx.
+		 */
+		if (dqinfo) {
+			tp->t_dqinfo = dqinfo;
+			tp->t_flags |= tflags;
+		}
+
+		if (code) {
+			xfs_buf_relse(ialloc_context);
+			*tpp = ntp;
+			*ipp = NULL;
+			return code;
+		}
+		xfs_trans_bjoin (tp, ialloc_context);
+
+		/*
+		 * Call ialloc again. Since we've locked out all
+		 * other allocations in this allocation group,
+		 * this call should always succeed.
+		 */
+		code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid,
+				  okalloc, &ialloc_context, &call_again, &ip);
+
+		/*
+		 * If we get an error at this point, return to the caller
+		 * so that the current transaction can be aborted.
+		 */
+		if (code) {
+			*tpp = tp;
+			*ipp = NULL;
+			return code;
+		}
+		ASSERT ((!call_again) && (ip != NULL));
+
+	} else {
+		if (committed != NULL) {
+			*committed = 0;
+		}
+	}
+
+	*ipp = ip;
+	*tpp = tp;
+
+	return 0;
+}
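+
+/*
+ * Illustrative sketch (not part of the original code): because
+ * xfs_dir_ialloc() may commit *tpp and return a new transaction,
+ * callers must continue with the returned pointer (error handling
+ * elided, names assumed from context):
+ *
+ *	error = xfs_dir_ialloc(&tp, dp, mode, 1, rdev, credp, prid,
+ *			       okalloc, &ip, &committed);
+ *	if (error)
+ *		goto error_return;
+ *	... tp may now be a different transaction than before ...
+ */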
+
+/*
+ * Decrement the link count on an inode & log the change.
+ * If this causes the link count to go to zero, initiate the
+ * logging activity required to truncate a file.
+ */
+int				/* error */
+xfs_droplink(
+	xfs_trans_t *tp,
+	xfs_inode_t *ip)
+{
+	int	error;
+
+	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+
+	ASSERT(ip->i_d.di_nlink > 0);
+	ip->i_d.di_nlink--;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	error = 0;
+	if (ip->i_d.di_nlink == 0) {
+		/*
+		 * We're dropping the last link to this file.
+		 * Move the on-disk inode to the AGI unlinked list.
+		 * From xfs_inactive() we will pull the inode from
+		 * the list and free it.
+		 */
+		error = xfs_iunlink(tp, ip);
+	}
+	return error;
+}
+
+/*
+ * This gets called when the inode's version needs to be changed from 1 to 2.
+ * Currently this happens when the nlink field overflows the old 16-bit value
+ * or when chproj is called to change the project for the first time.
+ * As a side effect the superblock version will also get rev'd
+ * to contain the NLINK bit.
+ */
+void
+xfs_bump_ino_vers2(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp;
+	unsigned long		s;
+
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+	ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1);
+
+	ip->i_d.di_version = XFS_DINODE_VERSION_2;
+	ip->i_d.di_onlink = 0;
+	bzero(&(ip->i_d.di_pad[0]), sizeof(ip->i_d.di_pad));
+	mp = tp->t_mountp;
+	if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+		s = XFS_SB_LOCK(mp);
+		if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+			XFS_SB_VERSION_ADDNLINK(&mp->m_sb);
+			XFS_SB_UNLOCK(mp, s);
+			xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
+		} else {
+			XFS_SB_UNLOCK(mp, s);
+		}
+	}
+	/* Caller must log the inode */
+}
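+
+/*
+ * Note the check-lock-recheck pattern above: the superblock version
+ * field is tested once without the lock, then re-tested under
+ * XFS_SB_LOCK before being modified, so the common case pays no
+ * locking cost.  In sketch form (flag_set/set_flag are illustrative
+ * names, not real helpers):
+ *
+ *	if (!flag_set(sb)) {
+ *		s = lock(sb);
+ *		if (!flag_set(sb))
+ *			set_flag(sb);
+ *		unlock(sb, s);
+ *	}
+ */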
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+int
+xfs_bumplink(
+	xfs_trans_t *tp,
+	xfs_inode_t *ip)
+{
+	if (ip->i_d.di_nlink >= XFS_MAXLINK)
+		return XFS_ERROR(EMLINK);
+	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+
+	ASSERT(ip->i_d.di_nlink > 0);
+	ip->i_d.di_nlink++;
+	if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) &&
+	    (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
+		/*
+		 * The inode has increased its number of links beyond
+		 * what can fit in an old format inode.	 It now needs
+		 * to be converted to a version 2 inode with a 32 bit
+		 * link count.	If this is the first inode in the file
+		 * system to do this, then we need to bump the superblock
+		 * version number as well.
+		 */
+		xfs_bump_ino_vers2(tp, ip);
+	}
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return 0;
+}
+
+/*
+ * Try to truncate the given file to 0 length.	Currently called
+ * only out of xfs_remove when it has to truncate a file to free
+ * up space for the remove to proceed.
+ */
+int
+xfs_truncate_file(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip)
+{
+	xfs_trans_t	*tp;
+	int		error;
+
+#ifdef QUOTADEBUG
+	/*
+	 * This is called to truncate the quotainodes too.
+	 */
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		if (ip->i_ino != mp->m_sb.sb_uquotino)
+			ASSERT(ip->i_udquot);
+	}
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		if (ip->i_ino != mp->m_sb.sb_gquotino)
+			ASSERT(ip->i_gdquot);
+	}
+#endif
+	/*
+	 * Make the call to xfs_itruncate_start before starting the
+	 * transaction, because we cannot make the call while we're
+	 * in a transaction.
+	 */
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
+	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+				      XFS_TRANS_PERM_LOG_RES,
+				      XFS_ITRUNCATE_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		return error;
+	}
+
+	/*
+	 * Follow the normal truncate locking protocol.	 Since we
+	 * hold the inode in the transaction, we know that its number
+	 * of references will stay constant.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	/*
+	 * Signal a sync transaction.  The only case where we don't
+	 * need one is when we're truncating an already unlinked file
+	 * on a wsync fs.  In that case, we know the blocks can't
+	 * reappear in the file because the links to the file are
+	 * permanently toast.  Currently, we're always going to
+	 * want a sync transaction because this code is being
+	 * called from places where nlink is guaranteed to be 1,
+	 * but I'm leaving the tests in to protect against future
+	 * changes -- rcc.
+	 */
+	error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0,
+				     XFS_DATA_FORK,
+				     ((ip->i_d.di_nlink != 0 ||
+				       !(mp->m_flags & XFS_MOUNT_WSYNC))
+				      ? 1 : 0));
+	if (error) {
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+				 XFS_TRANS_ABORT);
+	} else {
+		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
+					 NULL);
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+	return error;
+}
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_utils.h linux-2.4.20-pre10-mjc2/fs/xfs/xfs_utils.h
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_utils.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_utils.h	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_UTILS_H__
+#define __XFS_UTILS_H__
+
+#define IRELE(ip)	VN_RELE(XFS_ITOV(ip))
+#define IHOLD(ip)	VN_HOLD(XFS_ITOV(ip))
+#define ITRACE(ip)	vn_trace_ref(XFS_ITOV(ip), __FILE__, __LINE__, \
+				(inst_t *)__return_address)
+
+#define DLF_IGET	0x01	/* get entry inode if name lookup succeeds */
+#define DLF_LOCK_SHARED 0x02	/* directory locked shared */
+
+struct bhv_desc;
+struct cred;
+struct vnode;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+extern int
+xfs_rename(
+	struct bhv_desc *src_dir_bdp,
+	struct dentry	*src_dentry,
+	struct vnode	*target_dir_vp,
+	struct dentry	*target_dentry,
+	struct cred	*credp);
+
+extern int
+xfs_get_dir_entry(
+	struct dentry		*dentry,
+	xfs_inode_t		**ipp);
+
+extern int
+xfs_dir_lookup_int(
+	struct bhv_desc		*dir_bdp,
+	int			flags,
+	struct dentry		*dentry,
+	xfs_ino_t		*inum,
+	struct xfs_inode	**ipp);
+
+extern int
+xfs_truncate_file(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip);
+
+extern int
+xfs_dir_ialloc(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*dp,
+	mode_t			mode,
+	nlink_t			nlink,
+	dev_t			rdev,
+	struct cred		*credp,
+	prid_t			prid,
+	int			okalloc,
+	struct xfs_inode	**ipp,
+	int			*committed);
+
+extern int
+xfs_droplink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip);
+
+extern int
+xfs_bumplink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip);
+
+extern void
+xfs_bump_ino_vers2(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip);
+
+#endif /* __XFS_UTILS_H__ */
+
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_vfsops.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_vfsops.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_vfsops.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_vfsops.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,1670 @@
+/*
+ * XFS filesystem operations.
+ *
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+STATIC int xfs_ibusy(xfs_mount_t *);
+STATIC int xfs_sync(bhv_desc_t *, int, cred_t *);
+STATIC int xfs_unmount(bhv_desc_t *, int, cred_t *);
+
+int
+xfs_init(void)
+{
+	extern kmem_zone_t	*xfs_da_state_zone;
+	extern kmem_zone_t	*xfs_bmap_free_item_zone;
+	extern kmem_zone_t	*xfs_btree_cur_zone;
+	extern kmem_zone_t	*xfs_inode_zone;
+	extern kmem_zone_t	*xfs_chashlist_zone;
+	extern kmem_zone_t	*xfs_trans_zone;
+	extern kmem_zone_t	*xfs_buf_item_zone;
+	extern kmem_zone_t	*xfs_efd_zone;
+	extern kmem_zone_t	*xfs_efi_zone;
+	extern kmem_zone_t	*xfs_dabuf_zone;
+	extern mutex_t		xfs_uuidtabmon;
+#ifdef DEBUG_NOT
+	extern ktrace_t		*xfs_alloc_trace_buf;
+	extern ktrace_t		*xfs_bmap_trace_buf;
+	extern ktrace_t		*xfs_bmbt_trace_buf;
+	extern ktrace_t		*xfs_dir_trace_buf;
+	extern ktrace_t		*xfs_attr_trace_buf;
+	extern ktrace_t		*xfs_dir2_trace_buf;
+#endif	/* DEBUG_NOT */
+#ifdef XFS_DABUF_DEBUG
+	extern lock_t		xfs_dabuf_global_lock;
+#endif
+	extern int		xfs_refcache_size;
+
+#ifdef XFS_DABUF_DEBUG
+	spinlock_init(&xfs_dabuf_global_lock, "xfsda");
+#endif
+	mutex_init(&xfs_uuidtabmon, MUTEX_DEFAULT, "xfs_uuidtab");
+	mutex_init(&xfs_Gqm_lock, MUTEX_DEFAULT, "xfs_qmlock");
+
+	/*
+	 * Initialize all of the zone allocators we use.
+	 */
+	xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
+						 "xfs_bmap_free_item");
+	xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
+					    "xfs_btree_cur");
+	xfs_inode_zone = kmem_zone_init(sizeof(xfs_inode_t), "xfs_inode");
+	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
+	xfs_da_state_zone =
+		kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state");
+	xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
+
+	/*
+	 * The size of the zone allocated buf log item is the maximum
+	 * size possible under XFS.  This wastes a little bit of memory,
+	 * but it is much faster.
+	 */
+	xfs_buf_item_zone =
+		kmem_zone_init((sizeof(xfs_buf_log_item_t) +
+				(((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
+				  NBWORD) * sizeof(int))),
+			       "xfs_buf_item");
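+	/*
+	 * For example, with the usual constants (64k maximum block
+	 * size, 128-byte XFS_BLI_CHUNK, 32-bit NBWORD; values may
+	 * vary by configuration), the dirty bitmap adds
+	 * (65536 / 128) / 32 = 16 ints, so the zone over-allocates
+	 * only 64 bytes per buf log item.
+	 */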
+	xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
+				       ((XFS_EFD_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))),
+				      "xfs_efd_item");
+	xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
+				       ((XFS_EFI_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))),
+				      "xfs_efi_item");
+	xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+	xfs_ili_zone = kmem_zone_init(sizeof(xfs_inode_log_item_t), "xfs_ili");
+	xfs_chashlist_zone = kmem_zone_init(sizeof(xfs_chashlist_t),
+					    "xfs_chashlist");
+	_ACL_ZONE_INIT(xfs_acl_zone, "xfs_acl");
+
+#ifdef CONFIG_XFS_VNODE_TRACING
+	ktrace_init(VNODE_TRACE_SIZE);
+#else
+#ifdef DEBUG
+	ktrace_init(64);
+#endif
+#endif
+
+	/*
+	 * Allocate global trace buffers.
+	 */
+#ifdef XFS_ALLOC_TRACE
+	xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_BMAP_TRACE
+	xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_BMBT_TRACE
+	xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_DIR_TRACE
+	xfs_dir_trace_buf = ktrace_alloc(XFS_DIR_TRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_ATTR_TRACE
+	xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_DIR2_TRACE
+	xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_SLEEP);
+#endif
+
+	xfs_dir_startup();
+
+#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
+	xfs_error_test_init();
+#endif /* DEBUG || INDUCE_IO_ERROR */
+
+	xfs_init_procfs();
+	xfs_sysctl_register();
+
+	xfs_refcache_size = xfs_params.refcache_size;
+
+	/*
+	 * The inode hash table is created on a per mounted
+	 * file system basis.
+	 */
+
+	return 0;
+}
+
+void
+xfs_cleanup(void)
+{
+	extern kmem_zone_t	*xfs_bmap_free_item_zone;
+	extern kmem_zone_t	*xfs_btree_cur_zone;
+	extern kmem_zone_t	*xfs_inode_zone;
+	extern kmem_zone_t	*xfs_trans_zone;
+	extern kmem_zone_t	*xfs_da_state_zone;
+	extern kmem_zone_t	*xfs_dabuf_zone;
+	extern kmem_zone_t	*xfs_efd_zone;
+	extern kmem_zone_t	*xfs_efi_zone;
+	extern kmem_zone_t	*xfs_buf_item_zone;
+	extern kmem_zone_t	*xfs_chashlist_zone;
+	extern xfs_inode_t	**xfs_refcache;
+
+	xfs_cleanup_procfs();
+	xfs_sysctl_unregister();
+	if (xfs_refcache) {
+		kmem_free(xfs_refcache,
+			XFS_REFCACHE_SIZE_MAX * sizeof(xfs_inode_t *));
+	}
+
+	kmem_cache_destroy(xfs_bmap_free_item_zone);
+	kmem_cache_destroy(xfs_btree_cur_zone);
+	kmem_cache_destroy(xfs_inode_zone);
+	kmem_cache_destroy(xfs_trans_zone);
+	kmem_cache_destroy(xfs_da_state_zone);
+	kmem_cache_destroy(xfs_dabuf_zone);
+	kmem_cache_destroy(xfs_buf_item_zone);
+	kmem_cache_destroy(xfs_efd_zone);
+	kmem_cache_destroy(xfs_efi_zone);
+	kmem_cache_destroy(xfs_ifork_zone);
+	kmem_cache_destroy(xfs_ili_zone);
+	kmem_cache_destroy(xfs_chashlist_zone);
+	_XQM_ZONE_DESTROY(qm_dqzone);
+	_XQM_ZONE_DESTROY(qm_dqtrxzone);
+	_ACL_ZONE_DESTROY(xfs_acl_zone);
+#if  (defined(DEBUG) || defined(CONFIG_XFS_VNODE_TRACING))
+	ktrace_uninit();
+#endif
+}
+
+/*
+ * xfs_cmountfs
+ *
+ * This function is the common mount file system function for XFS.
+ */
+STATIC int
+xfs_cmountfs(
+	vfs_t		*vfsp,
+	dev_t		ddev,
+	dev_t		logdev,
+	dev_t		rtdev,
+	struct xfs_mount_args *ap,
+	struct cred	*cr)
+{
+	xfs_mount_t	*mp;
+	int		error = 0;
+
+
+	/*
+	 * Allocate VFS private data (xfs mount structure).
+	 */
+	mp = xfs_mount_init();
+
+	vfs_insertbhv(vfsp, &mp->m_bhv, &xfs_vfsops, mp);
+
+	/*
+	 * Open data, real time, and log devices now - order is important.
+	 */
+	mp->m_ddev_targp = pagebuf_lock_enable(ddev, 0);
+	if (IS_ERR(mp->m_ddev_targp)) {
+		error = PTR_ERR(mp->m_ddev_targp);
+		goto error2;
+	}
+
+	if (rtdev != 0) {
+		mp->m_rtdev_targp = pagebuf_lock_enable(rtdev, 1);
+		if (IS_ERR(mp->m_rtdev_targp)) {
+			error = PTR_ERR(mp->m_rtdev_targp);
+			pagebuf_lock_disable(mp->m_ddev_targp, 0);
+			goto error2;
+		}
+	}
+
+	if (logdev != ddev) {
+		mp->m_logdev_targp = pagebuf_lock_enable(logdev, 1);
+		if (IS_ERR(mp->m_logdev_targp)) {
+			error = PTR_ERR(mp->m_logdev_targp);
+			pagebuf_lock_disable(mp->m_ddev_targp, 1);
+			if (mp->m_rtdev_targp)
+				pagebuf_lock_disable(mp->m_rtdev_targp, 1);
+			goto error2;
+		}
+	}
+
+	/* Values are in BBs */
+	if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
+		/*
+		 * At this point the superblock has not been read
+		 * in, therefore we do not know the block size.
+		 * Before the mount call ends, we will convert
+		 * these to FSBs.
+		 */
+		mp->m_dalign = ap->sunit;
+		mp->m_swidth = ap->swidth;
+	} else {
+		mp->m_dalign = 0;
+		mp->m_swidth = 0;
+	}
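+	/*
+	 * For example (illustrative values): a stripe unit of 64 BBs
+	 * is 32k, which on a filesystem with 4k blocks becomes 8 FSBs
+	 * once the superblock has been read in.
+	 */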
+
+	if (logdev != 0) {
+		if (logdev == ddev) {
+			mp->m_logdev_targp = mp->m_ddev_targp;
+		} else {
+			/* Set the log device's block size */
+			set_blocksize(logdev, 512);
+		}
+
+		if (ap->logbufs != 0 && ap->logbufs != -1 &&
+		    (ap->logbufs < XLOG_NUM_ICLOGS ||
+		     ap->logbufs > XLOG_MAX_ICLOGS)) {
+			cmn_err(CE_WARN, 
+				"XFS: invalid logbufs value: %d [not %d-%d]\n",
+				ap->logbufs, XLOG_NUM_ICLOGS, XLOG_MAX_ICLOGS);
+			error = XFS_ERROR(EINVAL);
+			goto error3;
+		}
+		mp->m_logbufs = ap->logbufs;
+		if (ap->logbufsize != -1 &&
+		    ap->logbufsize != 16 * 1024 &&
+		    ap->logbufsize != 32 * 1024 &&
+		    ap->logbufsize != 64 * 1024 &&
+		    ap->logbufsize != 128 * 1024 &&
+		    ap->logbufsize != 256 * 1024) {
+			cmn_err(CE_WARN,
+		"XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]\n",
+				ap->logbufsize);
+			error = XFS_ERROR(EINVAL);
+			goto error3;
+		}
+		mp->m_logbsize = ap->logbufsize;
+		mp->m_fsname_len = strlen(ap->fsname) + 1;
+		mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP);
+		strcpy(mp->m_fsname, ap->fsname);
+	}
+	if (rtdev != 0) {
+		if (rtdev == ddev || rtdev == logdev) {
+			cmn_err(CE_WARN,
+	"XFS: Cannot mount filesystem with identical rtdev and logdev.");
+			error = XFS_ERROR(EINVAL);
+			goto error3;
+		} else {
+			/* Set the realtime device's block size */
+			set_blocksize(rtdev, 512);
+		}
+	}
+
+	/*
+	 * Pull in the 'wsync' and 'ino64' mount options before we do the real
+	 * work of mounting and recovery.  The arg pointer will
+	 * be NULL when we are being called from the root mount code.
+	 */
+#if XFS_BIG_FILESYSTEMS
+	mp->m_inoadd = 0;
+#endif
+	if (ap != NULL) {
+		if (ap->flags & XFSMNT_WSYNC)
+			mp->m_flags |= XFS_MOUNT_WSYNC;
+#if XFS_BIG_FILESYSTEMS
+		if (ap->flags & XFSMNT_INO64) {
+			mp->m_flags |= XFS_MOUNT_INO64;
+			mp->m_inoadd = XFS_INO64_OFFSET;
+		}
+#endif
+		if (ap->flags & XFSMNT_NOATIME)
+			mp->m_flags |= XFS_MOUNT_NOATIME;
+
+		if (ap->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA |
+				 XFSMNT_QUOTAMAYBE))
+			xfs_qm_mount_quotainit(mp, ap->flags);
+
+		if (ap->flags & XFSMNT_RETERR)
+			mp->m_flags |= XFS_MOUNT_RETERR;
+
+		if (ap->flags & XFSMNT_NOALIGN)
+			mp->m_flags |= XFS_MOUNT_NOALIGN;
+
+		if (ap->flags & XFSMNT_OSYNCISOSYNC)
+			mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
+
+		/* Default on Linux */
+		if (1 || ap->flags & XFSMNT_32BITINODES)
+			mp->m_flags |= XFS_MOUNT_32BITINODES;
+
+		if (ap->flags & XFSMNT_IRIXSGID)
+			mp->m_flags |= XFS_MOUNT_IRIXSGID;
+
+		if (ap->flags & XFSMNT_IOSIZE) {
+			if (ap->iosizelog > XFS_MAX_IO_LOG ||
+			    ap->iosizelog < XFS_MIN_IO_LOG) {
+				cmn_err(CE_WARN,
+			"XFS: invalid log iosize: %d [not %d-%d]",
+					ap->iosizelog, XFS_MIN_IO_LOG,
+					XFS_MAX_IO_LOG);
+				error = XFS_ERROR(EINVAL);
+				goto error3;
+			}
+
+			mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
+			mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
+		}
+
+		/*
+		 * no recovery flag requires a read-only mount
+		 */
+		if (ap->flags & XFSMNT_NORECOVERY) {
+			if (!(vfsp->vfs_flag & VFS_RDONLY)) {
+				cmn_err(CE_WARN,
+		"XFS: tried to mount a FS read-write without recovery!");
+				error = XFS_ERROR(EINVAL);
+				goto error3;
+			}
+			mp->m_flags |= XFS_MOUNT_NORECOVERY;
+		}
+
+		if (ap->flags & XFSMNT_NOUUID)
+			mp->m_flags |= XFS_MOUNT_NOUUID;
+		if (ap->flags & XFSMNT_NOLOGFLUSH)
+			mp->m_flags |= XFS_MOUNT_NOLOGFLUSH;
+	}
+
+	/*
+	 * read in superblock to check read-only flags and shared
+	 * mount status
+	 */
+	if ((error = xfs_readsb(mp)))
+		goto error3;
+
+	/* Fail a mount where the logbuf is smaller than the log stripe */
+	if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
+		if (((ap->logbufsize == -1) &&
+		     (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) ||
+		    (ap->logbufsize < mp->m_sb.sb_logsunit)) {
+			cmn_err(CE_WARN, "XFS: "
+				"logbuf size must be greater than or equal to log stripe size");
+			xfs_freesb(mp);
+			error = XFS_ERROR(EINVAL);
+			goto error3;
+		}
+	} else {
+		/* Fail a mount if the logbuf is larger than 32K */
+		if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) {
+			cmn_err(CE_WARN, "XFS: "
+		"logbuf size for version 1 logs must be 16K or 32K");
+			xfs_freesb(mp);
+			error = XFS_ERROR(EINVAL);
+			goto error3;
+		}
+	}
+
+	pagebuf_target_blocksize(mp->m_ddev_targp, mp->m_sb.sb_blocksize);
+	if (logdev != 0 && logdev != ddev)
+		pagebuf_target_blocksize(mp->m_logdev_targp, mp->m_sb.sb_blocksize);
+	if (rtdev != 0)
+		pagebuf_target_blocksize(mp->m_rtdev_targp, mp->m_sb.sb_blocksize);
+
+	/*
+	 * prohibit r/w mounts of read-only filesystems
+	 */
+	if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) &&
+	    !(vfsp->vfs_flag & VFS_RDONLY)) {
+		cmn_err(CE_WARN, "XFS: "
+			"cannot mount a read-only filesystem as read-write");
+		error = XFS_ERROR(EROFS);
+		xfs_freesb(mp);
+		goto error3;
+	}
+
+	/*
+	 * check for shared mount.
+	 */
+	if (ap && ap->flags & XFSMNT_SHARED) {
+		if (!XFS_SB_VERSION_HASSHARED(&mp->m_sb)) {
+			error = XFS_ERROR(EINVAL);
+			xfs_freesb(mp);
+			goto error3;
+		}
+
+		/*
+		 * For IRIX 6.5, shared mounts must have the shared
+		 * version bit set, have the persistent readonly
+		 * field set, be shared version 0, and be mounted
+		 * read-only.
+		 */
+		if (!(vfsp->vfs_flag & VFS_RDONLY) ||
+		    !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
+		    mp->m_sb.sb_shared_vn != 0) {
+			error = XFS_ERROR(EINVAL);
+			xfs_freesb(mp);
+			goto error3;
+		}
+
+		mp->m_flags |= XFS_MOUNT_SHARED;
+
+		/*
+		 * Shared XFS V0 can't deal with DMI.  Return EINVAL.
+		 */
+		if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI)) {
+			error = XFS_ERROR(EINVAL);
+			xfs_freesb(mp);
+			goto error3;
+		}
+	}
+
+	if ((error = xfs_mountfs(vfsp, mp, ddev, 0)) == 0)
+		return 0;
+
+	/*
+	 * Be careful not to clobber the value of 'error' here.
+	 */
+ error3:
+	/* It's impossible to get here before buftargs are filled */
+	xfs_binval(mp->m_ddev_targp);
+	pagebuf_lock_disable(mp->m_ddev_targp, 0);
+	if (logdev && logdev != ddev) {
+		xfs_binval(mp->m_logdev_targp);
+		pagebuf_lock_disable(mp->m_logdev_targp, 1);
+	}
+	if (rtdev != 0) {
+		xfs_binval(mp->m_rtdev_targp);
+		pagebuf_lock_disable(mp->m_rtdev_targp, 1);
+	}
+ error2:
+	if (error) {
+		xfs_mount_free(mp, 1);
+	}
+	return error;
+}
+
+/*
+ * xfs_mount
+ *
+ * The file system configurations are:
+ *	(1) device (partition) with data and internal log
+ *	(2) logical volume with data and log subvolumes.
+ *	(3) logical volume with data, log, and realtime subvolumes.
+ */
+STATIC int
+xfs_mount(
+	vfs_t			*vfsp,
+	struct xfs_mount_args	*args,
+	cred_t			*credp)
+{
+	dev_t		ddev;
+	dev_t		logdev;
+	dev_t		rtdev;
+	int		error;
+
+	error = spectodevs(vfsp->vfs_super, args, &ddev, &logdev, &rtdev);
+	if (!error)
+		error = xfs_cmountfs(vfsp, ddev, logdev, rtdev, args, credp);
+	return (error);
+}
+
+/*
+ * xfs_ibusy searches for a busy inode in the mounted file system.
+ *
+ * Return 0 if there are no active inodes, otherwise return 1.
+ */
+STATIC int
+xfs_ibusy(
+	xfs_mount_t	*mp)
+{
+	xfs_inode_t	*ip;
+	vnode_t		*vp;
+	int		busy;
+
+	busy = 0;
+
+	XFS_MOUNT_ILOCK(mp);
+
+	ip = mp->m_inodes;
+	if (ip == NULL) {
+		XFS_MOUNT_IUNLOCK(mp);
+		return busy;
+	}
+
+	do {
+		/* Skip markers inserted by xfs_sync */
+		if (ip->i_mount == NULL) {
+			ip = ip->i_mnext;
+			continue;
+		}
+
+		vp = XFS_ITOV_NULL(ip);
+
+		if (vp && vn_count(vp) != 0) {
+			if (xfs_ibusy_check(ip, vn_count(vp)) == 0) {
+				ip = ip->i_mnext;
+				continue;
+			}
+#ifdef DEBUG
+			printk("busy vp=0x%p ip=0x%p inum %Ld count=%d\n",
+				vp, ip, ip->i_ino, vn_count(vp));
+#endif
+			busy++;
+		}
+		ip = ip->i_mnext;
+	} while ((ip != mp->m_inodes) && !busy);
+
+	XFS_MOUNT_IUNLOCK(mp);
+
+	return busy;
+}
+
+
+STATIC int
+xfs_unmount(
+	bhv_desc_t	*bdp,
+	int		flags,
+	cred_t		*credp)
+{
+	xfs_mount_t	*mp;
+	xfs_inode_t	*rip;
+	vnode_t		*rvp = 0;
+	struct vfs	*vfsp = bhvtovfs(bdp);
+	int		unmount_event_wanted = 0;
+	int		unmount_event_flags = 0;
+	int		xfs_unmountfs_needed = 0;
+	int		error;
+
+	mp = XFS_BHVTOM(bdp);
+	rip = mp->m_rootip;
+	rvp = XFS_ITOV(rip);
+
+	if (vfsp->vfs_flag & VFS_DMI) {
+		bhv_desc_t	*rbdp;
+
+		rbdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(rvp), &xfs_vnodeops);
+		error = dm_send_namesp_event(DM_EVENT_PREUNMOUNT,
+				rbdp, DM_RIGHT_NULL, rbdp, DM_RIGHT_NULL,
+				NULL, NULL, 0, 0,
+				(mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))?
+					0:DM_FLAGS_UNWANTED);
+		if (error)
+			return XFS_ERROR(error);
+		unmount_event_wanted = 1;
+		unmount_event_flags = (mp->m_dmevmask & (1<<DM_EVENT_UNMOUNT))?
+					0 : DM_FLAGS_UNWANTED;
+	}
+
+	/*
+	 * First blow any referenced inode from this file system
+	 * out of the reference cache, and delete the timer.
+	 */
+	xfs_refcache_purge_mp(mp);
+	del_timer_sync(&mp->m_sbdirty_timer);
+
+	/*
+	 * Make sure there are no active users.
+	 */
+	if (xfs_ibusy(mp)) {
+		error = XFS_ERROR(EBUSY);
+		printk("xfs_unmount: xfs_ibusy says error/%d\n", error);
+		goto out;
+	}
+
+	XFS_bflush(mp->m_ddev_targp);
+	error = xfs_unmount_flush(mp, 0);
+	if (error)
+		goto out;
+
+	ASSERT(vn_count(rvp) == 1);
+
+	/*
+	 * Drop the reference count, and then
+	 * run the vnode through vn_remove.
+	 */
+	rvp->v_flag |= VPURGE;			/* OK for vn_purge */
+	VN_RELE(rvp);
+
+	vn_remove(rvp);
+
+	/*
+	 * If we're forcing a shutdown, typically because of a media error,
+	 * we want to make sure we invalidate dirty pages that belong to
+	 * referenced vnodes as well.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp))  {
+		error = xfs_sync(&mp->m_bhv,
+			 (SYNC_WAIT | SYNC_CLOSE), credp);
+		ASSERT(error != EFSCORRUPTED);
+	}
+	xfs_unmountfs_needed = 1;
+
+out:
+	/*	Send DMAPI event, if required.
+	 *	Then do xfs_unmountfs() if needed.
+	 *	Then return error (or zero).
+	 */
+	if (unmount_event_wanted) {
+		/* Note: mp structure must still exist for
+		 * dm_send_unmount_event() call.
+		 */
+		dm_send_unmount_event(vfsp, error == 0 ? rvp : NULL,
+			DM_RIGHT_NULL, 0, error, unmount_event_flags);
+	}
+	if (xfs_unmountfs_needed) {
+		/*
+		 * Call common unmount function to flush to disk
+		 * and free the super block buffer & mount structures.
+		 */
+		xfs_unmountfs(mp, credp);
+	}
+
+	return XFS_ERROR(error);
+}
+
+/*
+ * xfs_unmount_flush implements a set of flush operations on special
+ * inodes, which are needed as a separate set of operations so that
+ * they can be called as part of the relocation process.
+ */
+int
+xfs_unmount_flush(
+	xfs_mount_t	*mp,		/* Mount structure we are getting
+					   rid of. */
+	int		relocation)	/* Called from vfs relocation. */
+{
+	xfs_inode_t	*rip = mp->m_rootip;
+	xfs_inode_t	*rbmip;
+	xfs_inode_t	*rsumip = NULL;
+	vnode_t		*rvp = XFS_ITOV(rip);
+	int		error;
+
+	xfs_ilock(rip, XFS_ILOCK_EXCL);
+	xfs_iflock(rip);
+
+	/*
+	 * Flush out the real time inodes.
+	 */
+	if ((rbmip = mp->m_rbmip) != NULL) {
+		xfs_ilock(rbmip, XFS_ILOCK_EXCL);
+		xfs_iflock(rbmip);
+		error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC);
+		xfs_iunlock(rbmip, XFS_ILOCK_EXCL);
+
+		if (error == EFSCORRUPTED)
+			goto fscorrupt_out;
+
+		ASSERT(vn_count(XFS_ITOV(rbmip)) == 1);
+
+		rsumip = mp->m_rsumip;
+		xfs_ilock(rsumip, XFS_ILOCK_EXCL);
+		xfs_iflock(rsumip);
+		error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC);
+		xfs_iunlock(rsumip, XFS_ILOCK_EXCL);
+
+		if (error == EFSCORRUPTED)
+			goto fscorrupt_out;
+
+		ASSERT(vn_count(XFS_ITOV(rsumip)) == 1);
+	}
+
+	/*
+	 * synchronously flush root inode to disk
+	 */
+	error = xfs_iflush(rip, XFS_IFLUSH_SYNC);
+
+	if (error == EFSCORRUPTED)
+		goto fscorrupt_out2;
+
+	if (vn_count(rvp) != 1 && !relocation) {
+		xfs_iunlock(rip, XFS_ILOCK_EXCL);
+		error = XFS_ERROR(EBUSY);
+		return (error);
+	}
+	/*
+	 * Release dquot that rootinode, rbmino and rsumino might be holding,
+	 * flush and purge the quota inodes.
+	 */
+	error = xfs_qm_unmount_quotas(mp);
+	if (error == EFSCORRUPTED)
+		goto fscorrupt_out2;
+
+	if (rbmip) {
+		VN_RELE(XFS_ITOV(rbmip));
+		VN_RELE(XFS_ITOV(rsumip));
+	}
+
+	xfs_iunlock(rip, XFS_ILOCK_EXCL);
+	return (0);
+
+fscorrupt_out:
+	xfs_ifunlock(rip);
+
+fscorrupt_out2:
+	xfs_iunlock(rip, XFS_ILOCK_EXCL);
+
+	error = XFS_ERROR(EFSCORRUPTED);
+	return (error);
+}
+
+/*
+ * xfs_root extracts the root vnode from a vfs.
+ *
+ * vfsp -- the vfs struct for the desired file system
+ * vpp	-- address of the caller's vnode pointer which should be
+ *	   set to the desired fs root vnode
+ */
+STATIC int
+xfs_root(
+	bhv_desc_t	*bdp,
+	vnode_t		**vpp)
+{
+	vnode_t *vp;
+
+	vp = XFS_ITOV((XFS_BHVTOM(bdp))->m_rootip);
+	VN_HOLD(vp);
+	*vpp = vp;
+
+	return 0;
+}
+
+/*
+ * xfs_statvfs
+ *
+ * Fill in the statvfs structure for the given file system.  We use
+ * the superblock lock in the mount structure to ensure a consistent
+ * snapshot of the counters returned.
+ */
+STATIC int
+xfs_statvfs(
+	bhv_desc_t	*bdp,
+	struct statfs	*statp,
+	vnode_t		*vp)
+{
+	__uint64_t	fakeinos;
+	xfs_extlen_t	lsize;
+	xfs_mount_t	*mp;
+	xfs_sb_t	*sbp;
+	unsigned long	s;
+
+	mp = XFS_BHVTOM(bdp);
+	sbp = &(mp->m_sb);
+
+	statp->f_type = XFS_SB_MAGIC;
+
+	s = XFS_SB_LOCK(mp);
+	statp->f_bsize = sbp->sb_blocksize;
+	lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
+	statp->f_blocks = sbp->sb_dblocks - lsize;
+	statp->f_bfree = statp->f_bavail = sbp->sb_fdblocks;
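+	/*
+	 * fakeinos estimates how many more inodes could be created from
+	 * the remaining free space, since XFS allocates inodes on
+	 * demand.  E.g. (illustrative numbers) with sb_inopblog == 4,
+	 * i.e. 16 inodes per block, 1000 free blocks could hold
+	 * 16000 additional inodes.
+	 */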
+	fakeinos = statp->f_bfree << sbp->sb_inopblog;
+#if XFS_BIG_FILESYSTEMS
+	fakeinos += mp->m_inoadd;
+#endif
+	statp->f_files =
+	    MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
+	if (mp->m_maxicount)
+#if XFS_BIG_FILESYSTEMS
+		if (!mp->m_inoadd)
+#endif
+			statp->f_files =
+			    MIN(statp->f_files, (long)mp->m_maxicount);
+	statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+	XFS_SB_UNLOCK(mp, s);
+
+	statp->f_fsid.val[0] = mp->m_dev;
+	statp->f_fsid.val[1] = 0;
+	statp->f_namelen = MAXNAMELEN - 1;
+
+	return 0;
+}
+
+
+/*
+ * xfs_sync flushes any pending I/O to file system vfsp.
+ *
+ * This routine is called by vfs_sync() to make sure that things make it
+ * out to disk eventually, on sync() system calls to flush out everything,
+ * and when the file system is unmounted.  For the vfs_sync() case, all
+ * we really need to do is sync out the log to make all of our meta-data
+ * updates permanent (except for timestamps).  For calls from pflushd(),
+ * dirty pages are kept moving by calling pdflush() on the inodes
+ * containing them.  We also flush the inodes that we can lock without
+ * sleeping and the superblock if we can lock it without sleeping from
+ * vfs_sync() so that items at the tail of the log are always moving out.
+ *
+ * Flags:
+ *	SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
+ *		       to sleep if we can help it.  All we really need
+ *		       to do is ensure that the log is synced at least
+ *		       periodically.  We also push the inodes and
+ *		       superblock if we can lock them without sleeping
+ *			and they are not pinned.
+ *	SYNC_ATTR    - We need to flush the inodes.  If SYNC_BDFLUSH is not
+ *		       set, then we really want to lock each inode and flush
+ *		       it.
+ *	SYNC_WAIT    - All the flushes that take place in this call should
+ *		       be synchronous.
+ *	SYNC_DELWRI  - This tells us to push dirty pages associated with
+ *		       inodes.	SYNC_WAIT and SYNC_BDFLUSH are used to
+ *		       determine if they should be flushed sync, async, or
+ *		       delwri.
+ *	SYNC_CLOSE   - This flag is passed when the system is being
+ *		       unmounted.  We should sync and invalidate everything.
+ *	SYNC_FSDATA  - This indicates that the caller would like to make
+ *		       sure the superblock is safe on disk.  We can ensure
+ *		       this by simply making sure the log gets flushed
+ *		       if SYNC_BDFLUSH is set, and by actually writing it
+ *		       out otherwise.
+ *
+ */
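+/*
+ * For example, the forced-shutdown unmount path above calls
+ * xfs_sync(&mp->m_bhv, SYNC_WAIT | SYNC_CLOSE, credp) to flush and
+ * invalidate everything synchronously, while the periodic vfs_sync()
+ * path would typically pass SYNC_BDFLUSH (possibly together with
+ * SYNC_ATTR and SYNC_FSDATA) so that nothing in here sleeps.  The
+ * periodic combination is illustrative; the exact flags are chosen
+ * by the caller.
+ */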
+/*ARGSUSED*/
+STATIC int
+xfs_sync(
+	bhv_desc_t	*bdp,
+	int		flags,
+	cred_t		*credp)
+{
+	xfs_mount_t	*mp;
+
+	mp = XFS_BHVTOM(bdp);
+	return (xfs_syncsub(mp, flags, 0, NULL));
+}
+
+/*
+ * xfs sync routine for internal use
+ *
+ * This routine supports all of the flags defined for the generic VFS_SYNC
+ * interface as explained above under xfs_sync.  In the interests of not
+ * changing interfaces within the 6.5 family, additional internally-
+ * required functions are specified within a separate xflags parameter,
+ * only available by calling this routine.
+ *
+ * xflags:
+ *	XFS_XSYNC_RELOC - Sync for relocation.	Don't try to get behavior
+ *			  locks as this will cause you to hang.	 Not all
+ *			  combinations of flags are necessarily supported
+ *			  when this is specified.
+ */
+int
+xfs_syncsub(
+	xfs_mount_t	*mp,
+	int		flags,
+	int		xflags,
+	int		*bypassed)
+{
+	xfs_inode_t	*ip = NULL;
+	xfs_inode_t	*ip_next;
+	xfs_buf_t	*bp;
+	vnode_t		*vp = NULL;
+	vmap_t		vmap;
+	int		error;
+	int		last_error;
+	uint64_t	fflag;
+	uint		lock_flags;
+	uint		base_lock_flags;
+	uint		log_flags;
+	boolean_t	mount_locked;
+	boolean_t	vnode_refed;
+	int		preempt;
+	int		do_mmap_flush;
+	xfs_dinode_t	*dip;
+	xfs_buf_log_item_t	*bip;
+	xfs_iptr_t	*ipointer;
+#ifdef DEBUG
+	boolean_t	ipointer_in = B_FALSE;
+
+#define IPOINTER_SET	ipointer_in = B_TRUE
+#define IPOINTER_CLR	ipointer_in = B_FALSE
+#else
+#define IPOINTER_SET
+#define IPOINTER_CLR
+#endif
+
+
+/* Insert a marker record into the inode list after inode ip. The list
+ * must be locked when this is called. After the call the list will no
+ * longer be locked.
+ */
+#define IPOINTER_INSERT(ip, mp) { \
+		ASSERT(ipointer_in == B_FALSE); \
+		ipointer->ip_mnext = ip->i_mnext; \
+		ipointer->ip_mprev = ip; \
+		ip->i_mnext = (xfs_inode_t *)ipointer; \
+		ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
+		preempt = 0; \
+		XFS_MOUNT_IUNLOCK(mp); \
+		mount_locked = B_FALSE; \
+		IPOINTER_SET; \
+	}
+
+/* Remove the marker from the inode list. If the marker was the only item
+ * in the list then there are no remaining inodes and we should zero out
+ * the whole list. If we are the current head of the list then move the head
+ * past us.
+ */
+#define IPOINTER_REMOVE(ip, mp) { \
+		ASSERT(ipointer_in == B_TRUE); \
+		if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
+			ip = ipointer->ip_mnext; \
+			ip->i_mprev = ipointer->ip_mprev; \
+			ipointer->ip_mprev->i_mnext = ip; \
+			if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
+				mp->m_inodes = ip; \
+			} \
+		} else { \
+			ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
+			mp->m_inodes = NULL; \
+			ip = NULL; \
+		} \
+		IPOINTER_CLR; \
+	}
+
+#define PREEMPT_MASK	0x7f
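+
+/* The marker protocol, roughly (a sketch of how the macros above are
+ * used by the loop below):
+ *
+ *	XFS_MOUNT_ILOCK(mp);
+ *	...walk to some inode ip...
+ *	IPOINTER_INSERT(ip, mp);	(drops the mount lock)
+ *	...blocking work on ip: vn_get(), xfs_iflush(), etc...
+ *	XFS_MOUNT_ILOCK(mp);
+ *	IPOINTER_REMOVE(ip, mp);	(ip now points past the marker)
+ */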
+
+	if (bypassed)
+		*bypassed = 0;
+	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
+		return 0;
+	error = 0;
+	last_error = 0;
+	preempt = 0;
+
+	/* Allocate a reference marker */
+	ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
+
+	fflag = XFS_B_ASYNC;		/* default is don't wait */
+	if (flags & SYNC_BDFLUSH)
+		fflag = XFS_B_DELWRI;
+	if (flags & SYNC_WAIT)
+		fflag = 0;		/* synchronous overrides all */
+	do_mmap_flush = (flags & (SYNC_DELWRI|SYNC_BDFLUSH)) !=
+						(SYNC_DELWRI|SYNC_BDFLUSH);
+
+	base_lock_flags = XFS_ILOCK_SHARED;
+	if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
+		/*
+		 * We need the I/O lock if we're going to call any of
+		 * the flush/inval routines.
+		 */
+		base_lock_flags |= XFS_IOLOCK_SHARED;
+	}
+
+	/*
+	 * Sync out the log.  This ensures that the log is periodically
+	 * flushed even if there is not enough activity to fill it up.
+	 */
+	if (flags & SYNC_WAIT) {
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	} else {
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+	}
+
+	XFS_MOUNT_ILOCK(mp);
+
+	ip = mp->m_inodes;
+
+	mount_locked = B_TRUE;
+	vnode_refed  = B_FALSE;
+
+	IPOINTER_CLR;
+
+	do {
+		ASSERT(ipointer_in == B_FALSE);
+		ASSERT(vnode_refed == B_FALSE);
+
+		lock_flags = base_lock_flags;
+
+		/*
+		 * There were no inodes in the list, just break out
+		 * of the loop.
+		 */
+		if (ip == NULL) {
+			break;
+		}
+
+		/*
+		 * We found another sync thread marker - skip it
+		 */
+		if (ip->i_mount == NULL) {
+			ip = ip->i_mnext;
+			continue;
+		}
+
+		vp = XFS_ITOV_NULL(ip);
+
+		/*
+		 * If the vnode is gone then the inode is being torn
+		 * down; call reclaim if it is flushed, else let the
+		 * regular flush code deal with it later in the loop.
+		 */
+
+		if (vp == NULL) {
+			/* Skip ones already in reclaim */
+			if (ip->i_flags & XFS_IRECLAIM) {
+				ip = ip->i_mnext;
+				continue;
+			}
+			if ((ip->i_update_core == 0) &&
+			    ((ip->i_itemp == NULL) ||
+			    !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+				if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
+					ip = ip->i_mnext;
+				} else if ((xfs_ipincount(ip) == 0) &&
+				    xfs_iflock_nowait(ip)) {
+					IPOINTER_INSERT(ip, mp);
+
+					xfs_finish_reclaim(ip, 1,
+						XFS_IFLUSH_DELWRI_ELSE_SYNC);
+
+					XFS_MOUNT_ILOCK(mp);
+					mount_locked = B_TRUE;
+					IPOINTER_REMOVE(ip, mp);
+				} else {
+					xfs_iunlock(ip, XFS_ILOCK_EXCL);
+					ip = ip->i_mnext;
+				}
+				continue;
+			}
+		}
+
+		if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
+			XFS_MOUNT_IUNLOCK(mp);
+			kmem_free(ipointer, sizeof(xfs_iptr_t));
+			return 0;
+		}
+
+		/*
+		 * If this is just vfs_sync() or pflushd() calling
+		 * then we can skip inodes for which it looks like
+		 * there is nothing to do.  Since we don't have the
+		 * inode locked this is racy, but these are periodic
+		 * calls so it doesn't matter.	For the others we want
+		 * to know for sure, so we at least try to lock them.
+		 */
+		if (flags & SYNC_BDFLUSH) {
+			if (((ip->i_itemp == NULL) ||
+			     !(ip->i_itemp->ili_format.ilf_fields &
+			       XFS_ILOG_ALL)) &&
+			    (ip->i_update_core == 0)) {
+				ip = ip->i_mnext;
+				continue;
+			}
+		}
+
+		/*
+		 * Try to lock without sleeping.  We're out of order with
+		 * the inode list lock here, so if we fail we need to drop
+		 * the mount lock and try again.  If we're called from
+		 * bdflush() here, then don't bother.
+		 *
+		 * The inode lock here actually coordinates with the
+		 * almost spurious inode lock in xfs_ireclaim() to prevent
+		 * the vnode we handle here without holding a reference
+		 * from being freed while we use it.  If we lock the inode
+		 * while it's on the mount list here, then the spurious inode
+		 * lock in xfs_ireclaim() after the inode is pulled from
+		 * the mount list will sleep until we release it here.
+		 * This keeps the vnode from being freed while we reference
+		 * it.	It is also cheaper and simpler than actually doing
+		 * a vn_get() for every inode we touch here.
+		 */
+		if (xfs_ilock_nowait(ip, lock_flags) == 0) {
+
+			if ((flags & SYNC_BDFLUSH) || (vp == NULL)) {
+				ip = ip->i_mnext;
+				continue;
+			}
+
+			/*
+			 * We need to unlock the inode list lock in order
+			 * to lock the inode. Insert a marker record into
+			 * the inode list to remember our position; dropping
+			 * the lock is now done inside the IPOINTER_INSERT
+			 * macro.
+			 *
+			 * We also use the inode list lock to protect us
+			 * in taking a snapshot of the vnode version number
+			 * for use in calling vn_get().
+			 */
+			VMAP(vp, ip, vmap);
+			IPOINTER_INSERT(ip, mp);
+
+			vp = vn_get(vp, &vmap);
+			if (vp == NULL) {
+				/*
+				 * The vnode was reclaimed once we let go
+				 * of the inode list lock.  Skip to the
+				 * next list entry. Remove the marker.
+				 */
+
+				XFS_MOUNT_ILOCK(mp);
+
+				mount_locked = B_TRUE;
+				vnode_refed  = B_FALSE;
+
+				IPOINTER_REMOVE(ip, mp);
+
+				continue;
+			}
+
+			xfs_ilock(ip, lock_flags);
+
+			ASSERT(vp == XFS_ITOV(ip));
+			ASSERT(ip->i_mount == mp);
+
+			vnode_refed = B_TRUE;
+		}
+
+		/* From here on in the loop we may have a marker record
+		 * in the inode list.
+		 */
+
+		if ((flags & SYNC_CLOSE)  && (vp != NULL)) {
+			/*
+			 * This is the shutdown case.  We just need to
+			 * flush and invalidate all the pages associated
+			 * with the inode.  Drop the inode lock since
+			 * we can't hold it across calls to the buffer
+			 * cache.
+			 *
+			 * We don't set the VREMAPPING bit in the vnode
+			 * here, because we don't hold the vnode lock
+			 * exclusively.	 It doesn't really matter, though,
+			 * because we only come here when we're shutting
+			 * down anyway.
+			 */
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+			if (XFS_FORCED_SHUTDOWN(mp)) {
+				if (xflags & XFS_XSYNC_RELOC) {
+					fs_tosspages(XFS_ITOBHV(ip), 0, -1,
+						     FI_REMAPF);
+				}
+				else {
+					VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
+				}
+			} else {
+				if (xflags & XFS_XSYNC_RELOC) {
+					fs_flushinval_pages(XFS_ITOBHV(ip),
+							    0, -1, FI_REMAPF);
+				}
+				else {
+					VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_REMAPF);
+				}
+			}
+
+			xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+		} else if ((flags & SYNC_DELWRI) && (vp != NULL)) {
+			if (VN_DIRTY(vp)) {
+				/* We need to have dropped the lock here,
+				 * so insert a marker if we have not already
+				 * done so.
+				 */
+				if (mount_locked) {
+					IPOINTER_INSERT(ip, mp);
+				}
+
+				/*
+				 * Drop the inode lock since we can't hold it
+				 * across calls to the buffer cache.
+				 */
+				xfs_iunlock(ip, XFS_ILOCK_SHARED);
+				if (do_mmap_flush) {
+					VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1,
+						fflag, FI_NONE, error);
+				} else {
+					fsync_inode_data_buffers(LINVFS_GET_IP(vp));
+				}
+				xfs_ilock(ip, XFS_ILOCK_SHARED);
+			}
+
+		}
+
+		if (flags & SYNC_BDFLUSH) {
+			if ((flags & SYNC_ATTR) &&
+			    ((ip->i_update_core) ||
+			     ((ip->i_itemp != NULL) &&
+			      (ip->i_itemp->ili_format.ilf_fields != 0)))) {
+
+				/* Insert marker and drop lock if not already
+				 * done.
+				 */
+				if (mount_locked) {
+					IPOINTER_INSERT(ip, mp);
+				}
+
+				/*
+				 * We don't want the periodic flushing of the
+				 * inodes by vfs_sync() to interfere with
+				 * I/O to the file, especially read I/O
+				 * where it is only the access time stamp
+				 * that is being flushed out.  To prevent
+				 * long periods where we have both inode
+				 * locks held shared here while reading the
+				 * inode's buffer in from disk, we drop the
+				 * inode lock while reading in the inode
+				 * buffer.  We have to release the buffer
+				 * and reacquire the inode lock so that they
+				 * are acquired in the proper order (inode
+				 * locks first).  The buffer will go at the
+				 * end of the lru chain, though, so we can
+				 * expect it to still be there when we go
+				 * for it again in xfs_iflush().
+				 */
+				if ((xfs_ipincount(ip) == 0) &&
+				    xfs_iflock_nowait(ip)) {
+
+					xfs_ifunlock(ip);
+					xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+					error = xfs_itobp(mp, NULL, ip,
+							  &dip, &bp, 0);
+					if (!error) {
+						xfs_buf_relse(bp);
+					} else {
+						/* Bailing out, remove the
+						 * marker and free it.
+						 */
+						XFS_MOUNT_ILOCK(mp);
+
+						IPOINTER_REMOVE(ip, mp);
+
+						XFS_MOUNT_IUNLOCK(mp);
+
+						ASSERT(!(lock_flags &
+							XFS_IOLOCK_SHARED));
+
+						kmem_free(ipointer,
+							sizeof(xfs_iptr_t));
+						return (0);
+					}
+
+					/*
+					 * Since we dropped the inode lock,
+					 * the inode may have been reclaimed.
+					 * Therefore, we reacquire the mount
+					 * lock and check whether our inode
+					 * was the one reclaimed. If this happened
+					 * then the ipointer marker will no
+					 * longer point back at us. In this
+					 * case, move ip along to the inode
+					 * after the marker, remove the marker
+					 * and continue.
+					 */
+					XFS_MOUNT_ILOCK(mp);
+					mount_locked = B_TRUE;
+
+					if (ip != ipointer->ip_mprev) {
+						IPOINTER_REMOVE(ip, mp);
+
+						ASSERT(!vnode_refed);
+						ASSERT(!(lock_flags &
+							XFS_IOLOCK_SHARED));
+						continue;
+					}
+
+					ASSERT(ip->i_mount == mp);
+
+					if (xfs_ilock_nowait(ip,
+						    XFS_ILOCK_SHARED) == 0) {
+						ASSERT(ip->i_mount == mp);
+						/*
+						 * We failed to reacquire
+						 * the inode lock without
+						 * sleeping, so just skip
+						 * the inode for now.  We
+						 * clear the ILOCK bit from
+						 * the lock_flags so that we
+						 * won't try to drop a lock
+						 * we don't hold below.
+						 */
+						lock_flags &= ~XFS_ILOCK_SHARED;
+						IPOINTER_REMOVE(ip_next, mp);
+					} else if ((xfs_ipincount(ip) == 0) &&
+						   xfs_iflock_nowait(ip)) {
+						ASSERT(ip->i_mount == mp);
+						/*
+						 * Since this is vfs_sync()
+						 * calling we only flush the
+						 * inode out if we can lock
+						 * it without sleeping and
+						 * it is not pinned.  Drop
+						 * the mount lock here so
+						 * that we don't hold it for
+						 * too long. We already have
+						 * a marker in the list here.
+						 */
+						XFS_MOUNT_IUNLOCK(mp);
+						mount_locked = B_FALSE;
+						error = xfs_iflush(ip,
+							   XFS_IFLUSH_DELWRI);
+					} else {
+						ASSERT(ip->i_mount == mp);
+						IPOINTER_REMOVE(ip_next, mp);
+					}
+				}
+
+			}
+
+		} else {
+			if ((flags & SYNC_ATTR) &&
+			    ((ip->i_update_core) ||
+			     ((ip->i_itemp != NULL) &&
+			      (ip->i_itemp->ili_format.ilf_fields != 0)))) {
+				if (mount_locked) {
+					IPOINTER_INSERT(ip, mp);
+				}
+
+				if (flags & SYNC_WAIT) {
+					xfs_iflock(ip);
+					error = xfs_iflush(ip,
+							   XFS_IFLUSH_SYNC);
+				} else {
+					/*
+					 * If we can't acquire the flush
+					 * lock, then the inode is already
+					 * being flushed so don't bother
+					 * waiting.  If we can lock it then
+					 * do a delwri flush so we can
+					 * combine multiple inode flushes
+					 * in each disk write.
+					 */
+					if (xfs_iflock_nowait(ip)) {
+						error = xfs_iflush(ip,
+							   XFS_IFLUSH_DELWRI);
+					}
+					else if (bypassed)
+						(*bypassed)++;
+				}
+			}
+		}
+
+		if (lock_flags != 0) {
+			xfs_iunlock(ip, lock_flags);
+		}
+
+		if (vnode_refed) {
+			/*
+			 * If we had to take a reference on the vnode
+			 * above, then wait until after we've unlocked
+			 * the inode to release the reference.	This is
+			 * because we can be already holding the inode
+			 * lock when VN_RELE() calls xfs_inactive().
+			 *
+			 * Make sure to drop the mount lock before calling
+			 * VN_RELE() so that we don't trip over ourselves if
+			 * we have to go for the mount lock again in the
+			 * inactive code.
+			 */
+			if (mount_locked) {
+				IPOINTER_INSERT(ip, mp);
+			}
+
+			VN_RELE(vp);
+
+			vnode_refed = B_FALSE;
+		}
+
+		if (error) {
+			last_error = error;
+		}
+
+		/*
+		 * bail out if the filesystem is corrupted.
+		 */
+		if (error == EFSCORRUPTED)  {
+			if (!mount_locked) {
+				XFS_MOUNT_ILOCK(mp);
+				IPOINTER_REMOVE(ip, mp);
+			}
+			XFS_MOUNT_IUNLOCK(mp);
+			ASSERT(ipointer_in == B_FALSE);
+			kmem_free(ipointer, sizeof(xfs_iptr_t));
+			return XFS_ERROR(error);
+		}
+
+		/* Let other threads have a chance at the mount lock
+		 * if we have looped many times without dropping the
+		 * lock.
+		 */
+		if ((++preempt & PREEMPT_MASK) == 0) {
+			if (mount_locked) {
+				IPOINTER_INSERT(ip, mp);
+			}
+		}
+
+		if (mount_locked == B_FALSE) {
+			XFS_MOUNT_ILOCK(mp);
+			mount_locked = B_TRUE;
+			IPOINTER_REMOVE(ip, mp);
+			continue;
+		}
+
+		ASSERT(ipointer_in == B_FALSE);
+		ip = ip->i_mnext;
+
+	} while (ip->i_mnext != mp->m_inodes);
+
+	XFS_MOUNT_IUNLOCK(mp);
+
+	ASSERT(ipointer_in == B_FALSE);
+
+	/*
+	 * Get the Quota Manager to flush the dquots in a similar manner.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if ((error = xfs_qm_sync(mp, flags))) {
+			/*
+			 * If we got an IO error, we will be shutting down.
+			 * So, there's nothing more for us to do here.
+			 */
+			ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
+			if (XFS_FORCED_SHUTDOWN(mp)) {
+				kmem_free(ipointer, sizeof(xfs_iptr_t));
+				return XFS_ERROR(error);
+			}
+		}
+	}
+
+	/*
+	 * Flushing out dirty data above probably generated more
+	 * log activity, so if this isn't vfs_sync() then flush
+	 * the log again.  If SYNC_WAIT is set then do it synchronously.
+	 */
+	if (!(flags & SYNC_BDFLUSH)) {
+		log_flags = XFS_LOG_FORCE;
+		if (flags & SYNC_WAIT) {
+			log_flags |= XFS_LOG_SYNC;
+		}
+		xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
+	}
+
+	if (flags & SYNC_FSDATA) {
+		/*
+		 * If this is vfs_sync() then only sync the superblock
+		 * if we can lock it without sleeping and it is not pinned.
+		 */
+		if (flags & SYNC_BDFLUSH) {
+			bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
+			if (bp != NULL) {
+				bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
+				if ((bip != NULL) &&
+				    xfs_buf_item_dirty(bip)) {
+					if (!(XFS_BUF_ISPINNED(bp))) {
+						XFS_BUF_ASYNC(bp);
+						error = xfs_bwrite(mp, bp);
+					} else {
+						xfs_buf_relse(bp);
+					}
+				} else {
+					xfs_buf_relse(bp);
+				}
+			}
+		} else {
+			bp = xfs_getsb(mp, 0);
+			/*
+			 * If the buffer is pinned then push on the log so
+			 * we won't get stuck waiting in the write for
+			 * someone, maybe ourselves, to flush the log.
+			 * Even though we just pushed the log above, we
+			 * did not have the superblock buffer locked at
+			 * that point so it can become pinned in between
+			 * there and here.
+			 */
+			if (XFS_BUF_ISPINNED(bp)) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE);
+			}
+			XFS_BUF_BFLAGS(bp) |= fflag;
+			error = xfs_bwrite(mp, bp);
+		}
+		if (error) {
+			last_error = error;
+		}
+	}
+
+	/*
+	 * If this is the 30 second sync, then kick some entries out of
+	 * the reference cache.	 This ensures that idle entries are
+	 * eventually kicked out of the cache.
+	 */
+	if (flags & SYNC_BDFLUSH) {
+		xfs_refcache_purge_some(mp);
+	}
+
+	/*
+	 * Now check to see if the log needs a "dummy" transaction.
+	 */
+
+	if (xfs_log_need_covered(mp)) {
+		xfs_trans_t *tp;
+
+		/*
+		 * Put a dummy transaction in the log to tell
+		 * recovery that all others are OK.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+		if ((error = xfs_trans_reserve(tp, 0,
+				XFS_ICHANGE_LOG_RES(mp),
+				0, 0, 0)))  {
+			xfs_trans_cancel(tp, 0);
+			kmem_free(ipointer, sizeof(xfs_iptr_t));
+			return error;
+		}
+
+		ip = mp->m_rootip;
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		error = xfs_trans_commit(tp, 0, NULL);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	/*
+	 * When shutting down, we need to ensure that the AIL is pushed
+	 * to disk or the filesystem can appear corrupt from the PROM.
+	 */
+	if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
+		XFS_bflush(mp->m_ddev_targp);
+		if (mp->m_rtdev_targp) {
+			XFS_bflush(mp->m_rtdev_targp);
+		}
+	}
+
+	kmem_free(ipointer, sizeof(xfs_iptr_t));
+	return XFS_ERROR(last_error);
+}
+
+
+/*
+ * xfs_vget - called by DMAPI to get vnode from file handle
+ */
+STATIC int
+xfs_vget(
+	bhv_desc_t	*bdp,
+	vnode_t		**vpp,
+	fid_t		*fidp)
+{
+	xfs_fid_t	*xfid;
+	xfs_inode_t	*ip;
+	int		error;
+	xfs_ino_t	ino;
+	unsigned int	igen;
+	xfs_mount_t	*mp;
+	struct inode	*inode = NULL;
+
+	xfid  = (struct xfs_fid *)fidp;
+
+	if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) {
+		ino  = xfid->xfs_fid_ino;
+		igen = xfid->xfs_fid_gen;
+	} else {
+		/*
+		 * Invalid.  Since handles can be created in user space
+		 * and passed in via gethandle(), this is not cause for
+		 * a panic.
+		 */
+		return XFS_ERROR(EINVAL);
+	}
+	mp = XFS_BHVTOM(bdp);
+	error = xfs_iget(mp, NULL, ino, XFS_ILOCK_SHARED, &ip, 0);
+	if (error) {
+		*vpp = NULL;
+		return error;
+	}
+	if (ip == NULL) {
+		*vpp = NULL;
+		return XFS_ERROR(EIO);
+	}
+
+	if (ip->i_d.di_mode == 0 || (igen && (ip->i_d.di_gen != igen))) {
+		xfs_iput_new(ip, XFS_ILOCK_SHARED);
+		*vpp = NULL;
+		return XFS_ERROR(ENOENT);
+	}
+
+	*vpp = XFS_ITOV(ip);
+	inode = LINVFS_GET_IP((*vpp));
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	error = linvfs_revalidate_core(inode, ATTR_COMM);
+	if (error) {
+		iput(inode);
+		return XFS_ERROR(error);
+	}
+	return 0;
+}
+
+
+vfsops_t xfs_vfsops = {
+	.vfs_mount		= xfs_mount,
+	.vfs_dounmount		= fs_dounmount,
+	.vfs_unmount		= xfs_unmount,
+	.vfs_root		= xfs_root,
+	.vfs_statvfs		= xfs_statvfs,
+	.vfs_sync		= xfs_sync,
+	.vfs_vget		= xfs_vget,
+	.vfs_force_shutdown	= xfs_do_force_shutdown,
+#ifdef CONFIG_XFS_DMAPI
+	.vfs_dmapi_mount	= xfs_dm_mount,
+	.vfs_dmapi_fsys_vector	= xfs_dm_get_fsys_vector,
+#endif
+};
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfs_vnodeops.c linux-2.4.20-pre10-mjc2/fs/xfs/xfs_vnodeops.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfs_vnodeops.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfs_vnodeops.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,5269 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <asm/fcntl.h>
+
+
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 2 extents back from
+ * bmapi.
+ */
+#define SYMLINK_MAPS 2
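+
+/*
+ * (Worked out: ceil(1024 / 512) = 2, so even a maximally long symlink
+ * target occupies at most two blocks, hence at most two extents.)
+ */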
+
+extern int xfs_ioctl(bhv_desc_t *, struct inode *, struct file *,
+			unsigned int, unsigned long);
+
+
+#ifdef XFS_RW_TRACE
+STATIC void
+xfs_ctrunc_trace(
+	int		tag,
+	xfs_inode_t	*ip);
+#else
+#define xfs_ctrunc_trace(tag, ip)
+#endif /* XFS_RW_TRACE */
+
+/*
+ * For xfs, we check that the file isn't too big to be opened by this kernel.
+ * No other open action is required for regular files.	Devices are handled
+ * through the specfs file system, pipes through fifofs.  Device and
+ * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
+ * when a new vnode is first looked up or created.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_open(
+	bhv_desc_t	*bdp,
+	cred_t		*credp)
+{
+	int		mode;
+	vnode_t		*vp;
+	xfs_inode_t	*ip;
+
+	vp = BHV_TO_VNODE(bdp);
+	ip = XFS_BHVTOI(bdp);
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return XFS_ERROR(EIO);
+
+	/*
+	 * If it's a directory with any blocks, read-ahead block 0
+	 * as we're almost certain to have the next operation be a read there.
+	 */
+	if (vp->v_type == VDIR && ip->i_d.di_nextents > 0) {
+		mode = xfs_ilock_map_shared(ip);
+		(void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+		xfs_iunlock(ip, mode);
+	}
+	return 0;
+}
+
+
+/*
+ * xfs_getattr
+ */
+/*ARGSUSED*/
+int
+xfs_getattr(
+	bhv_desc_t	*bdp,
+	vattr_t		*vap,
+	int		flags,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp;
+	vnode_t		*vp;
+
+	vp  = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_getattr", (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	if (!(flags & ATTR_LAZY))
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+	vap->va_size = ip->i_d.di_size;
+	if (vap->va_mask == AT_SIZE) {
+		if (!(flags & ATTR_LAZY))
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return 0;
+	}
+	vap->va_nblocks =
+		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
+	vap->va_fsid = mp->m_dev;
+#if XFS_BIG_FILESYSTEMS
+	vap->va_nodeid = ip->i_ino + mp->m_inoadd;
+#else
+	vap->va_nodeid = ip->i_ino;
+#endif
+	vap->va_nlink = ip->i_d.di_nlink;
+
+	/*
+	 * Quick exit for non-stat callers
+	 */
+	if ((vap->va_mask & ~(AT_SIZE|AT_FSID|AT_NODEID|AT_NLINK)) == 0) {
+		if (!(flags & ATTR_LAZY))
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return 0;
+	}
+
+	/*
+	 * Copy from in-core inode.
+	 */
+	vap->va_type = vp->v_type;
+	vap->va_mode = ip->i_d.di_mode & MODEMASK;
+	vap->va_uid = ip->i_d.di_uid;
+	vap->va_gid = ip->i_d.di_gid;
+	vap->va_projid = ip->i_d.di_projid;
+
+	/*
+	 * Check vnode type block/char vs. everything else.
+	 * Do it with a bitmask because that's faster than looking
+	 * for multiple values individually.
+	 */
+	if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
+		vap->va_rdev = 0;
+
+		if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
+
+#if 0
+			/* Large block sizes confuse various
+			 * user space programs, so letting the
+			 * stripe size through is not a good
+			 * idea for now.
+			 */
+			vap->va_blksize = mp->m_swidth ?
+				/*
+				 * If the underlying volume is a stripe, then
+				 * return the stripe width in bytes as the
+				 * recommended I/O size.
+				 */
+				(mp->m_swidth << mp->m_sb.sb_blocklog) :
+				/*
+				 * Return the largest of the preferred buffer
+				 * sizes since doing small I/Os into larger
+				 * buffers causes buffers to be decommissioned.
+				 * The value returned is in bytes.
+				 */
+				(1 << (int)MAX(mp->m_readio_log,
+					       mp->m_writeio_log));
+
+#else
+			vap->va_blksize =
+				/*
+				 * Return the largest of the preferred buffer
+				 * sizes since doing small I/Os into larger
+				 * buffers causes buffers to be decommissioned.
+				 * The value returned is in bytes.
+				 */
+				1 << (int)MAX(mp->m_readio_log,
+					       mp->m_writeio_log);
+#endif
+		} else {
+
+			/*
+			 * If the file blocks are being allocated from a
+			 * realtime partition, then return the inode's
+			 * realtime extent size or the realtime volume's
+			 * extent size.
+			 */
+			vap->va_blksize = ip->i_d.di_extsize ?
+				(ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
+				(mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
+		}
+	} else {
+		vap->va_rdev = IRIX_DEV_TO_KDEVT(ip->i_df.if_u2.if_rdev);
+		vap->va_blksize = BLKDEV_IOSIZE;
+	}
+
+	vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec;
+	vap->va_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
+	vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
+	vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
+	vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
+	vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
+
+	/*
+	 * Exit for stat callers.  See if any of the remaining
+	 * fields actually need to be filled in.
+	 */
+	if ((vap->va_mask &
+	     (AT_XFLAGS|AT_EXTSIZE|AT_NEXTENTS|AT_ANEXTENTS|
+	      AT_GENCOUNT|AT_VCODE)) == 0) {
+		if (!(flags & ATTR_LAZY))
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return 0;
+	}
+	/*
+	 * convert di_flags to xflags
+	 */
+	vap->va_xflags =
+		((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
+			XFS_XFLAG_REALTIME : 0) |
+		((ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) ?
+			XFS_XFLAG_PREALLOC : 0);
+	vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
+	vap->va_nextents =
+		(ip->i_df.if_flags & XFS_IFEXTENTS) ?
+			ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
+			ip->i_d.di_nextents;
+	if (ip->i_afp != NULL)
+		vap->va_anextents =
+			(ip->i_afp->if_flags & XFS_IFEXTENTS) ?
+				ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
+				 ip->i_d.di_anextents;
+	else
+		vap->va_anextents = 0;
+	vap->va_gencount = ip->i_d.di_gen;
+	vap->va_vcode = 0L;
+
+	if (!(flags & ATTR_LAZY))
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return 0;
+}
+
+
+/*
+ * xfs_setattr
+ */
+STATIC int
+xfs_setattr(
+	bhv_desc_t	*bdp,
+	vattr_t		*vap,
+	int		flags,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+	xfs_trans_t	*tp;
+	xfs_mount_t	*mp;
+	int		mask;
+	int		code;
+	uint		lock_flags;
+	uint		commit_flags=0;
+	uid_t		uid=0, iuid=0;
+	gid_t		gid=0, igid=0;
+	int		timeflags = 0;
+	vnode_t		*vp;
+	xfs_prid_t	projid=0, iprojid=0;
+	int		privileged;
+	int		mandlock_before, mandlock_after;
+	uint		qflags;
+	struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
+	int		file_owner;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_setattr", (inst_t *)__return_address);
+	/*
+	 * Cannot set certain attributes.
+	 */
+	mask = vap->va_mask;
+	if (mask & AT_NOSET) {
+		return XFS_ERROR(EINVAL);
+	}
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	/*
+	 * Timestamps do not need to be logged and hence do not
+	 * need to be done within a transaction.
+	 */
+	if (mask & AT_UPDTIMES) {
+		ASSERT((mask & ~AT_UPDTIMES) == 0);
+		timeflags = ((mask & AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
+			    ((mask & AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
+			    ((mask & AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
+		xfs_ichgtime(ip, timeflags);
+		return 0;
+	}
+
+	olddquot1 = olddquot2 = NULL;
+	udqp = gdqp = NULL;
+
+	/*
+	 * If disk quotas are on, we make sure that the dquots do exist
+	 * on disk before we start any other transactions. Trying to do
+	 * this later is messy. We don't care to take a readlock to look
+	 * at the ids in the inode here, because we can't hold it across
+	 * the trans_reserve.
+	 * If the IDs do change before we take the ilock, we're covered
+	 * because the i_*dquot fields will get updated anyway.
+	 */
+	if (XFS_IS_QUOTA_ON(mp) && (mask & (AT_UID|AT_GID))) {
+		qflags = 0;
+		if (mask & AT_UID) {
+			uid = vap->va_uid;
+			qflags |= XFS_QMOPT_UQUOTA;
+		} else {
+			uid = ip->i_d.di_uid;
+		}
+		if (mask & AT_GID) {
+			gid = vap->va_gid;
+			qflags |= XFS_QMOPT_GQUOTA;
+		}  else {
+			gid = ip->i_d.di_gid;
+		}
+		/*
+		 * We take a reference when we initialize udqp and gdqp,
+		 * so it is important that we never blindly double trip on
+		 * the same variable. See xfs_create() for an example.
+		 */
+		ASSERT(udqp == NULL);
+		ASSERT(gdqp == NULL);
+		if ((code = xfs_qm_vop_dqalloc(mp, ip, uid, gid, qflags,
+						    &udqp, &gdqp)))
+			return (code);
+	}
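+
+	/*
+	 * Note the ordering: the dquots are allocated before any
+	 * transaction is reserved below, since allocating a dquot on
+	 * disk may itself require running a transaction.
+	 */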
+
+	/*
+	 * For the other attributes, we acquire the inode lock and
+	 * first do an error checking pass.
+	 */
+	tp = NULL;
+	lock_flags = XFS_ILOCK_EXCL;
+	if (!(mask & AT_SIZE)) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+		commit_flags = 0;
+		if ((code = xfs_trans_reserve(tp, 0,
+					     XFS_ICHANGE_LOG_RES(mp), 0,
+					     0, 0))) {
+			lock_flags = 0;
+			goto error_return;
+		}
+	} else {
+		if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
+		    !(flags & ATTR_DMI)) {
+			code = xfs_dm_send_data_event (DM_EVENT_TRUNCATE, bdp,
+				vap->va_size, 0, AT_DELAY_FLAG(flags), NULL);
+			if (code) {
+				lock_flags = 0;
+				goto error_return;
+			}
+		}
+		lock_flags |= XFS_IOLOCK_EXCL;
+	}
+
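+	/*
+	 * Note: when lock_flags contains both lock types, xfs_ilock()
+	 * acquires the iolock before the ilock, matching the lock
+	 * ordering used throughout XFS.
+	 */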
+	xfs_ilock(ip, lock_flags);
+
+	if (_MAC_XFS_IACCESS(ip, MACWRITE, credp)) {
+		code = XFS_ERROR(EACCES);
+		goto error_return;
+	}
+
+	/* boolean: are we the file owner? */
+	file_owner = (current->fsuid == ip->i_d.di_uid);
+
+	/*
+	 * Change various properties of a file.
+	 * Only the owner or users with CAP_FOWNER
+	 * capability may do these things.
+	 */
+	if (mask & (AT_MODE|AT_XFLAGS|AT_EXTSIZE|AT_UID|AT_GID|AT_PROJID)) {
+		/*
+		 * CAP_FOWNER overrides the following restrictions:
+		 *
+		 * The user ID of the calling process must be equal
+		 * to the file owner ID, except in cases where the
+		 * CAP_FSETID capability is applicable.
+		 */
+		if (!file_owner && !capable(CAP_FOWNER)) {
+			code = XFS_ERROR(EPERM);
+			goto error_return;
+		}
+
+		/*
+		 * CAP_FSETID overrides the following restrictions:
+		 *
+		 * The effective user ID of the calling process shall match
+		 * the file owner when setting the set-user-ID and
+		 * set-group-ID bits on that file.
+		 *
+		 * The effective group ID or one of the supplementary group
+		 * IDs of the calling process shall match the group owner of
+		 * the file when setting the set-group-ID bit on that file.
+		 */
+		if (mask & AT_MODE) {
+			mode_t m = 0;
+
+			if ((vap->va_mode & ISUID) && !file_owner)
+				m |= ISUID;
+			if ((vap->va_mode & ISGID) &&
+			    !in_group_p((gid_t)ip->i_d.di_gid))
+				m |= ISGID;
+#if 0
+			/* Linux allows this, Irix doesn't. */
+			if ((vap->va_mode & ISVTX) && vp->v_type != VDIR)
+				m |= ISVTX;
+#endif
+			if (m && !capable(CAP_FSETID))
+				vap->va_mode &= ~m;
+		}
+	}
+
+	/*
+	 * Change file ownership.  Must be the owner or privileged.
+	 * If the system was configured with the "restricted_chown"
+	 * option, the owner is not permitted to give away the file,
+	 * and can change the group id only to a group of which he
+	 * or she is a member.
+	 */
+	if (mask & (AT_UID|AT_GID|AT_PROJID)) {
+		/*
+		 * These IDs could have changed since we last looked at them.
+		 * But, we're assured that if the ownership did change
+		 * while we didn't have the inode locked, the inode's dquot(s)
+		 * would have changed also.
+		 */
+		iuid = ip->i_d.di_uid;
+		iprojid = ip->i_d.di_projid;
+		igid = ip->i_d.di_gid;
+		gid = (mask & AT_GID) ? vap->va_gid : igid;
+		uid = (mask & AT_UID) ? vap->va_uid : iuid;
+		projid = (mask & AT_PROJID) ? (xfs_prid_t)vap->va_projid :
+			 iprojid;
+
+		/*
+		 * CAP_CHOWN overrides the following restrictions:
+		 *
+		 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
+		 * shall override the restriction that a process cannot
+		 * change the user ID of a file it owns and the restriction
+		 * that the group ID supplied to the chown() function
+		 * shall be equal to either the group ID or one of the
+		 * supplementary group IDs of the calling process.
+		 *
+		 * XXX: How does restricted_chown affect projid?
+		 */
+		if (restricted_chown &&
+		    (iuid != uid || (igid != gid &&
+				     !in_group_p((gid_t)gid))) &&
+		    !capable(CAP_CHOWN)) {
+			code = XFS_ERROR(EPERM);
+			goto error_return;
+		}
+		/*
+		 * Do a quota reservation only if uid or gid is actually
+		 * going to change.
+		 */
+		if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
+		    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
+			ASSERT(tp);
+			/*
+			 * XXX:casey - This may result in unnecessary auditing.
+			 */
+			privileged = capable(CAP_FOWNER);
+			if ((code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
+							  privileged ?
+							  XFS_QMOPT_FORCE_RES :
+							  0)))
+				/* out of quota */
+				goto error_return;
+		}
+	}
+
+	/*
+	 * Truncate file.  Must have write permission and not be a directory.
+	 */
+	if (mask & AT_SIZE) {
+		/* Short circuit the truncate case for zero length files */
+		if ((vap->va_size == 0) &&
+		   (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			lock_flags &= ~XFS_ILOCK_EXCL;
+			if (mask & AT_CTIME)
+				xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+			code = 0;
+			goto error_return;
+		}
+
+		if (vp->v_type == VDIR) {
+			code = XFS_ERROR(EISDIR);
+			goto error_return;
+		} else if (vp->v_type != VREG) {
+			code = XFS_ERROR(EINVAL);
+			goto error_return;
+		}
+		/*
+		 * Make sure that the dquots are attached to the inode.
+		 */
+		if (XFS_IS_QUOTA_ON(mp) && XFS_NOT_DQATTACHED(mp, ip)) {
+			if ((code = xfs_qm_dqattach(ip, XFS_QMOPT_ILOCKED)))
+				goto error_return;
+		}
+	}
+
+	/*
+	 * Change file access or modified times.
+	 */
+	if (mask & (AT_ATIME|AT_MTIME)) {
+		if (!file_owner) {
+			if ((flags & ATTR_UTIME) &&
+			    !capable(CAP_FOWNER)) {
+				code = XFS_ERROR(EPERM);
+				goto error_return;
+			}
+		}
+	}
+
+	/*
+	 * Change extent size or realtime flag.
+	 */
+	if (mask & (AT_EXTSIZE|AT_XFLAGS)) {
+		/*
+		 * Can't change extent size if any extents are allocated.
+		 */
+		if (ip->i_d.di_nextents && (mask & AT_EXTSIZE) &&
+		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
+		     vap->va_extsize) ) {
+			code = XFS_ERROR(EINVAL);	/* EFBIG? */
+			goto error_return;
+		}
+
+		/*
+		 * Can't set extent size unless the file is marked, or
+		 * about to be marked as a realtime file.
+		 *
+		 * This check will be removed when fixed size extents
+		 * with buffered data writes are implemented.
+		 */
+		if ((mask & AT_EXTSIZE)				&&
+		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
+		     vap->va_extsize) &&
+		    (!((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
+		       ((mask & AT_XFLAGS) &&
+			(vap->va_xflags & XFS_XFLAG_REALTIME))))) {
+			code = XFS_ERROR(EINVAL);
+			goto error_return;
+		}
+
+		/*
+		 * Can't change realtime flag if any extents are allocated.
+		 */
+		if (ip->i_d.di_nextents && (mask & AT_XFLAGS) &&
+		    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
+		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
+			code = XFS_ERROR(EINVAL);	/* EFBIG? */
+			goto error_return;
+		}
+		/*
+		 * Extent size must be a multiple of the appropriate block
+		 * size, if set at all.
+		 */
+		if ((mask & AT_EXTSIZE) && vap->va_extsize != 0) {
+			xfs_extlen_t	size;
+
+			if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
+			    ((mask & AT_XFLAGS) &&
+			    (vap->va_xflags & XFS_XFLAG_REALTIME))) {
+				size = mp->m_sb.sb_rextsize <<
+				       mp->m_sb.sb_blocklog;
+			} else {
+				size = mp->m_sb.sb_blocksize;
+			}
+			if (vap->va_extsize % size) {
+				code = XFS_ERROR(EINVAL);
+				goto error_return;
+			}
+		}
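+		/*
+		 * e.g. on a 4k-block filesystem a requested extsize of
+		 * 65536 bytes (16 blocks) passes, while 6000 bytes fails
+		 * the "% size" test above with EINVAL.
+		 */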
+		/*
+		 * If the realtime flag is set then the filesystem must
+		 * have a realtime subvolume, and the inode's extent size
+		 * must be a multiple of the realtime extent size.
+		 */
+		if ((mask & AT_XFLAGS) &&
+		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
+			if ((mp->m_sb.sb_rblocks == 0) ||
+			    (mp->m_sb.sb_rextsize == 0) ||
+			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
+				code = XFS_ERROR(EINVAL);
+				goto error_return;
+			}
+		}
+	}
+
+	/*
+	 * Now we can make the changes.	 Before we join the inode
+	 * to the transaction, if AT_SIZE is set then take care of
+	 * the part of the truncation that must be done without the
+	 * inode lock.	This needs to be done before joining the inode
+	 * to the transaction, because the inode cannot be unlocked
+	 * once it is a part of the transaction.
+	 */
+	if (mask & AT_SIZE) {
+		if (vap->va_size > ip->i_d.di_size) {
+			code = xfs_igrow_start(ip, vap->va_size, credp);
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		} else if (vap->va_size < ip->i_d.di_size) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
+					    (xfs_fsize_t)vap->va_size);
+			code = 0;
+		} else {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			code = 0;
+		}
+		if (code) {
+			ASSERT(tp == NULL);
+			lock_flags &= ~XFS_ILOCK_EXCL;
+			ASSERT(lock_flags == XFS_IOLOCK_EXCL);
+			goto error_return;
+		}
+		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
+		if ((code = xfs_trans_reserve(tp, 0,
+					     XFS_ITRUNCATE_LOG_RES(mp), 0,
+					     XFS_TRANS_PERM_LOG_RES,
+					     XFS_ITRUNCATE_LOG_COUNT))) {
+			xfs_trans_cancel(tp, 0);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			return code;
+		}
+		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+	}
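+
+	/*
+	 * Note: the size-change path above took a permanent log
+	 * reservation (XFS_TRANS_PERM_LOG_RES) so xfs_itruncate_finish()
+	 * can roll the transaction; it is released at commit time via
+	 * the XFS_TRANS_RELEASE_LOG_RES commit flag set above.
+	 */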
+
+	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ihold(tp, ip);
+
+	/* determine whether mandatory locking mode changes */
+	mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
+
+	/*
+	 * Truncate file.  Must have write permission and not be a directory.
+	 */
+	if (mask & AT_SIZE) {
+		if (vap->va_size > ip->i_d.di_size) {
+			xfs_igrow_finish(tp, ip, vap->va_size,
+			    !(flags & ATTR_DMI));
+		} else if ((vap->va_size < ip->i_d.di_size) ||
+			   ((vap->va_size == 0) && ip->i_d.di_nextents)) {
+			/*
+			 * signal a sync transaction unless
+			 * we're truncating an already unlinked
+			 * file on a wsync filesystem
+			 */
+			code = xfs_itruncate_finish(&tp, ip,
+					    (xfs_fsize_t)vap->va_size,
+					    XFS_DATA_FORK,
+					    ((ip->i_d.di_nlink != 0 ||
+					      !(mp->m_flags & XFS_MOUNT_WSYNC))
+					     ? 1 : 0));
+			if (code) {
+				goto abort_return;
+			}
+		}
+		/*
+		 * Have to do this even if the file's size doesn't change.
+		 */
+		timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+	}
+
+	/*
+	 * Change file access modes.
+	 */
+	if (mask & AT_MODE) {
+		ip->i_d.di_mode &= IFMT;
+		ip->i_d.di_mode |= vap->va_mode & ~IFMT;
+
+		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
+		timeflags |= XFS_ICHGTIME_CHG;
+	}
+
+	/*
+	 * Change file ownership.  Must be the owner or privileged.
+	 * If the system was configured with the "restricted_chown"
+	 * option, the owner is not permitted to give away the file,
+	 * and can change the group id only to a group of which he
+	 * or she is a member.
+	 */
+	if (mask & (AT_UID|AT_GID|AT_PROJID)) {
+		/*
+		 * CAP_FSETID overrides the following restrictions:
+		 *
+		 * The set-user-ID and set-group-ID bits of a file will be
+		 * cleared upon successful return from chown()
+		 */
+		if ((ip->i_d.di_mode & (ISUID|ISGID)) &&
+		    !capable(CAP_FSETID)) {
+			ip->i_d.di_mode &= ~(ISUID|ISGID);
+		}
+
+		/*
+		 * Change the ownerships and register quota modifications
+		 * in the transaction.
+		 */
+		if (iuid != uid) {
+			if (XFS_IS_UQUOTA_ON(mp)) {
+				ASSERT(mask & AT_UID);
+				ASSERT(udqp);
+				ASSERT(xfs_qm_dqid(udqp) == (xfs_dqid_t)uid);
+				olddquot1 = xfs_qm_vop_chown(tp, ip,
+							     &ip->i_udquot,
+							     udqp);
+				/*
+				 * We'll dqrele olddquot at the end.
+				 */
+			}
+			ip->i_d.di_uid = uid;
+		}
+		if (igid != gid) {
+			if (XFS_IS_GQUOTA_ON(mp)) {
+				ASSERT(mask & AT_GID);
+				ASSERT(gdqp);
+				ASSERT(xfs_qm_dqid(gdqp) == gid);
+				olddquot2 = xfs_qm_vop_chown(tp, ip,
+							     &ip->i_gdquot,
+							     gdqp);
+			}
+			ip->i_d.di_gid = gid;
+		}
+		if (iprojid != projid) {
+			ip->i_d.di_projid = projid;
+			/*
+			 * We may have to rev the inode as well as
+			 * the superblock version number since projids didn't
+			 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
+			 */
+			if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
+				xfs_bump_ino_vers2(tp, ip);
+		}
+
+		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
+		timeflags |= XFS_ICHGTIME_CHG;
+	}
+
+
+	/*
+	 * Change file access or modified times.
+	 */
+	if (mask & (AT_ATIME|AT_MTIME)) {
+		if (mask & AT_ATIME) {
+			ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
+			ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
+			ip->i_update_core = 1;
+			timeflags &= ~XFS_ICHGTIME_ACC;
+		}
+		if (mask & AT_MTIME) {
+			ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
+			ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
+			timeflags &= ~XFS_ICHGTIME_MOD;
+			timeflags |= XFS_ICHGTIME_CHG;
+		}
+	}
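+
+	/*
+	 * Note: explicitly supplied times clear the corresponding
+	 * "set to current time" bits in timeflags above, so they are
+	 * not overwritten by the xfs_ichgtime() call near the end of
+	 * this function.
+	 */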
+
+	/*
+	 * Change XFS-added attributes.
+	 */
+	if (mask & (AT_EXTSIZE|AT_XFLAGS)) {
+		if (mask & AT_EXTSIZE) {
+			/*
+			 * Converting bytes to fs blocks.
+			 */
+			ip->i_d.di_extsize = vap->va_extsize >>
+				mp->m_sb.sb_blocklog;
+		}
+		if (mask & AT_XFLAGS) {
+			ip->i_d.di_flags = 0;
+			if (vap->va_xflags & XFS_XFLAG_REALTIME) {
+				ip->i_d.di_flags |= XFS_DIFLAG_REALTIME;
+				/* This is replicated in the io core for
+				 * CXFS use
+				 */
+				ip->i_iocore.io_flags |= XFS_IOCORE_RT;
+			}
+			/* can't set PREALLOC this way, just ignore it */
+		}
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		timeflags |= XFS_ICHGTIME_CHG;
+	}
+
+	/*
+	 * Change file inode change time only if AT_CTIME set
+	 * AND we have been called by a DMI function.
+	 */
+	if ((flags & ATTR_DMI) && (mask & AT_CTIME)) {
+		ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
+		ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
+		ip->i_update_core = 1;
+		timeflags &= ~XFS_ICHGTIME_CHG;
+	}
+
+	/*
+	 * Send out timestamp changes that need to be set to the
+	 * current time.  Not done when called by a DMI function.
+	 */
+	if (timeflags && !(flags & ATTR_DMI))
+		xfs_ichgtime(ip, timeflags);
+
+	XFS_STATS_INC(xfsstats.xs_ig_attrchg);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 * This is slightly sub-optimal in that truncates require
+	 * two sync transactions instead of one for wsync filesystems.
+	 * One for the truncate and one for the timestamps since we
+	 * don't want to change the timestamps unless we're sure the
+	 * truncate worked.  Truncates are less than 1% of the LADDIS
+	 * mix so this probably isn't worth the trouble to optimize.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+
+	code = xfs_trans_commit(tp, commit_flags, NULL);
+
+	/*
+	 * If the (regular) file's mandatory locking mode changed, then
+	 * notify the vnode.  We do this under the inode lock to prevent
+	 * racing calls to vop_vnode_change.
+	 */
+	mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
+	if (mandlock_before != mandlock_after) {
+		VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
+				 mandlock_after);
+	}
+
+	xfs_iunlock(ip, lock_flags);
+
+	/*
+	 * Release any dquot(s) the inode had kept before chown.
+	 */
+	if (olddquot1)
+		xfs_qm_dqrele(olddquot1);
+	if (olddquot2)
+		xfs_qm_dqrele(olddquot2);
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	if (code) {
+		return code;
+	}
+
+	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
+	    !(flags & ATTR_DMI)) {
+		(void) dm_send_namesp_event (DM_EVENT_ATTRIBUTE, bdp, DM_RIGHT_NULL,
+				NULL, DM_RIGHT_NULL, NULL, NULL,
+			0, 0, AT_DELAY_FLAG(flags));
+	}
+	return 0;
+
+ abort_return:
+	commit_flags |= XFS_TRANS_ABORT;
+	/* FALLTHROUGH */
+ error_return:
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+	if (tp) {
+		xfs_trans_cancel(tp, commit_flags);
+	}
+	if (lock_flags != 0) {
+		xfs_iunlock(ip, lock_flags);
+	}
+	return code;
+} /* xfs_setattr */
+
+
+/*
+ * xfs_access
+ * Null conversion from vnode mode bits to inode mode bits, as in efs.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_access(
+	bhv_desc_t	*bdp,
+	int		mode,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+	int		error;
+
+	vn_trace_entry(BHV_TO_VNODE(bdp), "xfs_access",
+					       (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	error = xfs_iaccess(ip, mode, credp);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return error;
+}
+
+
+/*
+ * xfs_readlink
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_readlink(
+	bhv_desc_t	*bdp,
+	uio_t		*uiop,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+	int		count;
+	xfs_off_t	offset;
+	int		pathlen;
+	vnode_t		*vp;
+	int		error = 0;
+	xfs_mount_t	*mp;
+	int		nmaps;
+	xfs_bmbt_irec_t mval[SYMLINK_MAPS];
+	xfs_daddr_t	d;
+	int		byte_cnt;
+	int		n;
+	xfs_buf_t	*bp;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_readlink", (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+	ASSERT((ip->i_d.di_mode & IFMT) == IFLNK);
+
+	offset = uiop->uio_offset;
+	count = uiop->uio_resid;
+
+	if (offset < 0) {
+		error = XFS_ERROR(EINVAL);
+		goto error_return;
+	}
+	if (count <= 0) {
+		error = 0;
+		goto error_return;
+	}
+
+	if (!(uiop->uio_fmode & FINVIS)) {
+		xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
+	}
+
+	/*
+	 * See if the symlink is stored inline.
+	 */
+	pathlen = (int)ip->i_d.di_size;
+
+	if (ip->i_df.if_flags & XFS_IFINLINE) {
+		error = uiomove(ip->i_df.if_u1.if_data, pathlen, UIO_READ, uiop);
+	} else {
+		/*
+		 * Symlink not inline.	Call bmap to get it in.
+		 */
+		nmaps = SYMLINK_MAPS;
+
+		error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
+				  0, NULL, 0, mval, &nmaps, NULL);
+
+		if (error) {
+			goto error_return;
+		}
+
+		for (n = 0; n < nmaps; n++) {
+			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+			bp = xfs_buf_read(mp->m_ddev_targp, d,
+				      BTOBB(byte_cnt), 0);
+			error = XFS_BUF_GETERROR(bp);
+			if (error) {
+				xfs_ioerror_alert("xfs_readlink",
+					  ip->i_mount, bp, XFS_BUF_ADDR(bp));
+				xfs_buf_relse(bp);
+				goto error_return;
+			}
+			if (pathlen < byte_cnt)
+				byte_cnt = pathlen;
+			pathlen -= byte_cnt;
+
+			error = uiomove(XFS_BUF_PTR(bp), byte_cnt,
+					 UIO_READ, uiop);
+			xfs_buf_relse (bp);
+		}
+
+	}
+
+
+error_return:
+
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	return error;
+}
+
+/*
+ * xfs_fsync
+ *
+ * This is called to sync the inode and its data out to disk.
+ * We need to hold the I/O lock while flushing the data, and
+ * the inode lock while flushing the inode.  The inode lock CANNOT
+ * be held while flushing the data, so we acquire it after we're
+ * done with that.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_fsync(
+	bhv_desc_t	*bdp,
+	int		flag,
+	cred_t		*credp,
+	xfs_off_t	start,
+	xfs_off_t	stop)
+{
+	xfs_inode_t	*ip;
+	int		error;
+	/* REFERENCED */
+	int		error2;
+	/* REFERENCED */
+	int		syncall;
+	vnode_t		*vp;
+	xfs_trans_t	*tp;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_fsync", (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+
+	ASSERT(start >= 0 && stop >= -1);
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return XFS_ERROR(EIO);
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	syncall = error = error2 = 0;
+
+	if (stop == -1)	 {
+		ASSERT(start >= 0);
+		if (start == 0)
+			syncall = 1;
+		stop = xfs_file_last_byte(ip);
+	}
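+
+	/*
+	 * Note: syncall records that the entire file is being flushed;
+	 * it is only referenced by the ASSERT below.
+	 */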
+
+	/*
+	 * If we're invalidating, always flush since we want to
+	 * tear things down.  Otherwise, don't flush anything if
+	 * we're not dirty.
+	 */
+	if (flag & FSYNC_INVAL) {
+		if (ip->i_df.if_flags & XFS_IFEXTENTS &&
+		    ip->i_df.if_bytes > 0) {
+			VOP_FLUSHINVAL_PAGES(vp, start, -1, FI_REMAPF_LOCKED);
+		}
+		ASSERT(syncall == 0 || (VN_CACHED(vp) == 0));
+	} else {
+		/*
+		 * In the non-invalidating case, calls to fsync() do not
+		 * flush all the dirty mmap'd pages.  That requires a
+		 * call to msync().
+		 */
+		VOP_FLUSH_PAGES(vp, start, -1,
+				(flag & FSYNC_WAIT) ? 0 : XFS_B_ASYNC,
+				FI_NONE, error2);
+	}
+
+	if (error2) {
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		return XFS_ERROR(error2);
+	}
+
+	/*
+	 * We always need to make sure that the required inode state
+	 * is safe on disk.  The vnode might be clean, but the inode
+	 * can still be dirty because of committed transactions that
+	 * haven't hit the disk yet.
+	 * Likewise, there could be unflushed non-transactional
+	 * changes to the inode core that have to go to disk.
+	 *
+	 * The following code depends on one assumption:  that
+	 * any transaction that changes an inode logs the core
+	 * because it has to change some field in the inode core
+	 * (typically nextents or nblocks).  That assumption
+	 * implies that any transactions against an inode will
+	 * catch any non-transactional updates.	 If inode-altering
+	 * transactions exist that violate this assumption, the
+	 * code breaks.	 Right now, it figures that if the involved
+	 * update_* field is clear and the inode is unpinned, the
+	 * inode is clean.  Either it's been flushed or it's been
+	 * committed and the commit has hit the disk unpinning the inode.
+	 * (Note that xfs_inode_item_format() called at commit clears
+	 * the update_* fields.)
+	 */
+	if (!(flag & FSYNC_DATA)) {
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+		if (ip->i_update_core == 0)  {
+			/*
+			 * Timestamps/size haven't changed since last inode
+			 * flush or inode transaction commit.  That means
+			 * either nothing got written or a transaction
+			 * committed which caught the updates.	If the
+			 * latter happened and the transaction hasn't
+			 * hit the disk yet, the inode will still be
+			 * pinned.  If it is, force the log.
+			 */
+			if (xfs_ipincount(ip) == 0)  {
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL |
+						XFS_ILOCK_SHARED);
+			} else	{
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL |
+						XFS_ILOCK_SHARED);
+				xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE |
+					      ((flag & FSYNC_WAIT)
+					       ? XFS_LOG_SYNC : 0));
+			}
+			error = 0;
+		} else	{
+			/*
+			 * Kick off a transaction to log the inode
+			 * core to get the updates.  Make it
+			 * sync if FSYNC_WAIT is passed in (which
+			 * is done by everybody but specfs).  The
+			 * sync transaction will also force the log.
+			 */
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
+			if ((error = xfs_trans_reserve(tp, 0,
+					XFS_FSYNC_TS_LOG_RES(ip->i_mount),
+					0, 0, 0)))  {
+				xfs_trans_cancel(tp, 0);
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+				return error;
+			}
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+			/*
+			 * Note - it's possible that we might have pushed
+			 * ourselves out of the way during trans_reserve
+			 * which would flush the inode.	 But there's no
+			 * guarantee that the inode buffer has actually
+			 * gone out yet (it's delwri).	Plus the buffer
+			 * could be pinned anyway if it's part of an
+			 * inode in another recent transaction.	 So we
+			 * play it safe and fire off the transaction anyway.
+			 */
+			xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
+			xfs_trans_ihold(tp, ip);
+			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+			if (flag & FSYNC_WAIT)
+				xfs_trans_set_sync(tp);
+			error = xfs_trans_commit(tp, 0, NULL);
+
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
+		}
+	} else {
+		/*
+		 * We don't care about the timestamps here.  We
+		 * only care about the size field growing on us
+		 * and forcing any space allocation transactions.
+		 * We have to flush changes to the size fields
+		 * otherwise we could write out data that
+		 * becomes inaccessible after a crash.
+		 */
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+		if (ip->i_update_size == 0)  {
+			/*
+			 * Force the log if the inode is pinned.
+			 * That ensures that all transactions committed
+			 * against the inode hit the disk.  This may do
+			 * too much work but it's safe.
+			 */
+			if (xfs_ipincount(ip) == 0)  {
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL |
+						XFS_ILOCK_SHARED);
+			} else	{
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL |
+						XFS_ILOCK_SHARED);
+				xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE |
+					      ((flag & FSYNC_WAIT)
+					       ? XFS_LOG_SYNC : 0));
+			}
+			error = 0;
+		} else	{
+			/*
+			 * Kick off a sync transaction to log the inode
+			 * core.  The transaction has to be sync since
+			 * we need these updates to guarantee that the
+			 * data written will be seen.  The sync
+			 * transaction will also force the log.
+			 */
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+			tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
+			if ((error = xfs_trans_reserve(tp, 0,
+					XFS_FSYNC_TS_LOG_RES(ip->i_mount),
+					0, 0, 0)))  {
+				xfs_trans_cancel(tp, 0);
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+				return error;
+			}
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+			/*
+			 * Note - it's possible that we might have pushed
+			 * ourselves out of the way during trans_reserve
+			 * which would flush the inode.	 But there's no
+			 * guarantee that the inode buffer has actually
+			 * gone out yet (it's delwri).	Plus the buffer
+			 * could be pinned anyway if it's part of an
+			 * inode in another recent transaction.	 So we
+			 * play it safe and fire off the transaction anyway.
+			 */
+			xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
+			xfs_trans_ihold(tp, ip);
+			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+			if (flag & FSYNC_WAIT)
+				xfs_trans_set_sync(tp);
+			error = xfs_trans_commit(tp, 0, NULL);
+
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
+		}
+	}
+	return error;
+}
+
+
+#if 0
+/*
+ * This is a utility routine for xfs_inactive.	It is called when a
+ * transaction attempting to free up the disk space for a file encounters
+ * an error.  It cancels the old transaction and starts up a new one
+ * to be used to free up the inode.  It also sets the inode size and extent
+ * counts to 0 and frees up any memory being used to store inline data,
+ * extents, or btree roots.
+ */
+STATIC void
+xfs_itruncate_cleanup(
+	xfs_trans_t	**tpp,
+	xfs_inode_t	*ip,
+	int		commit_flags,
+	int		fork)
+{
+	xfs_mount_t	*mp;
+	/* REFERENCED */
+	int		error;
+
+	mp = ip->i_mount;
+	if (*tpp) {
+		xfs_trans_cancel(*tpp, commit_flags | XFS_TRANS_ABORT);
+	}
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	*tpp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	error = xfs_trans_reserve(*tpp, 0, XFS_IFREE_LOG_RES(mp), 0, 0,
+				  XFS_DEFAULT_LOG_COUNT);
+	if (error) {
+		return;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ijoin(*tpp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ihold(*tpp, ip);
+
+	xfs_idestroy_fork(ip, fork);
+
+	if (fork == XFS_DATA_FORK) {
+		ip->i_d.di_nblocks = 0;
+		ip->i_d.di_nextents = 0;
+		ip->i_d.di_size = 0;
+	} else {
+		ip->i_d.di_anextents = 0;
+	}
+	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+}
+#endif
+
+/*
+ * This is called by xfs_inactive to free any blocks beyond eof,
+ * when the link count isn't zero.
+ */
+STATIC int
+xfs_inactive_free_eofblocks(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip)
+{
+	xfs_trans_t	*tp;
+	int		error;
+	xfs_fileoff_t	end_fsb;
+	xfs_fileoff_t	last_fsb;
+	xfs_filblks_t	map_len;
+	int		nimaps;
+	xfs_bmbt_irec_t imap;
+
+	/*
+	 * Figure out if there are any blocks beyond the end
+	 * of the file.	 If not, then there is nothing to do.
+	 */
+	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
+	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAX_FILE_OFFSET);
+	map_len = last_fsb - end_fsb;
+	if (map_len <= 0)
+		return (0);
+
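+	/*
+	 * e.g. with 4k blocks, a di_size of 10000 bytes gives end_fsb = 3
+	 * (XFS_B_TO_FSB rounds up), so any mapping at or beyond block 3
+	 * is speculative allocation past EOF.
+	 */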
+	nimaps = 1;
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
+			  NULL, 0, &imap, &nimaps, NULL);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (!error && (nimaps != 0) &&
+	    (imap.br_startblock != HOLESTARTBLOCK)) {
+		/*
+		 * Attach the dquots to the inode up front.
+		 */
+		if (XFS_IS_QUOTA_ON(mp) &&
+		    ip->i_ino != mp->m_sb.sb_uquotino &&
+		    ip->i_ino != mp->m_sb.sb_gquotino) {
+			if (XFS_NOT_DQATTACHED(mp, ip)) {
+				if ((error = xfs_qm_dqattach(ip, 0)))
+					return (error);
+			}
+		}
+
+		/*
+		 * There are blocks after the end of file.
+		 * Free them up now by truncating the file to
+		 * its current size.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+
+		/*
+		 * Do the xfs_itruncate_start() call before
+		 * reserving any log space because itruncate_start
+		 * will call into the buffer cache and we can't
+		 * do that within a transaction.
+		 */
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+		xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
+				    ip->i_d.di_size);
+
+		error = xfs_trans_reserve(tp, 0,
+					  XFS_ITRUNCATE_LOG_RES(mp),
+					  0, XFS_TRANS_PERM_LOG_RES,
+					  XFS_ITRUNCATE_LOG_COUNT);
+		if (error) {
+			ASSERT(XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			return (error);
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, ip,
+				XFS_IOLOCK_EXCL |
+				XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+
+		error = xfs_itruncate_finish(&tp, ip,
+					     ip->i_d.di_size,
+					     XFS_DATA_FORK,
+					     0);
+		/*
+		 * If we get an error at this point we
+		 * simply don't bother truncating the file.
+		 */
+		if (error) {
+			xfs_trans_cancel(tp,
+					 (XFS_TRANS_RELEASE_LOG_RES |
+					  XFS_TRANS_ABORT));
+		} else {
+			error = xfs_trans_commit(tp,
+						XFS_TRANS_RELEASE_LOG_RES,
+						NULL);
+		}
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	}
+	return (error);
+}
+
+/*
+ * Free a symlink that has blocks associated with it.
+ */
+STATIC int
+xfs_inactive_symlink_rmt(
+	xfs_inode_t	*ip,
+	xfs_trans_t	**tpp)
+{
+	xfs_buf_t	*bp;
+	int		committed;
+	int		done;
+	int		error;
+	xfs_fsblock_t	first_block;
+	xfs_bmap_free_t free_list;
+	int		i;
+	xfs_mount_t	*mp;
+	xfs_bmbt_irec_t mval[SYMLINK_MAPS];
+	int		nmaps;
+	xfs_trans_t	*ntp;
+	int		size;
+	xfs_trans_t	*tp;
+
+	tp = *tpp;
+	mp = ip->i_mount;
+	ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
+	/*
+	 * We're freeing a symlink that has some
+	 * blocks allocated to it.  Free the
+	 * blocks here.	 We know that we've got
+	 * either 1 or 2 extents and that we can
+	 * free them all in one bunmapi call.
+	 */
+	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
+	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		xfs_trans_cancel(tp, 0);
+		*tpp = NULL;
+		return error;
+	}
+	/*
+	 * Lock the inode, fix the size, and join it to the transaction.
+	 * Hold it so in the normal path, we still have it locked for
+	 * the second transaction.  In the error paths we need it
+	 * held so the cancel won't rele it, see below.
+	 */
+	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	size = (int)ip->i_d.di_size;
+	ip->i_d.di_size = 0;
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	/*
+	 * Find the block(s) so we can inval and unmap them.
+	 */
+	done = 0;
+	XFS_BMAP_INIT(&free_list, &first_block);
+	nmaps = sizeof(mval) / sizeof(mval[0]);
+	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
+			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
+			&free_list)))
+		goto error0;
+	/*
+	 * Invalidate the block(s).
+	 */
+	for (i = 0; i < nmaps; i++) {
+		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
+		xfs_trans_binval(tp, bp);
+	}
+	/*
+	 * Unmap the dead block(s) to the free_list.
+	 */
+	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+			&first_block, &free_list, &done)))
+		goto error1;
+	ASSERT(done);
+	/*
+	 * Commit the first transaction.  This logs the EFI and the inode.
+	 */
+	if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed)))
+		goto error1;
+	/*
+	 * The transaction must have been committed, since there were
+	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
+	 * The new tp has the extent freeing and EFDs.
+	 */
+	ASSERT(committed);
+	/*
+	 * The first xact was committed, so add the inode to the new one.
+	 * Mark it dirty so it will be logged and moved forward in the log as
+	 * part of every commit.
+	 */
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	/*
+	 * Get a new, empty transaction to return to our caller.
+	 */
+	ntp = xfs_trans_dup(tp);
+	/*
+	 * Commit the transaction containing extent freeing and EFD's.
+	 * If we get an error on the commit here or on the reserve below,
+	 * we need to unlock the inode since the new transaction doesn't
+	 * have the inode attached.
+	 */
+	error = xfs_trans_commit(tp, 0, NULL);
+	tp = ntp;
+	if (error) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		goto error0;
+	}
+	/*
+	 * Remove the memory for extent descriptions (just bookkeeping).
+	 */
+	if (ip->i_df.if_bytes)
+		xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
+	ASSERT(ip->i_df.if_bytes == 0);
+	/*
+	 * Put an itruncate log reservation in the new transaction
+	 * for our caller.
+	 */
+	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		goto error0;
+	}
+	/*
+	 * Return with the inode locked but not joined to the transaction.
+	 */
+	*tpp = tp;
+	return 0;
+
+ error1:
+	xfs_bmap_cancel(&free_list);
+ error0:
+	/*
+	 * Have to come here with the inode locked and either
+	 * (held and in the transaction) or (not in the transaction).
+	 * If the inode isn't held then cancel would iput it, but
+	 * that's wrong since this is inactive and the vnode ref
+	 * count is 0 already.
+	 * Cancel won't do anything to the inode if held, but it still
+	 * needs to be locked until the cancel is done, if it was
+	 * joined to the transaction.
+	 */
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	*tpp = NULL;
+	return error;
+
+}
+
+STATIC int
+xfs_inactive_symlink_local(
+	xfs_inode_t	*ip,
+	xfs_trans_t	**tpp)
+{
+	int		error;
+	ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
+	/*
+	 * We're freeing a symlink which fit into
+	 * the inode.  Just free the memory used
+	 * to hold the old symlink.
+	 */
+	error = xfs_trans_reserve(*tpp, 0,
+				  XFS_ITRUNCATE_LOG_RES(ip->i_mount),
+				  0, XFS_TRANS_PERM_LOG_RES,
+				  XFS_ITRUNCATE_LOG_COUNT);
+
+	if (error) {
+		xfs_trans_cancel(*tpp, 0);
+		*tpp = NULL;
+		return (error);
+	}
+	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+	/*
+	 * Zero length symlinks _can_ exist.
+	 */
+	if (ip->i_df.if_bytes > 0) {
+		xfs_idata_realloc(ip,
+				  -(ip->i_df.if_bytes),
+				  XFS_DATA_FORK);
+		ASSERT(ip->i_df.if_bytes == 0);
+	}
+	return (0);
+}
+
+/*
+ * xfs_inactive_attrs
+ *
+ * Commit the current transaction, tear down the attribute fork via
+ * xfs_attr_inactive(), and return a fresh transaction with the inode
+ * relocked and rejoined to it.
+ */
+STATIC int
+xfs_inactive_attrs(
+	xfs_inode_t	*ip,
+	xfs_trans_t	**tpp,
+	int		*commitflags)
+{
+	xfs_trans_t	*tp;
+	int		error;
+	xfs_mount_t	*mp;
+
+	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
+	tp = *tpp;
+	mp = ip->i_mount;
+	ASSERT(ip->i_d.di_forkoff != 0);
+	xfs_trans_commit(tp, *commitflags, NULL);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	*commitflags = 0;
+
+	error = xfs_attr_inactive(ip);
+	if (error) {
+		*tpp = NULL;
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		return (error); /* goto out*/
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	error = xfs_trans_reserve(tp, 0,
+				  XFS_IFREE_LOG_RES(mp),
+				  0, 0,
+				  XFS_DEFAULT_LOG_COUNT);
+	if (error) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		xfs_trans_cancel(tp, 0);
+		*tpp = NULL;
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		return (error);
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+	ASSERT(ip->i_d.di_anextents == 0);
+
+	*tpp = tp;
+	return (0);
+}
+
+/*ARGSUSED*/
+STATIC int
+xfs_release(
+	bhv_desc_t	*bdp)
+{
+	xfs_inode_t	*ip;
+	vnode_t		*vp;
+	xfs_mount_t	*mp;
+	int		error;
+
+	vp = BHV_TO_VNODE(bdp);
+	ip = XFS_BHVTOI(bdp);
+
+	if ((vp->v_type != VREG) || (ip->i_d.di_mode == 0)) {
+		return 0;
+	}
+
+	/* If we are in the NFS reference cache then don't do this now */
+	if (ip->i_refcache)
+		return 0;
+
+	mp = ip->i_mount;
+
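+	/*
+	 * When a still-linked regular file is released, trim any blocks
+	 * speculatively allocated beyond EOF, provided its extent list is
+	 * already in core and it carries no explicit preallocation
+	 * (XFS_DIFLAG_PREALLOC), which must be preserved.
+	 */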
+	if (ip->i_d.di_nlink != 0) {
+		if ((((ip->i_d.di_mode & IFMT) == IFREG) &&
+		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
+		     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
+		    (!(ip->i_d.di_flags & XFS_DIFLAG_PREALLOC))) {
+			if ((error = xfs_inactive_free_eofblocks(mp, ip)))
+				return (error);
+			/* Update linux inode block count after free above */
+			LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
+				ip->i_d.di_nblocks + ip->i_delayed_blks);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * xfs_inactive
+ *
+ * This is called when the reference count for the vnode
+ * goes to zero.  If the file has been unlinked, then it must
+ * now be truncated.  Also, we clear all of the read-ahead state
+ * kept for the inode here since the file is now closed.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_inactive(
+	bhv_desc_t	*bdp,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+			/* REFERENCED */
+	vnode_t		*vp;
+	xfs_trans_t	*tp;
+	xfs_mount_t	*mp;
+	int		error;
+	int		commit_flags;
+	int		truncate;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_inactive", (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+
+	/*
+	 * If the inode is already free, then there can be nothing
+	 * to clean up here.
+	 */
+	if (ip->i_d.di_mode == 0) {
+		ASSERT(ip->i_df.if_real_bytes == 0);
+		ASSERT(ip->i_df.if_broot_bytes == 0);
+		return VN_INACTIVE_CACHE;
+	}
+
+	/*
+	 * Only do a truncate if it's a regular file with
+	 * some actual space in it.  It's OK to look at the
+	 * inode's fields without the lock because we're the
+	 * only one with a reference to the inode.
+	 */
+	truncate = ((ip->i_d.di_nlink == 0) &&
+	    ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) &&
+	    ((ip->i_d.di_mode & IFMT) == IFREG));
+
+	mp = ip->i_mount;
+
+	if (ip->i_d.di_nlink == 0 &&
+	    DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
+		(void) dm_send_destroy_event(bdp, DM_RIGHT_NULL);
+	}
+
+	error = 0;
+	if (ip->i_d.di_nlink != 0) {
+		if ((((ip->i_d.di_mode & IFMT) == IFREG) &&
+		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
+		     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
+		    (!(ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) ||
+		     (ip->i_delayed_blks != 0))) {
+			if ((error = xfs_inactive_free_eofblocks(mp, ip)))
+				return (VN_INACTIVE_CACHE);
+			/* Update linux inode block count after free above */
+			LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
+				ip->i_d.di_nblocks + ip->i_delayed_blks);
+		}
+		goto out;
+	}
+
+	ASSERT(ip->i_d.di_nlink == 0);
+
+	if (XFS_IS_QUOTA_ON(mp) &&
+	    ip->i_ino != mp->m_sb.sb_uquotino &&
+	    ip->i_ino != mp->m_sb.sb_gquotino) {
+		if (XFS_NOT_DQATTACHED(mp, ip)) {
+			if ((error = xfs_qm_dqattach(ip, 0)))
+				return (VN_INACTIVE_CACHE);
+		}
+	}
+	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	if (truncate) {
+		/*
+		 * Do the xfs_itruncate_start() call before
+		 * reserving any log space because itruncate_start
+		 * will call into the buffer cache and we can't
+		 * do that within a transaction.
+		 */
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+		xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
+
+		error = xfs_trans_reserve(tp, 0,
+					  XFS_ITRUNCATE_LOG_RES(mp),
+					  0, XFS_TRANS_PERM_LOG_RES,
+					  XFS_ITRUNCATE_LOG_COUNT);
+		if (error) {
+			/* Don't call itruncate_cleanup */
+			ASSERT(XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			return (VN_INACTIVE_CACHE);
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+
+		/*
+		 * Normally, we have to run xfs_itruncate_finish sync.
+		 * But if the filesystem is wsync and we're in the inactive
+		 * path, then we know that nlink == 0, and that the
+		 * xaction that made nlink == 0 is permanently committed
+		 * since xfs_remove runs as a synchronous transaction.
+		 */
+		error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
+				(!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
+		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+		if (error) {
+			xfs_trans_cancel(tp, commit_flags | XFS_TRANS_ABORT);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+			return (VN_INACTIVE_CACHE);
+		}
+	} else if ((ip->i_d.di_mode & IFMT) == IFLNK) {
+
+		/*
+		 * If we get an error while cleaning up a
+		 * symlink we bail out.
+		 */
+		error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
+			xfs_inactive_symlink_rmt(ip, &tp) :
+			xfs_inactive_symlink_local(ip, &tp);
+
+		if (error) {
+			ASSERT(tp == NULL);
+			return (VN_INACTIVE_CACHE);
+		}
+
+		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+	} else {
+		error = xfs_trans_reserve(tp, 0,
+					  XFS_IFREE_LOG_RES(mp),
+					  0, 0,
+					  XFS_DEFAULT_LOG_COUNT);
+		if (error) {
+			ASSERT(XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			return (VN_INACTIVE_CACHE);
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+		commit_flags = 0;
+	}
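+
+	/*
+	 * Whichever path was taken above, we now hold a transaction
+	 * with a reservation, and the inode is locked, held, and
+	 * joined to it.
+	 */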
+
+	/*
+	 * If there are attributes associated with the file
+	 * then blow them away now.  The code calls a routine
+	 * that recursively deconstructs the attribute fork.
+	 * We need to just commit the current transaction
+	 * because we can't use it for xfs_attr_inactive().
+	 */
+	if (ip->i_d.di_anextents > 0) {
+		error = xfs_inactive_attrs(ip, &tp, &commit_flags);
+		/*
+		 * If we got an error, the transaction is already
+		 * cancelled, and the inode is unlocked. Just get out.
+		 */
+		if (error)
+			return (VN_INACTIVE_CACHE);
+	} else if (ip->i_afp) {
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+	}
+
+	/*
+	 * Free the inode.
+	 */
+	error = xfs_ifree(tp, ip);
+	if (error) {
+		/*
+		 * If we fail to free the inode, shut down.  The cancel
+		 * might do that, we need to make sure.	 Otherwise the
+		 * inode might be lost for a long time or forever.
+		 */
+		if (!XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
+			cmn_err(CE_NOTE,
+				"xfs_inactive:	xfs_ifree() returned an error = %d on %s",
+				error,tp->t_mountp->m_fsname);
+			xfs_force_shutdown(tp->t_mountp, XFS_METADATA_IO_ERROR);
+		}
+		xfs_trans_cancel(tp, commit_flags | XFS_TRANS_ABORT);
+	} else {
+		/*
+		 * Credit the quota account(s). The inode is gone.
+		 */
+		if (XFS_IS_QUOTA_ON(tp->t_mountp))
+			xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT,
+							 -1);
+
+		/*
+		 * Just ignore errors at this point.  There is
+		 * nothing we can do except to try to keep going.
+		 */
+		(void) xfs_trans_commit(tp, commit_flags, NULL);
+	}
+	/*
+	 * Release the dquots held by inode, if any.
+	 */
+	if (ip->i_udquot || ip->i_gdquot)
+		xfs_qm_dqdettach_inode(ip);
+
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+
+ out:
+	return VN_INACTIVE_CACHE;
+}
+
+
+/*
+ * xfs_lookup
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_lookup(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	vnode_t		**vpp,
+	int		flags,
+	vnode_t		*rdir,
+	cred_t		*credp)
+{
+	xfs_inode_t		*dp, *ip;
+	struct vnode		*vp;
+	xfs_ino_t		e_inum;
+	int			error;
+	uint			lock_mode;
+	uint			lookup_flags;
+	vnode_t			*dir_vp;
+
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+
+	vn_trace_entry(dir_vp, "xfs_lookup", (inst_t *)__return_address);
+
+	dp = XFS_BHVTOI(dir_bdp);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return XFS_ERROR(EIO);
+
+	lock_mode = xfs_ilock_map_shared(dp);
+
+	lookup_flags = DLF_IGET;
+	if (lock_mode == XFS_ILOCK_SHARED) {
+		lookup_flags |= DLF_LOCK_SHARED;
+	}
+	error = xfs_dir_lookup_int(dir_bdp, lookup_flags, dentry, &e_inum, &ip);
+	if (error) {
+		xfs_iunlock_map_shared(dp, lock_mode);
+		return error;
+	}
+
+	vp = XFS_ITOV(ip);
+
+	ITRACE(ip);
+
+	xfs_iunlock_map_shared(dp, lock_mode);
+
+	*vpp = vp;
+
+	return 0;
+}
+
+#ifdef XFS_RW_TRACE
+STATIC void
+xfs_ctrunc_trace(
+	int		tag,
+	xfs_inode_t	*ip)
+{
+	if (ip->i_rwtrace == NULL) {
+		return;
+	}
+
+	ktrace_enter(ip->i_rwtrace,
+		     (void*)((long)tag),
+		     (void*)ip,
+		     (void*)((long)private.p_cpuid),
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0);
+}
+#endif /* XFS_RW_TRACE */
+
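+/*
+ * Bound on how many times xfs_create() will retry its lookup/create
+ * race loop (see "try_again" below) before giving up with
+ * EFSCORRUPTED.
+ */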
+#define XFS_CREATE_NEW_MAXTRIES 10000
+
+/*
+ * xfs_create (create a new file).
+ *   It might still find that the name exists out there, though.
+ *   But vpp doesn't point at a vnode on entry.
+ */
+STATIC int
+xfs_create(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	vattr_t		*vap,
+	vnode_t		**vpp,
+	cred_t		*credp)
+{
+	char			*name = (char *)dentry->d_name.name;
+	vnode_t			*dir_vp;
+	xfs_inode_t		*dp, *ip;
+	vnode_t			*vp=NULL;
+	xfs_trans_t		*tp;
+	xfs_ino_t		e_inum;
+	xfs_mount_t		*mp;
+	dev_t			rdev;
+	int			error;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	boolean_t		dp_joined_to_trans;
+	boolean_t		truncated;
+	boolean_t		created = B_FALSE;
+	boolean_t		inode_change = B_FALSE;
+	int			dm_event_sent = 0;
+	uint			cancel_flags;
+	int			committed;
+	xfs_prid_t		prid;
+	struct xfs_dquot	*udqp, *gdqp;
+	uint			resblks;
+	int			dm_di_mode;
+	int			xfs_create_retries = 0;
+	xfs_ino_t		e_inum_saved;	/* for retry trap code */
+	int			namelen;
+
+	ASSERT(!*vpp);
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+	dp = XFS_BHVTOI(dir_bdp);
+
+	vn_trace_entry(dir_vp, "xfs_create", (inst_t *)__return_address);
+
+	dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
+	namelen = dentry->d_name.len;
+	if (namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
+		error = xfs_dm_send_create_event(dir_bdp, name,
+				dm_di_mode, &dm_event_sent);
+		if (error)
+			return error;
+	}
+
+	/* Return through std_return after this point. */
+
+	mp = dp->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	udqp = gdqp = NULL;
+	if (vap->va_mask & AT_PROJID)
+		prid = (xfs_prid_t)vap->va_projid;
+	else
+		prid = (xfs_prid_t)dfltprid;
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		error = xfs_qm_vop_dqalloc(mp, dp,
+					   current->fsuid, current->fsgid,
+					   XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT,
+					   &udqp, &gdqp);
+		if (error)
+			goto std_return;
+	}
+
+ try_again:
+	ip = NULL;
+	dp_joined_to_trans = B_FALSE;
+	truncated = B_FALSE;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	resblks = XFS_CREATE_SPACE_RES(mp, namelen);
+	/*
+	 * Initially assume that the file does not exist and
+	 * reserve the resources for that case.	 If that is not
+	 * the case we'll drop the one we have and get a more
+	 * appropriate transaction later.
+	 */
+	error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+	if (error == ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+	}
+	if (error) {
+		cancel_flags = 0;
+		dp = NULL;
+		goto error_return;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * At this point we cannot do an xfs_iget() of the entry named
+	 * since we are already holding a log reservation and it could
+	 * be in xfs_inactive waiting for a log reservation.  We'd just
+	 * end up waiting forever for the inactive to complete.	 Instead
+	 * we just look to see if there is an entry with the name.  In
+	 * the case where there is not, we didn't need the inode anyway.
+	 * In the case where there is an entry, we'll get it later after
+	 * dropping our transaction.
+	 */
+	error = xfs_dir_lookup_int(dir_bdp, 0, dentry, &e_inum, NULL);
+	if (error && (error != ENOENT)) {
+		goto error_return;
+	}
+
+	XFS_BMAP_INIT(&free_list, &first_block);
+
+	if (error == ENOENT) {
+
+		ASSERT(ip == NULL);
+
+		/*
+		 * Reserve disk quota and the inode.
+		 */
+		if (XFS_IS_QUOTA_ON(mp)) {
+			if (xfs_trans_reserve_quota(tp, udqp, gdqp, resblks,
+						    1, 0)) {
+				error = EDQUOT;
+				goto error_return;
+			}
+		}
+		if (resblks == 0 &&
+		    (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
+			goto error_return;
+		rdev = (vap->va_mask & AT_RDEV) ? vap->va_rdev : 0;
+		error = xfs_dir_ialloc(&tp, dp,
+				MAKEIMODE(vap->va_type,vap->va_mode), 1,
+				rdev, credp, prid, resblks > 0,
+				&ip, &committed);
+		if (error) {
+			if (error == ENOSPC)
+				goto error_return;
+			goto abort_return;
+		}
+		ITRACE(ip);
+
+		/*
+		 * At this point, we've gotten a newly allocated inode.
+		 * It is locked (and joined to the transaction).
+		 */
+
+		ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
+
+		/*
+		 * Now we join the directory inode to the transaction.
+		 * We do not do it earlier because xfs_dir_ialloc
+		 * might commit the previous transaction (and release
+		 * all the locks).
+		 */
+
+		VN_HOLD(dir_vp);
+		xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+		dp_joined_to_trans = B_TRUE;
+
+		error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
+			&first_block, &free_list,
+			resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+		if (error) {
+			ASSERT(error != ENOSPC);
+			goto abort_return;
+		}
+		xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+		/*
+		 * If this is a synchronous mount, make sure that the
+		 * create transaction goes to disk before returning to
+		 * the user.
+		 */
+		if (mp->m_flags & XFS_MOUNT_WSYNC) {
+			xfs_trans_set_sync(tp);
+		}
+
+		dp->i_gen++;
+
+		/*
+		 * Attach the dquot(s) to the inode and modify it incore.
+		 * The ids of the inode couldn't have changed since the new
+		 * inode has been locked ever since it was created.
+		 */
+		if (XFS_IS_QUOTA_ON(mp))
+			xfs_qm_vop_dqattach_and_dqmod_newinode(tp, ip, udqp,
+							       gdqp);
+		created = B_TRUE;
+	} else {
+		e_inum_saved = e_inum;
+
+		/*
+		 * The file already exists, so we're in the wrong
+		 * transaction for this operation.  Cancel the old
+		 * transaction and start a new one.  We have to drop
+		 * our locks in doing this. But, we don't care
+		 * if the directory changes. We have already checked
+		 * that the dir exists and is searchable by the user.
+		 * After all, we already have the vnode held.
+		 *
+		 * All we need to do is truncate it.
+		 */
+		xfs_trans_cancel(tp, cancel_flags);
+		tp = NULL;
+
+		/*
+		 * Now that we've dropped our log reservation, get a
+		 * reference to the inode we are re-creating.  If we
+		 * have to drop the directory lock in the call and
+		 * the entry is removed, then just start over.
+		 */
+		error = xfs_dir_lookup_int(dir_bdp, DLF_IGET, dentry,
+					   &e_inum, &ip);
+		if (error) {
+			if (error == ENOENT) {
+				if (++xfs_create_retries >
+					XFS_CREATE_NEW_MAXTRIES) {
+					error = XFS_ERROR(EFSCORRUPTED);
+					goto error_return;
+				}
+
+				xfs_iunlock(dp, XFS_ILOCK_EXCL);
+				goto try_again;
+			}
+			goto error_return;
+		}
+		ITRACE(ip);
+
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+		dp = NULL;
+
+		/*
+		 * Done with directory. Don't care about it
+		 * anymore.
+		 */
+
+		/*
+		 * Since we're at a good, clean point, check for any
+		 * obvious problems and get out if they occur.
+		 */
+		vp = XFS_ITOV(ip);
+		if (!error) {
+		if (vp->v_type == VREG &&
+		    (vap->va_mask & AT_SIZE) &&
+		    DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_TRUNCATE)) {
+			error = xfs_dm_send_data_event(DM_EVENT_TRUNCATE,
+					XFS_ITOBHV(ip),
+					vap->va_size, 0, 0, NULL);
+		}
+		}
+
+		if (!error && XFS_IS_QUOTA_ON(mp)) {
+			if (XFS_NOT_DQATTACHED(mp, ip))
+				error = xfs_qm_dqattach(ip, 0);
+		}
+
+		if (error) {
+			IRELE(ip);
+			goto error_return;
+		}
+
+		/* Need the behaviour lock before the iolock as we are
+		 * potentially going to make a VOP call on vp. */
+		VN_BHV_READ_LOCK(&(vp)->v_bh);
+
+		/*
+		 * We need to do the xfs_itruncate_start call before
+		 * reserving any log space in the transaction.
+		 */
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+		xfs_ctrunc_trace(XFS_CTRUNC1, ip);
+		if ((vp->v_type == VREG) &&
+		    (vap->va_mask & AT_SIZE) &&
+		    ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents != 0))) {
+			xfs_itruncate_start(ip, XFS_ITRUNC_MAYBE, 0);
+		}
+
+		/* Once we got the I/O lock we can drop the behaviour
+		 * lock.
+		 */
+		VN_BHV_READ_UNLOCK(&(vp)->v_bh);
+
+		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TRUNC);
+		if ((error = xfs_trans_reserve(tp, 0,
+					      XFS_ITRUNCATE_LOG_RES(mp), 0,
+					      XFS_TRANS_PERM_LOG_RES,
+					      XFS_ITRUNCATE_LOG_COUNT))) {
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			IRELE(ip);
+			cancel_flags = 0;
+			xfs_ctrunc_trace(XFS_CTRUNC2, ip);
+			goto error_return;
+		}
+
+		/*
+		 * Now lock inode.
+		 */
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+		if (error) {
+			xfs_ctrunc_trace(XFS_CTRUNC4, ip);
+			goto error_return;
+		}
+
+		if (vp->v_type == VREG && (vap->va_mask & AT_SIZE)) {
+			/*
+			 * Truncate the file.  The timestamps must
+			 * be updated whether the file is changed
+			 * or not.
+			 */
+			ASSERT(vap->va_size == 0);
+			if ((ip->i_d.di_size > 0) || (ip->i_d.di_nextents)) {
+				xfs_ctrunc_trace(XFS_CTRUNC5, ip);
+				xfs_trans_ihold(tp, ip);
+				/*
+				 * Always signal a sync transaction.  We're
+				 * truncating an existing file, so the
+				 * transaction must be sync regardless of
+				 * whether the filesystem is wsync, to make
+				 * the truncate persistent in the face of a
+				 * crash.
+				 */
+				error = xfs_itruncate_finish(&tp, ip,
+							     (xfs_fsize_t)0,
+							     XFS_DATA_FORK, 1);
+				if (error) {
+					ASSERT(ip->i_transp == tp);
+					xfs_trans_ihold_release(tp, ip);
+					goto abort_return;
+				}
+				truncated = B_TRUE;
+				xfs_ichgtime(ip,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+				xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+			} else {
+				xfs_ichgtime(ip,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+			}
+			inode_change = B_TRUE;
+
+		}
+		xfs_ctrunc_trace(XFS_CTRUNC6, ip);
+	}
+
+
+	/*
+	 * xfs_trans_commit normally decrements the vnode ref count
+	 * when it unlocks the inode. Since we want to return the
+	 * vnode to the caller, we bump the vnode ref count now.
+	 */
+	IHOLD(ip);
+	vp = XFS_ITOV(ip);
+
+	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
+	if (error) {
+		xfs_bmap_cancel(&free_list);
+		if (truncated) {
+			/*
+			 * If we truncated the file, then the inode will
+			 * have been held within the previous transaction
+			 * and must be unlocked now.
+			 */
+			xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+			ASSERT(vn_count(vp) >= 2);
+			IRELE(ip);
+		}
+		goto abort_rele;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (truncated) {
+		/*
+		 * If we truncated the file, then the inode will
+		 * have been held within the transaction and must
+		 * be unlocked now.
+		 */
+		xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+		ASSERT(vn_count(vp) >= 2);
+		IRELE(ip);
+	}
+	if (error) {
+		IRELE(ip);
+		tp = NULL;
+		goto error_return;
+	}
+
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	/*
+	 * Propagate the fact that the vnode changed after the
+	 * xfs_inode locks have been released.
+	 */
+	if (inode_change == B_TRUE) {
+		VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 0);
+	} else {
+		VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
+	}
+
+	*vpp = vp;
+
+	/* Fallthrough to std_return with error = 0  */
+
+std_return:
+	if ( (created || (error != 0 && dm_event_sent != 0)) &&
+			DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
+							DM_EVENT_POSTCREATE)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTCREATE,
+			dir_bdp, DM_RIGHT_NULL,
+			created ? vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp),
+				&xfs_vnodeops) : NULL,
+			DM_RIGHT_NULL, name, NULL,
+			dm_di_mode, error, 0);
+	}
+	return error;
+
+ abort_return:
+	cancel_flags |= XFS_TRANS_ABORT;
+	/* FALLTHROUGH */
+ error_return:
+
+	if (tp != NULL)
+		xfs_trans_cancel(tp, cancel_flags);
+
+	if (!dp_joined_to_trans && (dp != NULL))
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	goto std_return;
+
+ abort_rele:
+	/*
+	 * Wait until after the current transaction is aborted to
+	 * release the inode.  This prevents recursive transactions
+	 * and deadlocks from xfs_inactive.
+	 */
+	cancel_flags |= XFS_TRANS_ABORT;
+	xfs_trans_cancel(tp, cancel_flags);
+	IRELE(ip);
+
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	goto std_return;
+}
+
+#ifdef DEBUG
+
+/*
+ * Some counters to see if (and how often) we are hitting some deadlock
+ * prevention code paths.
+ */
+
+int xfs_rm_locks;
+int xfs_rm_lock_delays;
+int xfs_rm_attempts;
+#endif
+
+/*
+ * The following routine will lock the inodes associated with the
+ * directory and the named entry in the directory. The locks are
+ * acquired in increasing inode number.
+ *
+ * If the entry is "..", then only the directory is locked. The
+ * vnode ref count will still include that from the .. entry in
+ * this case.
+ *
+ * The inode passed in will have been looked up using xfs_get_dir_entry().
+ * Since that lookup the directory lock will have been dropped, so
+ * we need to validate that the inode given is still pointed to by the
+ * directory.  We use the directory inode's in-memory generation count
+ * as an optimization to tell if a new lookup is necessary.  If the
+ * directory no longer points to the given inode with the given name,
+ * then we drop the directory lock, set the entry_changed parameter to 1,
+ * and return.	It is up to the caller to drop the reference to the inode.
+ *
+ * There is a deadlock we need to worry about. If the locked directory is
+ * in the AIL, it might be blocking up the log. The next inode we lock
+ * could be already locked by another thread waiting for log space (e.g.
+ * a permanent log reservation with a long running transaction (see
+ * xfs_itruncate_finish)). To solve this, we must check if the directory
+ * is in the AIL and use lock_nowait. If we can't lock it, we need to
+ * drop the inode lock on the directory and try again. xfs_iunlock will
+ * potentially push the tail if we were holding up the log.
+ */
+STATIC int
+xfs_lock_dir_and_entry(
+	xfs_inode_t	*dp,
+	struct dentry	*dentry,
+	xfs_inode_t	*ip,	/* inode of entry 'name' */
+	int		*entry_changed)
+{
+	int			attempts;
+	xfs_ino_t		e_inum;
+	xfs_inode_t		*ips[2];
+	xfs_log_item_t		*lp;
+
+#ifdef DEBUG
+	xfs_rm_locks++;
+#endif
+	attempts = 0;
+
+again:
+	*entry_changed = 0;
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	e_inum = ip->i_ino;
+
+	ITRACE(ip);
+
+	/*
+	 * We want to lock in increasing inum. Since we've already
+	 * acquired the lock on the directory, we may need to release
+	 * it if the inum of the entry turns out to be less.
+	 */
+	if (e_inum > dp->i_ino) {
+		/*
+		 * We are already in the right order, so just
+		 * lock on the inode of the entry.
+		 * We need to use nowait if dp is in the AIL.
+		 */
+
+		lp = (xfs_log_item_t *)dp->i_itemp;
+		if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+			if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+				attempts++;
+#ifdef DEBUG
+				xfs_rm_attempts++;
+#endif
+
+				/*
+				 * Unlock dp and try again.
+				 * xfs_iunlock will try to push the tail
+				 * if the inode is in the AIL.
+				 */
+
+				xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+				if ((attempts % 5) == 0) {
+					delay(1); /* Don't just spin the CPU */
+#ifdef DEBUG
+					xfs_rm_lock_delays++;
+#endif
+				}
+				goto again;
+			}
+		} else {
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+		}
+	} else if (e_inum < dp->i_ino) {
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+		ips[0] = ip;
+		ips[1] = dp;
+		xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
+	}
+	/*
+	 * else e_inum == dp->i_ino: this can happen if we're asked to
+	 * lock /x/.., where the entry is "..", which is also the
+	 * parent directory.
+	 */
+
+	return 0;
+}
+
+#ifdef DEBUG
+int xfs_locked_n;
+int xfs_small_retries;
+int xfs_middle_retries;
+int xfs_lots_retries;
+int xfs_lock_delays;
+#endif
+
+/*
+ * The following routine will lock n inodes in exclusive mode.
+ * We assume the caller calls us with the inodes in i_ino order.
+ *
+ * We need to detect deadlock where an inode that we lock
+ * is in the AIL and we start waiting for another inode that is locked
+ * by a thread in a long running transaction (such as truncate). This can
+ * result in deadlock since the long running trans might need to wait
+ * for the inode we just locked in order to push the tail and free space
+ * in the log.
+ */
+void
+xfs_lock_inodes (xfs_inode_t **ips,
+	int inodes,
+	int first_locked,
+	uint lock_mode)
+{
+	int attempts = 0, i, j, try_lock;
+	xfs_log_item_t	*lp;
+
+	ASSERT(ips && (inodes >= 2)); /* we need at least two */
+
+	if (first_locked) {
+		try_lock = 1;
+		i = 1;
+	} else {
+		try_lock = 0;
+		i = 0;
+	}
+
+again:
+	for (; i < inodes; i++) {
+		ASSERT(ips[i]);
+
+		if (i && (ips[i] == ips[i-1]))	/* Already locked */
+			continue;
+
+		/*
+		 * If try_lock is not set yet, make sure all locked inodes
+		 * are not in the AIL.
+		 * If any are, set try_lock to be used later.
+		 */
+
+		if (!try_lock) {
+			for (j = (i - 1); j >= 0 && !try_lock; j--) {
+				lp = (xfs_log_item_t *)ips[j]->i_itemp;
+				if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+					try_lock++;
+				}
+			}
+		}
+
+		/*
+		 * If any of the previous locks we have locked is in the AIL,
+		 * we must TRY to get the second and subsequent locks. If
+		 * we can't get any, we must release all we have
+		 * and try again.
+		 */
+
+		if (try_lock) {
+			/*
+			 * try_lock means we have an inode locked that
+			 * is in the AIL; it is only ever set after we
+			 * hold at least one lock, so i cannot be 0
+			 * here.
+			 */
+			ASSERT(i != 0);
+			if (!xfs_ilock_nowait(ips[i], lock_mode)) {
+				attempts++;
+
+				/*
+				 * Unlock all previous guys and try again.
+				 * xfs_iunlock will try to push the tail
+				 * if the inode is in the AIL.
+				 */
+
+				for(j = i - 1; j >= 0; j--) {
+
+					/*
+					 * Check to see if we've already
+					 * unlocked this one.
+					 * Not the first one going back,
+					 * and the inode ptr is the same.
+					 */
+					if ((j != (i - 1)) && ips[j] ==
+								ips[j+1])
+						continue;
+
+					xfs_iunlock(ips[j], lock_mode);
+				}
+
+				if ((attempts % 5) == 0) {
+					delay(1); /* Don't just spin the CPU */
+#ifdef DEBUG
+					xfs_lock_delays++;
+#endif
+				}
+				i = 0;
+				try_lock = 0;
+				goto again;
+			}
+		} else {
+			xfs_ilock(ips[i], lock_mode);
+		}
+	}
+
+#ifdef DEBUG
+	if (attempts) {
+		if (attempts < 5) xfs_small_retries++;
+		else if (attempts < 100) xfs_middle_retries++;
+		else xfs_lots_retries++;
+	} else {
+		xfs_locked_n++;
+	}
+#endif
+}
+
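+/*
+ * REMOVE_DEBUG_TRACE records the source line of the most recent
+ * early-return site in the remove paths, so that a debug kernel
+ * can show which check caused an operation to fail.
+ */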
+#ifdef	DEBUG
+#define REMOVE_DEBUG_TRACE(x)	{remove_which_error_return = (x);}
+int remove_which_error_return = 0;
+#else /* ! DEBUG */
+#define REMOVE_DEBUG_TRACE(x)
+#endif	/* ! DEBUG */
+
+/*
+ * xfs_remove
+ *
+ */
+STATIC int
+xfs_remove(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	cred_t		*credp)
+{
+	vnode_t			*dir_vp;
+	char			*name = (char *) dentry->d_name.name;
+	xfs_inode_t		*dp, *ip;
+	xfs_trans_t		*tp = NULL;
+	xfs_mount_t		*mp;
+	int			error = 0;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	int			cancel_flags;
+	int			committed;
+	int			entry_changed;
+	int			dm_di_mode = 0;
+	int			link_zero;
+	uint			resblks;
+	int			namelen;
+/*	bhv_desc_t		*bdp; */
+
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+
+	vn_trace_entry(dir_vp, "xfs_remove", (inst_t *)__return_address);
+
+	dp = XFS_BHVTOI(dir_bdp);
+	mp = dp->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	namelen = dentry->d_name.len;
+	if (namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
+		error = dm_send_namesp_event(DM_EVENT_REMOVE,
+					dir_bdp, DM_RIGHT_NULL,
+					NULL, DM_RIGHT_NULL,
+					name, NULL, 0, 0, 0);
+		if (error)
+			return error;
+	}
+
+	/* From this point on, return through std_return */
+ retry:
+	ip = NULL;
+
+	/*
+	 * We need to get a reference to ip before we get our log
+	 * reservation. The reason for this is that we cannot call
+	 * xfs_iget for an inode for which we do not have a reference
+	 * once we've acquired a log reservation. This is because the
+	 * inode we are trying to get might be in xfs_inactive going
+	 * for a log reservation. Since we'll have to wait for the
+	 * inactive code to complete before returning from xfs_iget,
+	 * we need to make sure that we don't have log space reserved
+	 * when we call xfs_iget.  Instead we get an unlocked reference
+	 * to the inode before getting our log reservation.
+	 */
+	error = xfs_get_dir_entry(dentry, &ip);
+	if (error) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto std_return;
+	}
+
+	dm_di_mode = ip->i_d.di_mode;
+
+	vn_trace_entry(XFS_ITOV(ip), "xfs_remove", (inst_t *)__return_address);
+
+	ITRACE(ip);
+
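+	/*
+	 * Get the dquots for the inodes.
+	 */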
+	if (XFS_IS_QUOTA_ON(mp)) {
+		ASSERT(! error);
+		if (XFS_NOT_DQATTACHED(mp, dp))
+			error = xfs_qm_dqattach(dp, 0);
+		if (!error && dp != ip && XFS_NOT_DQATTACHED(mp, ip))
+			error = xfs_qm_dqattach(ip, 0);
+		if (error) {
+			REMOVE_DEBUG_TRACE(__LINE__);
+			IRELE(ip);
+			goto std_return;
+		}
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	/*
+	 * We try to get the real space reservation first,
+	 * allowing for directory btree deletion(s) implying
+	 * possible bmap insert(s).  If we can't get the space
+	 * reservation then we use 0 instead, and avoid the bmap
+	 * btree insert(s) in the directory code by, if the bmap
+	 * insert tries to happen, instead trimming the LAST
+	 * block from the directory.
+	 */
+	resblks = XFS_REMOVE_SPACE_RES(mp);
+	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
+	if (error == ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
+	}
+	if (error) {
+		ASSERT(error != ENOSPC);
+		REMOVE_DEBUG_TRACE(__LINE__);
+		xfs_trans_cancel(tp, 0);
+		IRELE(ip);
+		return error;
+	}
+
+	error = xfs_lock_dir_and_entry(dp, dentry, ip, &entry_changed);
+	if (error) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		xfs_trans_cancel(tp, cancel_flags);
+		IRELE(ip);
+		goto std_return;
+	}
+
+	/*
+	 * If the inode we found in the first pass is no longer
+	 * the entry with the given name, then drop our transaction and
+	 * inode reference and start over.
+	 */
+	if (entry_changed) {
+		xfs_trans_cancel(tp, cancel_flags);
+		IRELE(ip);
+		goto retry;
+	}
+
+	/*
+	 * At this point, we've gotten both the directory and the entry
+	 * inodes locked.
+	 */
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	if (dp != ip) {
+		/*
+		 * Increment vnode ref count only in this case since
+		 * there's an extra vnode reference in the case where
+		 * dp == ip.
+		 */
+		IHOLD(dp);
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	}
+
+	if ((error = _MAC_XFS_IACCESS(ip, MACWRITE, credp))) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto error_return;
+	}
+
+	if ((ip->i_d.di_mode & IFMT) == IFDIR) {
+		error = XFS_ERROR(EPERM);
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto error_return;
+	}
+
+	/*
+	 * Return error when removing . and ..
+	 */
+	if (name[0] == '.') {
+		if (name[1] == '\0') {
+			error = XFS_ERROR(EINVAL);
+			REMOVE_DEBUG_TRACE(__LINE__);
+			goto error_return;
+		} else if (name[1] == '.' && name[2] == '\0') {
+			error = XFS_ERROR(EEXIST);
+			REMOVE_DEBUG_TRACE(__LINE__);
+			goto error_return;
+		}
+	}
+
+	/*
+	 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
+	 */
+	XFS_BMAP_INIT(&free_list, &first_block);
+	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
+		&first_block, &free_list, 0);
+	if (error) {
+		ASSERT(error != ENOENT);
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto error1;
+	}
+	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	dp->i_gen++;
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	error = xfs_droplink(tp, ip);
+	if (error) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto error1;
+	}
+
+	/* Determine if this is the last link while
+	 * we are in the transaction.
+	 */
+	link_zero = (ip)->i_d.di_nlink==0;
+
+	/*
+	 * Take an extra ref on the inode so that it doesn't
+	 * go to xfs_inactive() from within the commit.
+	 */
+	IHOLD(ip);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * remove transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
+	if (error) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto error_rele;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (error) {
+		IRELE(ip);
+		goto std_return;
+	}
+
+	/*
+	 * Before we drop our extra reference to the inode, purge it
+	 * from the refcache if it is there.  By waiting until afterwards
+	 * to do the IRELE, we ensure that we won't go inactive in the
+	 * xfs_refcache_purge_ip routine (although that would be OK).
+	 */
+	xfs_refcache_purge_ip(ip);
+
+	vn_trace_exit(XFS_ITOV(ip), "xfs_remove",
+						(inst_t *)__return_address);
+
+	/*
+	 * Let interposed file systems know about removed links.
+	 */
+	VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
+
+	IRELE(ip);
+
+/*	Fall through to std_return with error = 0 */
+
+std_return:
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
+						DM_EVENT_POSTREMOVE)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTREMOVE,
+				dir_bdp, DM_RIGHT_NULL,
+				NULL, DM_RIGHT_NULL,
+				name, NULL, dm_di_mode, error, 0);
+	}
+	return error;
+
+ error1:
+	xfs_bmap_cancel(&free_list);
+	cancel_flags |= XFS_TRANS_ABORT;
+
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+	goto std_return;
+
+ error_rele:
+	/*
+	 * In this case make sure to not release the inode until after
+	 * the current transaction is aborted.	Releasing it beforehand
+	 * can cause us to go to xfs_inactive and start a recursive
+	 * transaction which can easily deadlock with the current one.
+	 */
+	xfs_bmap_cancel(&free_list);
+	cancel_flags |= XFS_TRANS_ABORT;
+	xfs_trans_cancel(tp, cancel_flags);
+
+	/*
+	 * Before we drop our extra reference to the inode, purge it
+	 * from the refcache if it is there.  By waiting until afterwards
+	 * to do the IRELE, we ensure that we won't go inactive in the
+	 * xfs_refcache_purge_ip routine (although that would be OK).
+	 */
+	xfs_refcache_purge_ip(ip);
+
+	IRELE(ip);
+
+	goto std_return;
+}
+
+
+/*
+ * xfs_link
+ *
+ */
+STATIC int
+xfs_link(
+	bhv_desc_t	*target_dir_bdp,
+	vnode_t		*src_vp,
+	struct dentry	*dentry,
+	cred_t		*credp)
+{
+	xfs_inode_t		*tdp, *sip;
+	xfs_ino_t		e_inum;
+	xfs_trans_t		*tp;
+	xfs_mount_t		*mp;
+	xfs_inode_t		*ips[2];
+	int			error;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	int			cancel_flags;
+	int			committed;
+	vnode_t			*target_dir_vp;
+	bhv_desc_t		*src_bdp;
+	int			resblks;
+	char			*target_name = (char *)dentry->d_name.name;
+	int			target_namelen;
+
+	target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
+
+	vn_trace_entry(target_dir_vp, "xfs_link", (inst_t *)__return_address);
+
+	target_namelen = dentry->d_name.len;
+	if (target_namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+
+	vn_trace_entry(src_vp, "xfs_link", (inst_t *)__return_address);
+
+	if (src_vp->v_type == VDIR) {
+		return XFS_ERROR(EPERM);
+	}
+
+	/*
+	 * For now, manually find the XFS behavior descriptor for
+	 * the source vnode.  If it doesn't exist then something
+	 * is wrong and we should just return an error.
+	 * Eventually we need to figure out how link is going to
+	 * work in the face of stacked vnodes.
+	 */
+	src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops);
+	if (src_bdp == NULL) {
+		return XFS_ERROR(EXDEV);
+	}
+	sip = XFS_BHVTOI(src_bdp);
+	tdp = XFS_BHVTOI(target_dir_bdp);
+	mp = tdp->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
+		error = dm_send_namesp_event(DM_EVENT_LINK,
+					target_dir_bdp, DM_RIGHT_NULL,
+					src_bdp, DM_RIGHT_NULL,
+					target_name, NULL, 0, 0, 0);
+		if (error)
+			return error;
+	}
+
+	/* Return through std_return after this point. */
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		error = 0;
+		if (XFS_NOT_DQATTACHED(mp, sip))
+			error = xfs_qm_dqattach(sip, 0);
+		if (!error && sip != tdp && XFS_NOT_DQATTACHED(mp, tdp))
+			error = xfs_qm_dqattach(tdp, 0);
+		if (error)
+			goto std_return;
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
+	error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
+	if (error == ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
+	}
+	if (error) {
+		cancel_flags = 0;
+		goto error_return;
+	}
+
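+	/*
+	 * Sort the inodes into ascending inode number order, since
+	 * xfs_lock_inodes expects to be called that way.
+	 */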
+	if (sip->i_ino < tdp->i_ino) {
+		ips[0] = sip;
+		ips[1] = tdp;
+	} else {
+		ips[0] = tdp;
+		ips[1] = sip;
+	}
+
+	xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
+
+	/*
+	 * Increment vnode ref counts since xfs_trans_commit &
+	 * xfs_trans_cancel will both unlock the inodes and
+	 * decrement the associated ref counts.
+	 */
+	VN_HOLD(src_vp);
+	VN_HOLD(target_dir_vp);
+	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+
+	/*
+	 * If the source has too many links, we can't make any more to it.
+	 */
+	if (sip->i_d.di_nlink >= XFS_MAXLINK) {
+		error = XFS_ERROR(EMLINK);
+		goto error_return;
+	}
+
+	/*
+	 * Make sure that nothing with the given name exists in the
+	 * target directory.
+	 */
+	error = xfs_dir_lookup_int(target_dir_bdp, 0, dentry, &e_inum, NULL);
+	if (error != ENOENT) {
+		if (error == 0) {
+			error = XFS_ERROR(EEXIST);
+		}
+		goto error_return;
+	}
+
+	if (resblks == 0 &&
+	    (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
+			target_namelen)))
+		goto error_return;
+
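+	/*
+	 * Initialize the bmap freelist prior to calling the directory
+	 * create code.
+	 */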
+	XFS_BMAP_INIT(&free_list, &first_block);
+
+	error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
+				   sip->i_ino, &first_block, &free_list,
+				   resblks);
+	if (error)
+		goto abort_return;
+	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	tdp->i_gen++;
+	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
+
+	error = xfs_bumplink(tp, sip);
+	if (error) {
+		goto abort_return;
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * link transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
+	if (error) {
+		xfs_bmap_cancel(&free_list);
+		goto abort_return;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (error) {
+		goto std_return;
+	}
+
+	/* Fall through to std_return with error = 0. */
+std_return:
+	if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
+						DM_EVENT_POSTLINK)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTLINK,
+				target_dir_bdp, DM_RIGHT_NULL,
+				src_bdp, DM_RIGHT_NULL,
+				target_name, NULL, 0, error, 0);
+	}
+	return error;
+
+ abort_return:
+	cancel_flags |= XFS_TRANS_ABORT;
+	/* FALLTHROUGH */
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+
+	goto std_return;
+}
+
+
+
+
+/*
+ * xfs_mkdir
+ *
+ */
+STATIC int
+xfs_mkdir(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	vattr_t		*vap,
+	vnode_t		**vpp,
+	cred_t		*credp)
+{
+	char			*dir_name = (char *)dentry->d_name.name;
+	xfs_inode_t		*dp;
+	xfs_inode_t		*cdp;	/* inode of created dir */
+	vnode_t			*cvp;	/* vnode of created dir */
+	xfs_trans_t		*tp;
+	xfs_ino_t		e_inum;
+	dev_t			rdev;
+	mode_t			mode;
+	xfs_mount_t		*mp;
+	int			cancel_flags;
+	int			error;
+	int			committed;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	vnode_t			*dir_vp;
+	boolean_t		dp_joined_to_trans;
+	boolean_t		created = B_FALSE;
+	int			dm_event_sent = 0;
+	xfs_prid_t		prid;
+	struct xfs_dquot	*udqp, *gdqp;
+	uint			resblks;
+	int			dm_di_mode;
+	int			dir_namelen;
+
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+	dp = XFS_BHVTOI(dir_bdp);
+	mp = dp->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	dir_namelen = dentry->d_name.len;
+	if (dir_namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+
+	tp = NULL;
+	dp_joined_to_trans = B_FALSE;
+	dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
+
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
+		error = xfs_dm_send_create_event(dir_bdp, dir_name,
+					dm_di_mode, &dm_event_sent);
+		if (error)
+			return error;
+	}
+
+	/* Return through std_return after this point. */
+
+	vn_trace_entry(dir_vp, "xfs_mkdir", (inst_t *)__return_address);
+
+	udqp = gdqp = NULL;
+	if (vap->va_mask & AT_PROJID)
+		prid = (xfs_prid_t)vap->va_projid;
+	else
+		prid = (xfs_prid_t)dfltprid;
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		error = xfs_qm_vop_dqalloc(mp, dp,
+					   current->fsuid, current->fsgid,
+					   XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT,
+					   &udqp, &gdqp);
+		if (error)
+			goto std_return;
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
+	error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
+				  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
+	if (error == ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
+					  XFS_TRANS_PERM_LOG_RES,
+					  XFS_MKDIR_LOG_COUNT);
+	}
+	if (error) {
+		cancel_flags = 0;
+		dp = NULL;
+		goto error_return;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * Check for directory link count overflow.
+	 */
+	if (dp->i_d.di_nlink >= XFS_MAXLINK) {
+		error = XFS_ERROR(EMLINK);
+		goto error_return;
+	}
+
+	/*
+	 * Make sure that nothing with the given name exists in the
+	 * target directory.
+	 */
+	error = xfs_dir_lookup_int(dir_bdp, 0, dentry, &e_inum, NULL);
+	if (error != ENOENT) {
+		if (error == 0)
+			error = XFS_ERROR(EEXIST);
+		goto error_return;
+	}
+
+
+	/*
+	 * Reserve disk quota and the inode.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (xfs_trans_reserve_quota(tp, udqp, gdqp, resblks, 1, 0)) {
+			error = XFS_ERROR(EDQUOT);
+			goto error_return;
+		}
+	}
+
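+	/*
+	 * Check for ability to enter directory entry, if no space reserved.
+	 */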
+	if (resblks == 0 &&
+	    (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
+		goto error_return;
+	/*
+	 * create the directory inode.
+	 */
+	rdev = (vap->va_mask & AT_RDEV) ? vap->va_rdev : 0;
+	mode = IFDIR | (vap->va_mode & ~IFMT);
+	error = xfs_dir_ialloc(&tp, dp, mode, 2, rdev, credp, prid, resblks > 0,
+		&cdp, NULL);
+	if (error) {
+		if (error == ENOSPC)
+			goto error_return;
+		goto abort_return;
+	}
+	ITRACE(cdp);
+
+	/*
+	 * Now we add the directory inode to the transaction.
+	 * We waited until now since xfs_dir_ialloc might start
+	 * a new transaction.  Had we joined the transaction
+	 * earlier, the locks might have gotten released.
+	 */
+	VN_HOLD(dir_vp);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	dp_joined_to_trans = B_TRUE;
+
+	XFS_BMAP_INIT(&free_list, &first_block);
+
+	error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
+			cdp->i_ino, &first_block, &free_list,
+			resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+	if (error) {
+		ASSERT(error != ENOSPC);
+		goto error1;
+	}
+	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	/*
+	 * Bump the in-memory version number of the parent directory
+	 * so that other processes accessing it will recognize that
+	 * the directory has changed.
+	 */
+	dp->i_gen++;
+
+	error = XFS_DIR_INIT(mp, tp, cdp, dp);
+	if (error) {
+		goto error2;
+	}
+
+	cdp->i_gen = 1;
+	error = xfs_bumplink(tp, dp);
+	if (error) {
+		goto error2;
+	}
+
+	cvp = XFS_ITOV(cdp);
+
+	created = B_TRUE;
+
+	*vpp = cvp;
+	IHOLD(cdp);
+
+	/*
+	 * Attach the dquots to the new inode and modify the icount incore.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		xfs_qm_vop_dqattach_and_dqmod_newinode(tp, cdp, udqp, gdqp);
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * mkdir transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
+	if (error) {
+		IRELE(cdp);
+		goto error2;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (udqp)
+		 xfs_qm_dqrele(udqp);
+	if (gdqp)
+		 xfs_qm_dqrele(gdqp);
+
+	if (error) {
+		IRELE(cdp);
+	}
+
+	/* Fall through to std_return with error = 0 or errno from
+	 * xfs_trans_commit. */
+
+std_return:
+	if ( (created || (error != 0 && dm_event_sent != 0)) &&
+			DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
+						DM_EVENT_POSTCREATE)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTCREATE,
+					dir_bdp, DM_RIGHT_NULL,
+					created ? XFS_ITOBHV(cdp) : NULL,
+					DM_RIGHT_NULL,
+					dir_name, NULL,
+					dm_di_mode, error, 0);
+	}
+	return error;
+
+ error2:
+ error1:
+	xfs_bmap_cancel(&free_list);
+ abort_return:
+	cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	if (!dp_joined_to_trans && (dp != NULL)) {
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	}
+
+	goto std_return;
+}
+
+
+/*
+ * xfs_rmdir
+ *
+ */
+STATIC int
+xfs_rmdir(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	cred_t		*credp)
+{
+	char			*name = (char *)dentry->d_name.name;
+	xfs_inode_t		*dp;
+	xfs_inode_t		*cdp;	/* child directory */
+	xfs_trans_t		*tp;
+	xfs_mount_t		*mp;
+/*	bhv_desc_t		*bdp;*/
+	int			error;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	int			cancel_flags;
+	int			committed;
+	int			entry_changed;
+	vnode_t			*dir_vp;
+	int			dm_di_mode = 0;
+	int			last_cdp_link;
+	int			namelen;
+	uint			resblks;
+
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+	dp = XFS_BHVTOI(dir_bdp);
+
+	vn_trace_entry(dir_vp, "xfs_rmdir", (inst_t *)__return_address);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return XFS_ERROR(EIO);
+	namelen = dentry->d_name.len;
+	if (namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
+		error = dm_send_namesp_event(DM_EVENT_REMOVE,
+					dir_bdp, DM_RIGHT_NULL,
+					NULL, DM_RIGHT_NULL,
+					name, NULL, 0, 0, 0);
+		if (error)
+			return XFS_ERROR(error);
+	}
+
+	/* Return through std_return after this point. */
+
+ retry:
+	cdp = NULL;
+
+	/*
+	 * We need to get a reference to cdp before we get our log
+	 * reservation.	 The reason for this is that we cannot call
+	 * xfs_iget for an inode for which we do not have a reference
+	 * once we've acquired a log reservation.  This is because the
+	 * inode we are trying to get might be in xfs_inactive going
+	 * for a log reservation.  Since we'll have to wait for the
+	 * inactive code to complete before returning from xfs_iget,
+	 * we need to make sure that we don't have log space reserved
+	 * when we call xfs_iget.  Instead we get an unlocked reference
+	 * to the inode before getting our log reservation.
+	 */
+	error = xfs_get_dir_entry(dentry, &cdp);
+	if (error) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto std_return;
+	}
+	mp = dp->i_mount;
+	dm_di_mode = cdp->i_d.di_mode;
+
+	/*
+	 * Get the dquots for the inodes.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		ASSERT(! error);
+		if (XFS_NOT_DQATTACHED(mp, dp))
+			error = xfs_qm_dqattach(dp, 0);
+		if (!error && dp != cdp && XFS_NOT_DQATTACHED(mp, cdp))
+			error = xfs_qm_dqattach(cdp, 0);
+		if (error) {
+			IRELE(cdp);
+			REMOVE_DEBUG_TRACE(__LINE__);
+			goto std_return;
+		}
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	/*
+	 * We try to get the real space reservation first,
+	 * allowing for directory btree deletion(s) implying
+	 * possible bmap insert(s).  If we can't get the space
+	 * reservation then we use 0 instead, and avoid the bmap
+	 * btree insert(s) in the directory code by, if the bmap
+	 * insert tries to happen, instead trimming the LAST
+	 * block from the directory.
+	 */
+	resblks = XFS_REMOVE_SPACE_RES(mp);
+	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
+	if (error == ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
+	}
+	if (error) {
+		ASSERT(error != ENOSPC);
+		cancel_flags = 0;
+		IRELE(cdp);
+		goto error_return;
+	}
+	XFS_BMAP_INIT(&free_list, &first_block);
+
+	/*
+	 * Now lock the child directory inode and the parent directory
+	 * inode in the proper order.  This will take care of validating
+	 * that the directory entry for the child directory inode has
+	 * not changed while we were obtaining a log reservation.
+	 */
+	error = xfs_lock_dir_and_entry(dp, dentry, cdp, &entry_changed);
+	if (error) {
+		xfs_trans_cancel(tp, cancel_flags);
+		IRELE(cdp);
+		goto std_return;
+	}
+
+	/*
+	 * If the inode we found in the first pass is no longer
+	 * the entry with the given name, then drop our transaction and
+	 * inode reference and start over.
+	 */
+	if (entry_changed) {
+		xfs_trans_cancel(tp, cancel_flags);
+		IRELE(cdp);
+		goto retry;
+	}
+
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	if (dp != cdp) {
+		/*
+		 * Only increment the parent directory vnode count if
+		 * we didn't bump it in looking up cdp.	 The only time
+		 * we don't bump it is when we're looking up ".".
+		 */
+		VN_HOLD(dir_vp);
+	}
+
+	ITRACE(cdp);
+	xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
+
+	if ((error = _MAC_XFS_IACCESS(cdp, MACWRITE, credp))) {
+		goto error_return;
+	}
+
+	ASSERT(cdp->i_d.di_nlink >= 2);
+	if (cdp->i_d.di_nlink != 2) {
+		error = XFS_ERROR(ENOTEMPTY);
+		goto error_return;
+	}
+	if (!XFS_DIR_ISEMPTY(mp, cdp)) {
+		error = XFS_ERROR(ENOTEMPTY);
+		goto error_return;
+	}
+
+	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
+		&first_block, &free_list, resblks);
+	if (error) {
+		goto error1;
+	}
+
+	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	/*
+	 * Bump the in-memory generation count on the parent
+	 * directory so that others know that it has changed.
+	 */
+	dp->i_gen++;
+
+	/*
+	 * Drop the link from cdp's "..".
+	 */
+	error = xfs_droplink(tp, dp);
+	if (error) {
+		goto error1;
+	}
+
+	/*
+	 * Drop the link from dp to cdp.
+	 */
+	error = xfs_droplink(tp, cdp);
+	if (error) {
+		goto error1;
+	}
+
+	/*
+	 * Drop the "." link from cdp to self.
+	 */
+	error = xfs_droplink(tp, cdp);
+	if (error) {
+		goto error1;
+	}
+
+	/* Determine these before committing transaction */
+	last_cdp_link = (cdp)->i_d.di_nlink==0;
+
+	/*
+	 * Take an extra ref on the child vnode so that it
+	 * does not go to xfs_inactive() from within the commit.
+	 */
+	IHOLD(cdp);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * rmdir transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
+	if (error) {
+		xfs_bmap_cancel(&free_list);
+		xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
+				 XFS_TRANS_ABORT));
+		IRELE(cdp);
+		goto std_return;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (error) {
+		IRELE(cdp);
+		goto std_return;
+	}
+
+
+	/*
+	 * Let interposed file systems know about removed links.
+	 */
+	VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
+
+	IRELE(cdp);
+
+	/* Fall through to std_return with error = 0 or the errno
+	 * from xfs_trans_commit. */
+std_return:
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
+						DM_EVENT_POSTREMOVE)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTREMOVE,
+					dir_bdp, DM_RIGHT_NULL,
+					NULL, DM_RIGHT_NULL,
+					name, NULL, dm_di_mode,
+					error, 0);
+	}
+	return error;
+
+ error1:
+	xfs_bmap_cancel(&free_list);
+	cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+	goto std_return;
+}
+
+
+
+/*
+ * xfs_readdir
+ *
+ * Read dp's entries starting at uiop->uio_offset and translate them into
+ * bufsize bytes worth of struct dirents starting at bufbase.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_readdir(
+	bhv_desc_t	*dir_bdp,
+	uio_t		*uiop,
+	cred_t		*credp,
+	int		*eofp)
+{
+	xfs_inode_t		*dp;
+	xfs_trans_t		*tp = NULL;
+	int			error = 0;
+	uint			lock_mode;
+	xfs_off_t		start_offset;
+
+	vn_trace_entry(BHV_TO_VNODE(dir_bdp), "xfs_readdir",
+					       (inst_t *)__return_address);
+	dp = XFS_BHVTOI(dir_bdp);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
+		return XFS_ERROR(EIO);
+	}
+
+	lock_mode = xfs_ilock_map_shared(dp);
+
+	if ((dp->i_d.di_mode & IFMT) != IFDIR) {
+		xfs_iunlock_map_shared(dp, lock_mode);
+		return XFS_ERROR(ENOTDIR);
+	}
+
+	start_offset = uiop->uio_offset;
+	error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
+	if (start_offset != uiop->uio_offset) {
+		xfs_ichgtime(dp, XFS_ICHGTIME_ACC);
+	}
+
+	xfs_iunlock_map_shared(dp, lock_mode);
+
+	return error;
+}
+
+/*
+ * xfs_symlink
+ *
+ */
+STATIC int
+xfs_symlink(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	vattr_t		*vap,
+	char		*target_path,
+	vnode_t		**vpp,
+	cred_t		*credp)
+{
+	xfs_trans_t		*tp;
+	xfs_mount_t		*mp;
+	xfs_inode_t		*dp;
+	xfs_inode_t		*ip;
+	int			error;
+	int			pathlen;
+	xfs_ino_t		e_inum;
+	dev_t			rdev;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	boolean_t		dp_joined_to_trans;
+	vnode_t			*dir_vp;
+	uint			cancel_flags;
+	int			committed;
+	xfs_fileoff_t		first_fsb;
+	xfs_filblks_t		fs_blocks;
+	int			nmaps;
+	xfs_bmbt_irec_t		mval[SYMLINK_MAPS];
+	xfs_daddr_t		d;
+	char			*cur_chunk;
+	int			byte_cnt;
+	int			n;
+	xfs_buf_t		*bp;
+	xfs_prid_t		prid;
+	struct xfs_dquot	*udqp, *gdqp;
+	uint			resblks;
+	char			*link_name = (char *)dentry->d_name.name;
+	int			link_namelen;
+
+	*vpp = NULL;
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+	dp = XFS_BHVTOI(dir_bdp);
+	dp_joined_to_trans = B_FALSE;
+	error = 0;
+	ip = NULL;
+	tp = NULL;
+
+	vn_trace_entry(dir_vp, "xfs_symlink", (inst_t *)__return_address);
+
+	mp = dp->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	link_namelen = dentry->d_name.len;
+	if (link_namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+	/*
+	 * Check component lengths of the target path name.
+	 */
+	pathlen = strlen(target_path);
+	if (pathlen >= MAXPATHLEN)	/* total string too long */
+		return XFS_ERROR(ENAMETOOLONG);
+	if (pathlen >= MAXNAMELEN) {	/* is any component too long? */
+		int len, total;
+		char *path;
+
+		for (total = 0, path = target_path; total < pathlen;) {
+			/*
+			 * Skip any slashes.
+			 */
+			while (*path == '/') {
+				total++;
+				path++;
+			}
+
+			/*
+			 * Count up to the next slash or end of path.
+			 * Error out if the component is bigger than MAXNAMELEN.
+			 */
+			for (len = 0; *path != '/' && total < pathlen;
+			     total++, path++) {
+				if (++len >= MAXNAMELEN)
+					return XFS_ERROR(ENAMETOOLONG);
+			}
+		}
+	}
+
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
+		error = dm_send_namesp_event(DM_EVENT_SYMLINK, dir_bdp,
+						DM_RIGHT_NULL, NULL,
+						DM_RIGHT_NULL, link_name,
+						target_path, 0, 0, 0);
+		if (error)
+			return error;
+	}
+
+	/* Return through std_return after this point. */
+
+	udqp = gdqp = NULL;
+	if (vap->va_mask & AT_PROJID)
+		prid = (xfs_prid_t)vap->va_projid;
+	else
+		prid = (xfs_prid_t)dfltprid;
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		error = xfs_qm_vop_dqalloc(mp, dp,
+					   current->fsuid, current->fsgid,
+					   XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT,
+					   &udqp, &gdqp);
+		if (error)
+			goto std_return;
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	/*
+	 * Will the symlink fit into the inode data fork?
+	 * There can't be any attributes so we get the whole variable part.
+	 */
+	if (pathlen <= XFS_LITINO(mp))
+		fs_blocks = 0;
+	else
+		fs_blocks = XFS_B_TO_FSB(mp, pathlen);
+	resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
+	error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+	if (error == ENOSPC && fs_blocks == 0) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+	}
+	if (error) {
+		cancel_flags = 0;
+		dp = NULL;
+		goto error_return;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * Since we've already started a transaction, we cannot allow
+	 * the lookup to do a vn_get().
+	 */
+	error = xfs_dir_lookup_int(dir_bdp, 0, dentry, &e_inum, NULL);
+	if (error != ENOENT) {
+		if (!error) {
+			error = XFS_ERROR(EEXIST);
+		}
+		goto error_return;
+	}
+	/*
+	 * Reserve disk quota : blocks and inode.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (xfs_trans_reserve_quota(tp, udqp, gdqp, resblks, 1, 0)) {
+			error = XFS_ERROR(EDQUOT);
+			goto error_return;
+		}
+	}
+
+	/*
+	 * Check for ability to enter directory entry, if no space reserved.
+	 */
+	if (resblks == 0 &&
+	    (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
+		goto error_return;
+	/*
+	 * Initialize the bmap freelist prior to calling either
+	 * bmapi or the directory create code.
+	 */
+	XFS_BMAP_INIT(&free_list, &first_block);
+
+	/*
+	 * Allocate an inode for the symlink.
+	 */
+	rdev = (vap->va_mask & AT_RDEV) ? vap->va_rdev : 0;
+
+	error = xfs_dir_ialloc(&tp, dp, IFLNK | (vap->va_mode&~IFMT),
+			       1, rdev, credp, prid, resblks > 0, &ip, NULL);
+	if (error) {
+		if (error == ENOSPC)
+			goto error_return;
+		goto error1;
+	}
+	ITRACE(ip);
+
+	VN_HOLD(dir_vp);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	dp_joined_to_trans = B_TRUE;
+
+	/*
+	 * Also attach the dquot(s) to it, if applicable.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		xfs_qm_vop_dqattach_and_dqmod_newinode(tp, ip, udqp, gdqp);
+	}
+
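+	/*
+	 * The inode allocation was counted against the space
+	 * reservation, so take it back out of resblks before
+	 * using the remainder for the link target and the
+	 * directory entry.
+	 */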
+	if (resblks)
+		resblks -= XFS_IALLOC_SPACE_RES(mp);
+	/*
+	 * If the symlink will fit into the inode, write it inline.
+	 */
+	if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+		xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
+		bcopy(target_path, ip->i_df.if_u1.if_data, pathlen);
+		ip->i_d.di_size = pathlen;
+
+		/*
+		 * The inode was initially created in extent format.
+		 */
+		ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+		ip->i_df.if_flags |= XFS_IFINLINE;
+
+		ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+
+	} else {
+		first_fsb = 0;
+		nmaps = SYMLINK_MAPS;
+
+		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
+				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
+				  &first_block, resblks, mval, &nmaps,
+				  &free_list);
+		if (error) {
+			goto error1;
+		}
+
+		if (resblks)
+			resblks -= fs_blocks;
+		ip->i_d.di_size = pathlen;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+		cur_chunk = target_path;
+		for (n = 0; n < nmaps; n++) {
+			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+					       BTOBB(byte_cnt), 0);
+			ASSERT(bp && !XFS_BUF_GETERROR(bp));
+			if (pathlen < byte_cnt) {
+				byte_cnt = pathlen;
+			}
+			pathlen -= byte_cnt;
+
+			bcopy(cur_chunk, XFS_BUF_PTR(bp), byte_cnt);
+			cur_chunk += byte_cnt;
+
+			xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
+		}
+	}
+
+	/*
+	 * Create the directory entry for the symlink.
+	 */
+	error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
+			ip->i_ino, &first_block, &free_list, resblks);
+	if (error) {
+		goto error1;
+	}
+	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	/*
+	 * Bump the in-memory version number of the parent directory
+	 * so that other processes accessing it will recognize that
+	 * the directory has changed.
+	 */
+	dp->i_gen++;
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * symlink transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	/*
+	 * xfs_trans_commit normally decrements the vnode ref count
+	 * when it unlocks the inode. Since we want to return the
+	 * vnode to the caller, we bump the vnode ref count now.
+	 */
+	IHOLD(ip);
+
+	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
+	if (error) {
+		goto error2;
+	}
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	/* Fall through to std_return with error = 0 or errno from
+	 * xfs_trans_commit	*/
+std_return:
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
+			     DM_EVENT_POSTSYMLINK)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTSYMLINK,
+						dir_bdp, DM_RIGHT_NULL,
+						error ? NULL : XFS_ITOBHV(ip),
+						DM_RIGHT_NULL,
+						link_name, target_path,
+						0, error, 0);
+	}
+
+	if (!error) {
+		vnode_t *vp;
+
+		ASSERT(ip);
+		vp = XFS_ITOV(ip);
+		*vpp = vp;
+	}
+	return error;
+
+ error2:
+	IRELE(ip);
+ error1:
+	xfs_bmap_cancel(&free_list);
+	cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	if (!dp_joined_to_trans && (dp != NULL)) {
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	}
+
+	goto std_return;
+}
+
+
+/*
+ * xfs_fid2
+ *
+ * A fid routine that takes a pointer to a previously allocated
+ * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
+ */
+STATIC int
+xfs_fid2(
+	bhv_desc_t	*bdp,
+	fid_t		*fidp)
+{
+	xfs_inode_t	*ip;
+	xfs_fid2_t	*xfid;
+
+	vn_trace_entry(BHV_TO_VNODE(bdp), "xfs_fid2",
+		       (inst_t *)__return_address);
+	ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
+
+	xfid = (xfs_fid2_t *)fidp;
+	ip = XFS_BHVTOI(bdp);
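+	/* fid_len counts only the bytes that follow the length field itself. */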
+	xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
+	xfid->fid_pad = 0;
+	/*
+	 * use bcopy because the inode is a long long and there's no
+	 * assurance that xfid->fid_ino is properly aligned.
+	 */
+	bcopy(&ip->i_ino, &xfid->fid_ino, sizeof xfid->fid_ino);
+	xfid->fid_gen = ip->i_d.di_gen;
+
+	return 0;
+}
+
+
+/*
+ * xfs_rwlock
+ */
+int
+xfs_rwlock(
+	bhv_desc_t	*bdp,
+	vrwlock_t	locktype)
+{
+	xfs_inode_t	*ip;
+	vnode_t		*vp;
+
+	vp = BHV_TO_VNODE(bdp);
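+	/* The I/O lock is never taken for directories. */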
+	if (vp->v_type == VDIR)
+		return 1;
+	ip = XFS_BHVTOI(bdp);
+	if (locktype == VRWLOCK_WRITE) {
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	} else if (locktype == VRWLOCK_TRY_READ) {
+		return (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED));
+	} else if (locktype == VRWLOCK_TRY_WRITE) {
+		return (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL));
+	} else {
+		ASSERT((locktype == VRWLOCK_READ) ||
+		       (locktype == VRWLOCK_WRITE_DIRECT));
+		xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	}
+
+	return 1;
+}
+
+
+/*
+ * xfs_rwunlock
+ */
+void
+xfs_rwunlock(
+	bhv_desc_t	*bdp,
+	vrwlock_t	locktype)
+{
+	xfs_inode_t	*ip;
+	xfs_inode_t	*release_ip;
+	vnode_t		*vp;
+	int		error;
+
+	vp = BHV_TO_VNODE(bdp);
+	if (vp->v_type == VDIR)
+		return;
+	ip = XFS_BHVTOI(bdp);
+	if (locktype == VRWLOCK_WRITE) {
+		/*
+		 * In the write case, we may have added a new entry to
+		 * the reference cache.  That can leave a pointer to
+		 * another inode to be released stored in this inode.
+		 * If it is there, clear the pointer and release that
+		 * inode after unlocking this one.
+		 */
+		release_ip = ip->i_release;
+		ip->i_release = NULL;
+		xfs_iunlock (ip, XFS_IOLOCK_EXCL);
+
+		if (release_ip != NULL) {
+			VOP_RELEASE(XFS_ITOV(release_ip), error);
+			VN_RELE(XFS_ITOV(release_ip));
+		}
+	} else {
+		ASSERT((locktype == VRWLOCK_READ) ||
+		       (locktype == VRWLOCK_WRITE_DIRECT));
+		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	}
+	return;
+}
+
+STATIC int
+xfs_inode_flush(bhv_desc_t	*bdp,
+		int		flags)
+{
+	xfs_inode_t	*ip;
+	xfs_dinode_t	*dip;
+	xfs_mount_t	*mp;
+	xfs_buf_t	*bp;
+	int		error = 0;
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	/* Bypass inodes which have already been cleaned by
+	 * the inode flush clustering code inside xfs_iflush
+	 */
+	if ((ip->i_update_core == 0) &&
+	    ((ip->i_itemp == NULL) ||
+	     !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)))
+		return 0;
+
+	if (flags & FLUSH_LOG) {
+		xfs_inode_log_item_t *iip = ip->i_itemp;
+
+		if (iip && iip->ili_last_lsn) {
+			xlog_t	*log = mp->m_log;
+			xfs_lsn_t	sync_lsn;
+			int		s, log_flags = XFS_LOG_FORCE;
+
+			s = GRANT_LOCK(log);
+			sync_lsn = log->l_last_sync_lsn;
+			GRANT_UNLOCK(log, s);
+
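+			/*
+			 * If the inode's last-logged LSN is already on
+			 * disk, there is nothing left to force out.
+			 */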
+			if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
+				return 0;
+
+			if (flags & FLUSH_SYNC)
+				log_flags |= XFS_LOG_SYNC;
+			return xfs_log_force(mp, iip->ili_last_lsn,
+						log_flags);
+		}
+	}
+
+	/* We make this non-blocking if the inode is contended,
+	 * return EAGAIN to indicate to the caller that they
+	 * did not succeed. This prevents the flush path from
+	 * blocking on inodes inside another operation right
+	 * now, they get caught later by xfs_sync.
+	 */
+	if (flags & FLUSH_INODE) {
+		if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+			if ((xfs_ipincount(ip) == 0) && xfs_iflock_nowait(ip)) {
+				int	flush_flags;
+
+#if 0
+				/* not turning this on until some
+				 * performance analysis is done
+				 */
+				if (flags & FLUSH_SYNC)
+					flush_flags = XFS_IFLUSH_SYNC;
+				else
+#endif
+					flush_flags = XFS_IFLUSH_DELWRI;
+
+				xfs_ifunlock(ip);
+				xfs_iunlock(ip, XFS_ILOCK_SHARED);
+				error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0);
+				if (error)
+					goto eagain;
+				xfs_buf_relse(bp);
+
+				if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED) == 0)
+					goto eagain;
+
+				if ((xfs_ipincount(ip) == 0) &&
+				     xfs_iflock_nowait(ip))
+					error = xfs_iflush(ip, flush_flags);
+			} else {
+				error = EAGAIN;
+			}
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		} else {
+eagain:
+			error = EAGAIN;
+		}
+	}
+
+	return error;
+}
+
+
+int
+xfs_set_dmattrs (
+	bhv_desc_t	*bdp,
+	u_int		evmask,
+	u_int16_t	state,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+	xfs_trans_t	*tp;
+	xfs_mount_t	*mp;
+	int		error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return XFS_ERROR(EPERM);
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
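+	/*
+	 * Update both the incore I/O fields and the on-disk inode
+	 * with the new DMAPI event mask and state.
+	 */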
+	ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
+	ip->i_iocore.io_dmstate	 = ip->i_d.di_dmstate  = state;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	IHOLD(ip);
+	error = xfs_trans_commit(tp, 0, NULL);
+
+	return error;
+}
+
+
+/*
+ * xfs_reclaim
+ */
+STATIC int
+xfs_reclaim(
+	bhv_desc_t	*bdp)
+{
+	xfs_inode_t	*ip;
+	vnode_t		*vp;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_reclaim", (inst_t *)__return_address);
+
+	ASSERT(!VN_MAPPED(vp));
+	ip = XFS_BHVTOI(bdp);
+
+	if ((ip->i_d.di_mode & IFMT) == IFREG) {
+		if (ip->i_d.di_size > 0) {
+			/*
+			 * Flush and invalidate any data left around that is
+			 * a part of this file.
+			 *
+			 * Get the inode's i/o lock so that buffers are pushed
+			 * out while holding the proper lock.  We can't hold
+			 * the inode lock here since flushing out buffers may
+			 * cause us to try to get the lock in xfs_strategy().
+			 *
+			 * We don't have to call remapf() here, because there
+			 * cannot be any mapped file references to this vnode
+			 * since it is being reclaimed.
+			 */
+			xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+			/*
+			 * If we hit an IO error, we need to make sure that the
+			 * buffer and page caches of file data for
+			 * the file are tossed away. We don't want to use
+			 * VOP_FLUSHINVAL_PAGES here because we don't want dirty
+			 * pages to stay attached to the vnode, but be
+			 * marked P_BAD. pdflush/vnode_pagebad
+			 * hates that.
+			 */
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_NONE);
+			} else {
+				VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
+			}
+
+			ASSERT(VN_CACHED(vp) == 0);
+			ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
+			       ip->i_delayed_blks == 0);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		} else if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+			/*
+			 * di_size field may not be quite accurate if we're
+			 * shutting down.
+			 */
+			VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
+			ASSERT(VN_CACHED(vp) == 0);
+		}
+	}
+
+	/* If we have nothing to flush with this inode then complete the
+	 * teardown now, otherwise break the link between the xfs inode
+	 * and the linux inode and clean up the xfs inode later. This
+	 * avoids flushing the inode to disk during the delete operation
+	 * itself.
+	 */
+	if (!ip->i_update_core && (ip->i_itemp == NULL)) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
+	} else {
+		xfs_mount_t	*mp = ip->i_mount;
+
+		/* Protect sync from us */
+		XFS_MOUNT_ILOCK(mp);
+		vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
+		XFS_MOUNT_IUNLOCK(mp);
+	}
+	return 0;
+}
+
+int
+xfs_finish_reclaim(
+	xfs_inode_t	*ip,
+	int		locked,
+	int		sync_mode)
+{
+	xfs_ihash_t	*ih = ip->i_hash;
+	int		error;
+
+	if (!locked)
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	/* The hash lock here protects a thread in xfs_iget_core from
+	 * racing with us on linking the inode back with a vnode.
+	 * Once we have the XFS_IRECLAIM flag set it will not touch
+	 * us.
+	 */
+	write_lock(&ih->ih_lock);
+	if (ip->i_flags & XFS_IRECLAIM || (!locked && XFS_ITOV_NULL(ip))) {
+		write_unlock(&ih->ih_lock);
+		if (!locked)
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		return(1);
+	}
+	ip->i_flags |= XFS_IRECLAIM;
+	write_unlock(&ih->ih_lock);
+
+	/*
+	 * If the inode is still dirty, then flush it out.  If the inode
+	 * is not in the AIL, then it will be OK to flush it delwri as
+	 * long as xfs_iflush() does not keep any references to the inode.
+	 * We leave that decision up to xfs_iflush() since it has the
+	 * knowledge of whether it's OK to simply do a delwri flush of
+	 * the inode or whether we need to wait until the inode is
+	 * pulled from the AIL.
+	 * We get the flush lock regardless, though, just to make sure
+	 * we don't free it while it is being flushed.
+	 */
+	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		if (!locked) {
+			xfs_iflock(ip);
+		}
+
+		if (ip->i_update_core ||
+		    ((ip->i_itemp != NULL) &&
+		     (ip->i_itemp->ili_format.ilf_fields != 0))) {
+			error = xfs_iflush(ip, sync_mode);
+			/*
+			 * If we hit an error, typically because of filesystem
+			 * shutdown, we don't need to let vn_reclaim know,
+			 * because we're going to reclaim the inode anyway.
+			 */
+			if (error) {
+				xfs_iunlock(ip, XFS_ILOCK_EXCL);
+				xfs_ireclaim(ip);
+				return (0);
+			}
+			xfs_iflock(ip); /* synchronize with xfs_iflush_done */
+		}
+
+		ASSERT(ip->i_update_core == 0);
+		ASSERT(ip->i_itemp == NULL ||
+		       ip->i_itemp->ili_format.ilf_fields == 0);
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	xfs_ireclaim(ip);
+	return 0;
+}
+
+/*
+ * xfs_alloc_file_space()
+ *	This routine allocates disk space for the given file.
+ *
+ *	If alloc_type == 0, this request is for an ALLOCSP type
+ *	request which will change the file size.  In this case, no
+ *	DMAPI event will be generated by the call.  A TRUNCATE event
+ *	will be generated later by xfs_setattr.
+ *
+ *	If alloc_type != 0, this request is for a RESVSP type
+ *	request, and a DMAPI DM_EVENT_WRITE will be generated if the
+ *	lower block boundary byte address is less than the file's
+ *	length.
+ *
+ * RETURNS:
+ *	 0 on success
+ *	errno on error
+ *
+ */
+int
+xfs_alloc_file_space(
+	xfs_inode_t		*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len,
+	int			alloc_type,
+	int			attr_flags)
+{
+	xfs_filblks_t		allocated_fsb;
+	xfs_filblks_t		allocatesize_fsb;
+	int			committed;
+	xfs_off_t		count;
+	xfs_filblks_t		datablocks;
+	int			error;
+	xfs_fsblock_t		firstfsb;
+	xfs_bmap_free_t		free_list;
+	xfs_bmbt_irec_t		*imapp;
+	xfs_bmbt_irec_t		imaps[1];
+	xfs_mount_t		*mp;
+	int			numrtextents;
+	int			reccount;
+	uint			resblks;
+	int			rt;
+	int			rtextsize;
+	xfs_fileoff_t		startoffset_fsb;
+	xfs_trans_t		*tp;
+	int			xfs_bmapi_flags;
+
+	vn_trace_entry(XFS_ITOV(ip), "xfs_alloc_file_space",
+					       (inst_t *)__return_address);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	/*
+	 * determine if this is a realtime file
+	 */
+	if ((rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) != 0) {
+		if (ip->i_d.di_extsize)
+			rtextsize = ip->i_d.di_extsize;
+		else
+			rtextsize = mp->m_sb.sb_rextsize;
+	} else
+		rtextsize = 0;
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (XFS_NOT_DQATTACHED(mp, ip)) {
+			if ((error = xfs_qm_dqattach(ip, 0)))
+				return error;
+		}
+	}
+
+	if (len <= 0)
+		return XFS_ERROR(EINVAL);
+
+	count = len;
+	error = 0;
+	imapp = &imaps[0];
+	reccount = 1;
+	xfs_bmapi_flags = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
+	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
+	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+
+	/* Generate a DMAPI event if needed. */
+	if (alloc_type != 0 && offset < ip->i_d.di_size &&
+			(attr_flags&ATTR_DMI) == 0  &&
+			DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
+		xfs_off_t	    end_dmi_offset;
+
+		end_dmi_offset = offset+len;
+		if (end_dmi_offset > ip->i_d.di_size)
+			end_dmi_offset = ip->i_d.di_size;
+		error = xfs_dm_send_data_event(DM_EVENT_WRITE, XFS_ITOBHV(ip),
+			offset, end_dmi_offset - offset,
+			0, NULL);
+		if (error)
+			return(error);
+	}
+
+	/*
+	 * allocate file space until done or until there is an error
+	 */
+retry:
+	while (allocatesize_fsb && !error) {
+		/*
+		 * determine if reserving space on
+		 * the data or realtime partition.
+		 */
+		if (rt) {
+			xfs_fileoff_t s, e;
+
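+			/*
+			 * Round the start down and the end up to
+			 * realtime extent boundaries; e.g. with an
+			 * illustrative rtextsize of 4, s: 10 -> 8
+			 * and e: 13 -> 16.
+			 */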
+			s = startoffset_fsb;
+			do_div(s, rtextsize);
+			s *= rtextsize;
+			e = roundup_64(startoffset_fsb + allocatesize_fsb,
+				rtextsize);
+			numrtextents = (int)(e - s) / mp->m_sb.sb_rextsize;
+			datablocks = 0;
+		} else {
+			datablocks = allocatesize_fsb;
+			numrtextents = 0;
+		}
+
+		/*
+		 * allocate and setup the transaction
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
+		error = xfs_trans_reserve(tp,
+					  resblks,
+					  XFS_WRITE_LOG_RES(mp),
+					  numrtextents,
+					  XFS_TRANS_PERM_LOG_RES,
+					  XFS_WRITE_LOG_COUNT);
+
+		/*
+		 * check for running out of space
+		 */
+		if (error) {
+			/*
+			 * Free the transaction structure.
+			 */
+			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (XFS_IS_QUOTA_ON(mp)) {
+			if (xfs_trans_reserve_quota(tp,
+						    ip->i_udquot,
+						    ip->i_gdquot,
+						    resblks, 0, 0)) {
+				error = XFS_ERROR(EDQUOT);
+				goto error1;
+			}
+		}
+
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+
+		/*
+		 * issue the bmapi() call to allocate the blocks
+		 */
+		XFS_BMAP_INIT(&free_list, &firstfsb);
+		error = xfs_bmapi(tp, ip, startoffset_fsb,
+				  allocatesize_fsb, xfs_bmapi_flags,
+				  &firstfsb, 0, imapp, &reccount,
+				  &free_list);
+		if (error) {
+			goto error0;
+		}
+
+		/*
+		 * complete the transaction
+		 */
+		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
+		if (error) {
+			goto error0;
+		}
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		if (error) {
+			break;
+		}
+
+		allocated_fsb = imapp->br_blockcount;
+
+		if (reccount == 0) {
+			error = XFS_ERROR(ENOSPC);
+			break;
+		}
+
+		startoffset_fsb += allocated_fsb;
+		allocatesize_fsb -= allocated_fsb;
+	}
+dmapi_enospc_check:
+	if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
+	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
+
+		error = dm_send_namesp_event(DM_EVENT_NOSPACE,
+				XFS_ITOBHV(ip), DM_RIGHT_NULL,
+				XFS_ITOBHV(ip), DM_RIGHT_NULL,
+				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
+		if (error == 0)
+			goto retry;	/* Maybe DMAPI app. has made space */
+		/* else fall through with the dm_send_namesp_event() result. */
+	}
+
+	return error;
+
+ error0:
+	xfs_bmap_cancel(&free_list);
+ error1:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	goto dmapi_enospc_check;
+}
+
+/*
+ * Zero file bytes between startoff and endoff inclusive.
+ * The iolock is held exclusive and no blocks are buffered.
+ */
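+/*
+ * The loop below works a block at a time: map each block with
+ * xfs_bmapi(), read it in, bzero() the affected bytes and write it
+ * back.  Holes and unwritten extents are skipped, since they
+ * already read back as zeroes.
+ */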
+STATIC int
+xfs_zero_remaining_bytes(
+	xfs_inode_t		*ip,
+	xfs_off_t		startoff,
+	xfs_off_t		endoff)
+{
+	xfs_buf_t		*bp;
+	int			error=0;
+	xfs_bmbt_irec_t		imap;
+	xfs_off_t		lastoffset;
+	xfs_mount_t		*mp;
+	int			nimap;
+	xfs_off_t		offset;
+	xfs_fileoff_t		offset_fsb;
+
+	mp = ip->i_mount;
+	bp = XFS_ngetrbuf(mp->m_sb.sb_blocksize, mp);
+	ASSERT(!XFS_BUF_GETERROR(bp));
+
+	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+		XFS_BUF_SET_TARGET(bp, mp->m_rtdev_targp);
+	} else {
+		XFS_BUF_SET_TARGET(bp, mp->m_ddev_targp);
+	}
+
+	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
+		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+		nimap = 1;
+		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap,
+			&nimap, NULL);
+		if (error || nimap < 1)
+			break;
+		ASSERT(imap.br_blockcount >= 1);
+		ASSERT(imap.br_startoff == offset_fsb);
+		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
+		if (lastoffset > endoff)
+			lastoffset = endoff;
+		if (imap.br_startblock == HOLESTARTBLOCK)
+			continue;
+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+		if (imap.br_state == XFS_EXT_UNWRITTEN)
+			continue;
+		XFS_BUF_UNDONE(bp);
+		XFS_BUF_UNWRITE(bp);
+		XFS_BUF_READ(bp);
+		XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
+		xfsbdstrat(mp, bp);
+		if ((error = xfs_iowait(bp))) {
+			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
+					  mp, bp, XFS_BUF_ADDR(bp));
+			break;
+		}
+		bzero(XFS_BUF_PTR(bp) +
+			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
+		      lastoffset - offset + 1);
+		XFS_BUF_UNDONE(bp);
+		XFS_BUF_UNREAD(bp);
+		XFS_BUF_WRITE(bp);
+		xfsbdstrat(mp, bp);
+		if ((error = xfs_iowait(bp))) {
+			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
+					  mp, bp, XFS_BUF_ADDR(bp));
+			break;
+		}
+	}
+	XFS_nfreerbuf(bp);
+	return error;
+}
+
+/*
+ * xfs_free_file_space()
+ *	This routine frees disk space for the given file.
+ *
+ *	This routine is only called by xfs_change_file_space
+ *	for an UNRESVSP type call.
+ *
+ * RETURNS:
+ *	 0 on success
+ *	errno on error
+ *
+ */
+STATIC int
+xfs_free_file_space(
+	xfs_inode_t		*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len,
+	int			attr_flags)
+{
+	int			committed;
+	int			done;
+	xfs_off_t		end_dmi_offset;
+	xfs_fileoff_t		endoffset_fsb;
+	int			error;
+	xfs_fsblock_t		firstfsb;
+	xfs_bmap_free_t		free_list;
+	xfs_off_t		ilen;
+	xfs_bmbt_irec_t		imap;
+	xfs_off_t		ioffset;
+	xfs_extlen_t		mod=0;
+	xfs_mount_t		*mp;
+	int			nimap;
+	uint			resblks;
+	int			rounding;
+	int			specrt;
+	xfs_fileoff_t		startoffset_fsb;
+	xfs_trans_t		*tp;
+
+	vn_trace_entry(XFS_ITOV(ip), "xfs_free_file_space",
+					       (inst_t *)__return_address);
+	mp = ip->i_mount;
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (XFS_NOT_DQATTACHED(mp, ip)) {
+			if ((error = xfs_qm_dqattach(ip, 0)))
+				return error;
+		}
+	}
+
+	error = 0;
+	if (len <= 0)	/* if nothing being freed */
+		return error;
+	specrt =
+		(ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+		!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb);
+	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+	end_dmi_offset = offset + len;
+	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
+
+	if (offset < ip->i_d.di_size &&
+	    (attr_flags&ATTR_DMI) == 0	&&
+	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
+		if (end_dmi_offset > ip->i_d.di_size)
+			end_dmi_offset = ip->i_d.di_size;
+		error = xfs_dm_send_data_event(DM_EVENT_WRITE, XFS_ITOBHV(ip),
+			offset, end_dmi_offset - offset,
+			AT_DELAY_FLAG(attr_flags), NULL);
+		if (error)
+			return(error);
+	}
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	rounding = MAX((unsigned int)(1 << mp->m_sb.sb_blocklog),
+			(unsigned int)NBPP);
+	ilen = len + (offset & (rounding - 1));
+	ioffset = offset & ~(rounding - 1);
+	if (ilen & (rounding - 1))
+		ilen = (ilen + rounding) & ~(rounding - 1);
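+	/*
+	 * Worked example (illustrative numbers): with rounding == 4096,
+	 * offset == 6000 and len == 3000, this yields ioffset == 4096
+	 * and ilen == 8192, i.e. the rounded range [4096, 12288)
+	 * around the byte range [6000, 9000) being freed.
+	 */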
+	xfs_inval_cached_pages(XFS_ITOV(ip), &(ip->i_iocore), ioffset, 0, 0);
+	/*
+	 * Need to zero the stuff we're not freeing, on disk.
+	 * If it's specrt (realtime & can't use unwritten extents) then
+	 * we actually need to zero the extent edges.  Otherwise xfs_bunmapi
+	 * will take care of it for us.
+	 */
+	if (specrt) {
+		nimap = 1;
+		error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0,
+			&imap, &nimap, NULL);
+		if (error)
+			return error;
+		ASSERT(nimap == 0 || nimap == 1);
+		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+			xfs_daddr_t	block;
+
+			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+			block = imap.br_startblock;
+			mod = do_div(block, mp->m_sb.sb_rextsize);
+			if (mod)
+				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+		}
+		nimap = 1;
+		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0,
+			&imap, &nimap, NULL);
+		if (error)
+			return error;
+		ASSERT(nimap == 0 || nimap == 1);
+		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+			mod++;
+			if (mod && (mod != mp->m_sb.sb_rextsize))
+				endoffset_fsb -= mod;
+		}
+	}
+	if ((done = (endoffset_fsb <= startoffset_fsb)))
+		/*
+		 * One contiguous piece to clear
+		 */
+		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
+	else {
+		/*
+		 * Some full blocks, possibly two pieces to clear
+		 */
+		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
+			error = xfs_zero_remaining_bytes(ip, offset,
+				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
+		if (!error &&
+		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
+			error = xfs_zero_remaining_bytes(ip,
+				XFS_FSB_TO_B(mp, endoffset_fsb),
+				offset + len - 1);
+	}
+
+	/*
+	 * free file space until done or until there is an error
+	 */
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+	while (!error && !done) {
+
+		/*
+		 * allocate and setup the transaction
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		error = xfs_trans_reserve(tp,
+					  resblks,
+					  XFS_WRITE_LOG_RES(mp),
+					  0,
+					  XFS_TRANS_PERM_LOG_RES,
+					  XFS_WRITE_LOG_COUNT);
+
+		/*
+		 * check for running out of space
+		 */
+		if (error) {
+			/*
+			 * Free the transaction structure.
+			 */
+			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (XFS_IS_QUOTA_ON(mp)) {
+			if (xfs_trans_reserve_quota(tp,
+						    ip->i_udquot,
+						    ip->i_gdquot,
+						    resblks, 0, 0)) {
+				error = XFS_ERROR(EDQUOT);
+				goto error1;
+			}
+		}
+
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+
+		/*
+		 * issue the bunmapi() call to free the blocks
+		 */
+		XFS_BMAP_INIT(&free_list, &firstfsb);
+		error = xfs_bunmapi(tp, ip, startoffset_fsb,
+				  endoffset_fsb - startoffset_fsb,
+				  0, 2, &firstfsb, &free_list, &done);
+		if (error) {
+			goto error0;
+		}
+
+		/*
+		 * complete the transaction
+		 */
+		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
+		if (error) {
+			goto error0;
+		}
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+
+ error0:
+	xfs_bmap_cancel(&free_list);
+ error1:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	return error;
+}
+
+/*
+ * xfs_change_file_space()
+ *	This routine allocates or frees disk space for the given file.
+ *	The user specified parameters are checked for alignment and size
+ *	limitations.
+ *
+ * RETURNS:
+ *	 0 on success
+ *	errno on error
+ *
+ */
+int
+xfs_change_file_space(
+	bhv_desc_t	*bdp,
+	int		cmd,
+	xfs_flock64_t	*bf,
+	xfs_off_t	offset,
+	cred_t		*credp,
+	int		attr_flags)
+{
+	int		clrprealloc;
+	int		error;
+	xfs_fsize_t	fsize;
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp;
+	int		setprealloc;
+	xfs_off_t	startoffset;
+	xfs_off_t	llen;
+	xfs_trans_t	*tp;
+	vattr_t		va;
+	vnode_t		*vp;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_change_file_space",
+						(inst_t *)__return_address);
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	/*
+	 * must be a regular file and have write permission
+	 */
+	if (vp->v_type != VREG)
+		return XFS_ERROR(EINVAL);
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+	if ((error = xfs_iaccess(ip, IWRITE, credp))) {
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return error;
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	switch (bf->l_whence) {
+	case 0: /*SEEK_SET*/
+		break;
+	case 1: /*SEEK_CUR*/
+		bf->l_start += offset;
+		break;
+	case 2: /*SEEK_END*/
+		bf->l_start += ip->i_d.di_size;
+		break;
+	default:
+		return XFS_ERROR(EINVAL);
+	}
+
+	llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
+
+	if (   (bf->l_start < 0)
+	    || (bf->l_start > XFS_MAX_FILE_OFFSET)
+	    || (bf->l_start + llen < 0)
+	    || (bf->l_start + llen > XFS_MAX_FILE_OFFSET))
+		return XFS_ERROR(EINVAL);
+
+	bf->l_whence = 0;
+
+	startoffset = bf->l_start;
+	fsize = ip->i_d.di_size;
+
+	/*
+	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
+	 * file space.
+	 * These calls do NOT zero the data space allocated to the file,
+	 * nor do they change the file size.
+	 *
+	 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
+	 * space.
+	 * These calls cause the new file data to be zeroed and the file
+	 * size to be changed.
+	 */
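+	/*
+	 * Illustrative summary (editorial, hypothetical values):
+	 *	RESVSP	 { l_start = 0, l_len = 1M }	reserve 1MiB,
+	 *						size unchanged
+	 *	UNRESVSP { l_start = 4k, l_len = 8k }	punch a hole
+	 *	ALLOCSP	 { l_start = 1M }		grow + zero out to 1MiB
+	 *	FREESP	 { l_start = 4k }		truncate to 4KiB
+	 */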
+	setprealloc = clrprealloc = 0;
+
+	switch (cmd) {
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_RESVSP64:
+		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
+								1, attr_flags);
+		if (error)
+			return error;
+		setprealloc = 1;
+		break;
+
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_UNRESVSP64:
+		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
+								attr_flags)))
+			return error;
+		break;
+
+	case XFS_IOC_ALLOCSP:
+	case XFS_IOC_ALLOCSP64:
+	case XFS_IOC_FREESP:
+	case XFS_IOC_FREESP64:
+		if (startoffset > fsize) {
+			error = xfs_alloc_file_space(ip, fsize,
+					startoffset - fsize, 0, attr_flags);
+			if (error)
+				break;
+		}
+
+		va.va_mask = AT_SIZE;
+		va.va_size = startoffset;
+
+		error = xfs_setattr(bdp, &va, attr_flags, credp);
+
+		if (error)
+			return error;
+
+		clrprealloc = 1;
+		break;
+
+	default:
+		ASSERT(0);
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * update the inode timestamp, mode, and prealloc flag bits
+	 */
+	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+
+	if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
+				      0, 0, 0))) {
+		/* ASSERT(0); */
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+
+	ip->i_d.di_mode &= ~ISUID;
+
+	/*
+	 * Note that we don't have to worry about mandatory
+	 * file locking being disabled here because we only
+	 * clear the ISGID bit if the Group execute bit is
+	 * on, but if it was on then mandatory locking wouldn't
+	 * have been enabled.
+	 */
+	if (ip->i_d.di_mode & (IEXEC >> 3))
+		ip->i_d.di_mode &= ~ISGID;
+
+	xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	if (setprealloc)
+		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+	else if (clrprealloc)
+		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_set_sync(tp);
+
+	error = xfs_trans_commit(tp, 0, NULL);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	return error;
+}
+
+vnodeops_t xfs_vnodeops = {
+	.vop_open		= xfs_open,
+	.vop_read		= xfs_read,
+	.vop_write		= xfs_write,
+	.vop_ioctl		= xfs_ioctl,
+	.vop_getattr		= xfs_getattr,
+	.vop_setattr		= xfs_setattr,
+	.vop_access		= xfs_access,
+	.vop_lookup		= xfs_lookup,
+	.vop_create		= xfs_create,
+	.vop_remove		= xfs_remove,
+	.vop_link		= xfs_link,
+	.vop_rename		= xfs_rename,
+	.vop_mkdir		= xfs_mkdir,
+	.vop_rmdir		= xfs_rmdir,
+	.vop_readdir		= xfs_readdir,
+	.vop_symlink		= xfs_symlink,
+	.vop_readlink		= xfs_readlink,
+	.vop_fsync		= xfs_fsync,
+	.vop_inactive		= xfs_inactive,
+	.vop_fid2		= xfs_fid2,
+	.vop_rwlock		= xfs_rwlock,
+	.vop_rwunlock		= xfs_rwunlock,
+	.vop_bmap		= xfs_bmap,
+	.vop_strategy		= xfs_strategy,
+	.vop_reclaim		= xfs_reclaim,
+	.vop_attr_get		= xfs_attr_get,
+	.vop_attr_set		= xfs_attr_set,
+	.vop_attr_remove	= xfs_attr_remove,
+	.vop_attr_list		= xfs_attr_list,
+	.vop_link_removed	= (vop_link_removed_t)fs_noval,
+	.vop_vnode_change	= (vop_vnode_change_t)fs_noval,
+	.vop_tosspages		= fs_tosspages,
+	.vop_flushinval_pages	= fs_flushinval_pages,
+	.vop_flush_pages	= fs_flush_pages,
+	.vop_release		= xfs_release,
+	.vop_iflush		= xfs_inode_flush,
+};
diff -Nru linux-2.4.20-pre10-mjc1/fs/xfs/xfsidbg.c linux-2.4.20-pre10-mjc2/fs/xfs/xfsidbg.c
--- linux-2.4.20-pre10-mjc1/fs/xfs/xfsidbg.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.20-pre10-mjc2/fs/xfs/xfsidbg.c	2002-10-25 16:51:22.000000000 -0400
@@ -0,0 +1,5230 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_quota_priv.h>
+#include <xfs_log_recover.h>
+#include "pagebuf/page_buf_internal.h"
+
+#include <linux/ctype.h>
+#include <linux/kdb.h>
+#include <linux/kdbprivate.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+
+MODULE_AUTHOR("SGI <sgi.com>");
+MODULE_DESCRIPTION("Additional kdb commands for debugging XFS");
+MODULE_LICENSE("GPL");
+EXPORT_NO_SYMBOLS;
+
+/*
+ * Command table functions.
+ */
+static void	xfsidbg_xagf(xfs_agf_t *);
+static void	xfsidbg_xagi(xfs_agi_t *);
+static void	xfsidbg_xaildump(xfs_mount_t *);
+static void	xfsidbg_xalloc(xfs_alloc_arg_t *);
+static void	xfsidbg_xattrcontext(xfs_attr_list_context_t *);
+static void	xfsidbg_xattrleaf(xfs_attr_leafblock_t *);
+static void	xfsidbg_xattrsf(xfs_attr_shortform_t *);
+static void	xfsidbg_xbirec(xfs_bmbt_irec_t *r);
+static void	xfsidbg_xbmalla(xfs_bmalloca_t *);
+static void	xfsidbg_xbrec(xfs_bmbt_rec_64_t *);
+static void	xfsidbg_xbroot(xfs_inode_t *);
+static void	xfsidbg_xbroota(xfs_inode_t *);
+static void	xfsidbg_xbtcur(xfs_btree_cur_t *);
+static void	xfsidbg_xbuf(xfs_buf_t *);
+static void	xfsidbg_xbuf_real(xfs_buf_t *, int);
+static void	xfsidbg_xchash(xfs_mount_t *mp);
+static void	xfsidbg_xchashlist(xfs_chashlist_t *chl);
+static void	xfsidbg_xdaargs(xfs_da_args_t *);
+static void	xfsidbg_xdabuf(xfs_dabuf_t *);
+static void	xfsidbg_xdanode(xfs_da_intnode_t *);
+static void	xfsidbg_xdastate(xfs_da_state_t *);
+static void	xfsidbg_xdirleaf(xfs_dir_leafblock_t *);
+static void	xfsidbg_xdirsf(xfs_dir_shortform_t *);
+static void	xfsidbg_xdir2free(xfs_dir2_free_t *);
+static void	xfsidbg_xdir2sf(xfs_dir2_sf_t *);
+static void	xfsidbg_xexlist(xfs_inode_t *);
+static void	xfsidbg_xflist(xfs_bmap_free_t *);
+static void	xfsidbg_xhelp(void);
+static void	xfsidbg_xiclog(xlog_in_core_t *);
+static void	xfsidbg_xiclogall(xlog_in_core_t *);
+static void	xfsidbg_xiclogcb(xlog_in_core_t *);
+static void	xfsidbg_xihash(xfs_mount_t *mp);
+static void	xfsidbg_xinodes(xfs_mount_t *);
+static void	xfsidbg_delayed_blocks(xfs_mount_t *);
+static void	xfsidbg_xinodes_quiesce(xfs_mount_t *);
+static void	xfsidbg_xlog(xlog_t *);
+static void	xfsidbg_xlog_ritem(xlog_recover_item_t *);
+static void	xfsidbg_xlog_rtrans(xlog_recover_t *);
+static void	xfsidbg_xlog_rtrans_entire(xlog_recover_t *);
+static void	xfsidbg_xlog_tic(xlog_ticket_t *);
+static void	xfsidbg_xlogitem(xfs_log_item_t *);
+static void	xfsidbg_xmount(xfs_mount_t *);
+static void	xfsidbg_xnode(xfs_inode_t *ip);
+static void	xfsidbg_xcore(xfs_iocore_t *io);
+static void	xfsidbg_xperag(xfs_mount_t *);
+static void	xfsidbg_xqm_diskdq(xfs_disk_dquot_t *);
+static void	xfsidbg_xqm_dqattached_inos(xfs_mount_t *);
+static void	xfsidbg_xqm_dquot(xfs_dquot_t *);
+static void	xfsidbg_xqm_mplist(xfs_mount_t *);
+static void	xfsidbg_xqm_qinfo(xfs_mount_t *mp);
+static void	xfsidbg_xqm_tpdqinfo(xfs_trans_t *tp);
+static void	xfsidbg_xsb(xfs_sb_t *, int convert);
+static void	xfsidbg_xtp(xfs_trans_t *);
+static void	xfsidbg_xtrans_res(xfs_mount_t *);
+#ifdef	CONFIG_XFS_QUOTA
+static void	xfsidbg_xqm(void);
+static void	xfsidbg_xqm_htab(void);
+static void	xfsidbg_xqm_freelist_print(xfs_frlist_t *qlist, char *title);
+static void	xfsidbg_xqm_freelist(void);
+#endif
+
+/* kdb wrappers */
+
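+/*
+ * Each wrapper below follows the same pattern: parse a single
+ * address argument with kdbgetaddrarg(), cast it to the relevant
+ * XFS structure, and hand it to the matching xfsidbg_*() dumper;
+ * only the cast and the callee differ between wrappers.
+ */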
+static int	kdbm_xfs_xagf(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xagf((xfs_agf_t *)addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xagi(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xagi((xfs_agi_t *)addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xaildump(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xaildump((xfs_mount_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xalloc(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xalloc((xfs_alloc_arg_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xattrcontext(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xattrcontext((xfs_attr_list_context_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xattrleaf(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xattrleaf((xfs_attr_leafblock_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xattrsf(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xattrsf((xfs_attr_shortform_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xbirec(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xbirec((xfs_bmbt_irec_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xbmalla(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xbmalla((xfs_bmalloca_t *)addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xbrec(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xbrec((xfs_bmbt_rec_64_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xbroot(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xbroot((xfs_inode_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xbroota(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xbroota((xfs_inode_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xbtcur(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xbtcur((xfs_btree_cur_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xbuf(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xbuf((xfs_buf_t *) addr);
+	return 0;
+}
+
+
+static int	kdbm_xfs_xchash(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xchash((xfs_mount_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xchashlist(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xchashlist((xfs_chashlist_t *) addr);
+	return 0;
+}
+
+
+static int	kdbm_xfs_xdaargs(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xdaargs((xfs_da_args_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xdabuf(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xdabuf((xfs_dabuf_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xdanode(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xdanode((xfs_da_intnode_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xdastate(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xdastate((xfs_da_state_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xdirleaf(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xdirleaf((xfs_dir_leafblock_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xdirsf(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xdirsf((xfs_dir_shortform_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xdir2free(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xdir2free((xfs_dir2_free_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xdir2sf(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xdir2sf((xfs_dir2_sf_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xexlist(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xexlist((xfs_inode_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xflist(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xflist((xfs_bmap_free_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xhelp(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	if (argc != 0)
+		return KDB_ARGCOUNT;
+
+	xfsidbg_xhelp();
+	return 0;
+}
+
+static int	kdbm_xfs_xiclog(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xiclog((xlog_in_core_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xiclogall(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xiclogall((xlog_in_core_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xiclogcb(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xiclogcb((xlog_in_core_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xihash(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xihash((xfs_mount_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xinodes(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xinodes((xfs_mount_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_delayed_blocks(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_delayed_blocks((xfs_mount_t *) addr);
+	return 0;
+}
+
+
+static int	kdbm_xfs_xinodes_quiesce(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xinodes_quiesce((xfs_mount_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xlog(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xlog((xlog_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xlog_ritem(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xlog_ritem((xlog_recover_item_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xlog_rtrans(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xlog_rtrans((xlog_recover_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xlog_rtrans_entire(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xlog_rtrans_entire((xlog_recover_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xlog_tic(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xlog_tic((xlog_ticket_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xlogitem(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xlogitem((xfs_log_item_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xmount(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xmount((xfs_mount_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xnode(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xnode((xfs_inode_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xcore(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xcore((xfs_iocore_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xperag(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xperag((xfs_mount_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xqm_diskdq(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xqm_diskdq((xfs_disk_dquot_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xqm_dqattached_inos(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xqm_dqattached_inos((xfs_mount_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xqm_dquot(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xqm_dquot((xfs_dquot_t *) addr);
+	return 0;
+}
+
+#ifdef	CONFIG_XFS_QUOTA
+static int	kdbm_xfs_xqm(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	if (argc != 0)
+		return KDB_ARGCOUNT;
+
+	xfsidbg_xqm();
+	return 0;
+}
+
+static int	kdbm_xfs_xqm_freelist(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	if (argc != 0)
+		return KDB_ARGCOUNT;
+
+	xfsidbg_xqm_freelist();
+	return 0;
+}
+
+static int	kdbm_xfs_xqm_htab(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	if (argc != 0)
+		return KDB_ARGCOUNT;
+
+	xfsidbg_xqm_htab();
+	return 0;
+}
+#endif
+
+static int	kdbm_xfs_xqm_mplist(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xqm_mplist((xfs_mount_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xqm_qinfo(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xqm_qinfo((xfs_mount_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xqm_tpdqinfo(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xqm_tpdqinfo((xfs_trans_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xsb(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	unsigned long convert=0;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1 && argc != 2)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+	if (argc == 2) {
+	    /* extra argument - conversion flag */
+	    diag = kdbgetaddrarg(argc, argv, &nextarg, &convert, &offset, NULL, regs);
+	    if (diag)
+		    return diag;
+	}
+
+	xfsidbg_xsb((xfs_sb_t *) addr, (int)convert);
+	return 0;
+}
+
+static int	kdbm_xfs_xtp(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xtp((xfs_trans_t *) addr);
+	return 0;
+}
+
+static int	kdbm_xfs_xtrans_res(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+	if (diag)
+		return diag;
+
+	xfsidbg_xtrans_res((xfs_mount_t *) addr);
+	return 0;
+}
+
+/*
+ * Vnode descriptor dump.
+ * This table is a string version of all the flags defined in vnode.h.
+ */
+char *tab_vflags[] = {
+	/* local only flags */
+	"VINACT",		/*	 0x01 */
+	"VRECLM",		/*	 0x02 */
+	"VWAIT",		/*	 0x04 */
+	"VMODIFIED",		/*	 0x08 */
+	"INVALID0x10",		/*	 0x10 */
+	"INVALID0x20",		/*	 0x20 */
+	"INVALID0x40",		/*	 0x40 */
+	"INVALID0x80",		/*	 0x80 */
+	"INVALID0x100",		/*	0x100 */
+	"INVALID0x200",		/*	0x200 */
+	"INVALID0x400",		/*	0x400 */
+	"INVALID0x800",		/*	0x800 */
+	"INVALID0x1000",	/*     0x1000 */
+	"INVALID0x2000",	/*     0x2000 */
+	"INVALID0x4000",	/*     0x4000 */
+	"INVALID0x8000",	/*     0x8000 */
+	"INVALID0x10000",	/*    0x10000 */
+	"INVALID0x20000",	/*    0x20000 */
+	"INVALID0x40000",	/*    0x40000 */
+	"INVALID0x80000",	/*    0x80000 */
+	"VROOT",		/*   0x100000 */
+	"VNOSWAP",		/*   0x200000 */
+	"VISSWAP",		/*   0x400000 */
+	"VREPLICABLE",		/*   0x800000 */
+	"VNOTREPLICABLE",	/*  0x1000000 */
+	"VDOCMP",		/*  0x2000000 */
+	"VSHARE",		/*  0x4000000 */
+	"VFRLOCKS",		/*  0x8000000 */
+	"VENF_LOCKING",		/* 0x10000000 */
+	"VOPLOCK",		/* 0x20000000 */
+	"VPURGE",		/* 0x40000000 */
+	"INVALID0x80000000",	/* 0x80000000 */
+	0
+};
+
+
+static char *vnode_type[] = {
+	"VNON", "VREG", "VDIR", "VBLK", "VLNK", "VFIFO", "VBAD", "VSOCK"
+};
+
+static void
+printflags(register uint64_t flags,
+	register char **strings,
+	register char *name)
+{
+	register uint64_t mask = 1;
+
+	if (name)
+		kdb_printf("%s 0x%llx <", name, (unsigned long long)flags);
+
+	while (flags != 0 && *strings) {
+		if (mask & flags) {
+			kdb_printf("%s ", *strings);
+			flags &= ~mask;
+		}
+		mask <<= 1;
+		strings++;
+	}
+
+	if (name)
+		kdb_printf("> ");
+
+	return;
+}
+
+
+static void	printvnode(vnode_t *vp)
+{
+	bhv_desc_t	*bh;
+	kdb_symtab_t	 symtab;
+
+
+	kdb_printf("vnode: 0x%p type ", vp);
+	if ((size_t)vp->v_type >= sizeof(vnode_type)/sizeof(vnode_type[0]))
+		kdb_printf("out of range 0x%x\n", vp->v_type);
+	else
+		kdb_printf("%s\n", vnode_type[vp->v_type]);
+
+	if ((bh = vp->v_bh.bh_first)) {
+		kdb_printf("   v_inode 0x%p v_bh->bh_first 0x%p pobj 0x%p\n",
+					LINVFS_GET_IP(vp), bh, bh->bd_pdata);
+
+		if (kdbnearsym((unsigned long)bh->bd_ops, &symtab))
+			kdb_printf("   ops %s ", symtab.sym_name);
+		else
+			kdb_printf("   ops %s/0x%p ",
+						"???", (void *)bh->bd_ops);
+	} else {
+		kdb_printf("   v_inode 0x%p v_bh->bh_first = NULLBHV ",
+					LINVFS_GET_IP(vp));
+	}
+
+	printflags((__psunsigned_t)vp->v_flag, tab_vflags, "flag =");
+	kdb_printf("\n");
+
+#ifdef	CONFIG_XFS_VNODE_TRACING
+	kdb_printf("   v_trace 0x%p\n", vp->v_trace);
+#endif	/* CONFIG_XFS_VNODE_TRACING */
+}
+
+
+static int	kdbm_vnode(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr;
+	int nextarg = 1;
+	long offset = 0;
+	int diag;
+	vnode_t		*vp;
+/*	bhv_desc_t	*bh; */
+/*	kdb_symtab_t	 symtab;*/
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+
+	if (diag)
+		return diag;
+
+	vp = (vnode_t *)addr;
+
+	printvnode(vp);
+
+	return 0;
+}
+
+#ifdef	CONFIG_XFS_VNODE_TRACING
+/*
+ * Print a vnode trace entry.
+ */
+static int
+vn_trace_pr_entry(ktrace_entry_t *ktep)
+{
+	char		funcname[128];
+	kdb_symtab_t	symtab;
+
+
+	if ((__psint_t)ktep->val[0] == 0)
+		return 0;
+
+	if (kdbnearsym((unsigned long)ktep->val[8], &symtab)) {
+		unsigned long offval;
+
+		offval = (unsigned long)ktep->val[8] - symtab.sym_start;
+
+		if (offval)
+			sprintf(funcname, "%s+0x%lx", symtab.sym_name, offval);
+		else
+			sprintf(funcname, "%s", symtab.sym_name);
+	} else
+		funcname[0] = '\0';
+
+
+	switch ((__psint_t)ktep->val[0]) {
+	case VNODE_KTRACE_ENTRY:
+		kdb_printf("entry to %s i_count = %d",
+						(char *)ktep->val[1],
+						(__psint_t)ktep->val[3]);
+		break;
+
+	case VNODE_KTRACE_EXIT:
+		kdb_printf("exit from %s i_count = %d",
+						(char *)ktep->val[1],
+						(__psint_t)ktep->val[3]);
+		break;
+
+	case VNODE_KTRACE_HOLD:
+		if ((__psint_t)ktep->val[3] != 1)
+			kdb_printf("hold @%s:%d(%s) i_count %d => %d ",
+						(char *)ktep->val[1],
+						(__psint_t)ktep->val[2],
+						funcname,
+						(__psint_t)ktep->val[3] - 1,
+						(__psint_t)ktep->val[3]);
+		else
+			kdb_printf("get @%s:%d(%s) i_count = %d",
+						(char *)ktep->val[1],
+						(__psint_t)ktep->val[2],
+						funcname,
+						(__psint_t)ktep->val[3]);
+		break;
+
+	case VNODE_KTRACE_REF:
+		kdb_printf("ref @%s:%d(%s) i_count = %d",
+						(char *)ktep->val[1],
+						(__psint_t)ktep->val[2],
+						funcname,
+						(__psint_t)ktep->val[3]);
+		break;
+
+	case VNODE_KTRACE_RELE:
+		if ((__psint_t)ktep->val[3] != 1)
+			kdb_printf("rele @%s:%d(%s) i_count %d => %d ",
+						(char *)ktep->val[1],
+						(__psint_t)ktep->val[2],
+						funcname,
+						(__psint_t)ktep->val[3],
+						(__psint_t)ktep->val[3] - 1);
+		else
+			kdb_printf("free @%s:%d(%s) i_count = %d",
+						(char *)ktep->val[1],
+						(__psint_t)ktep->val[2],
+						funcname,
+						(__psint_t)ktep->val[3]);
+		break;
+
+	default:
+		kdb_printf("unknown vntrace record\n");
+		return 1;
+	}
+
+	kdb_printf("\n");
+
+	kdb_printf("  cpu = %d pid = %d ",
+			(__psint_t)ktep->val[6], (pid_t)ktep->val[7]);
+
+	printflags((__psunsigned_t)ktep->val[5], tab_vflags, "flag =");
+
+	if (kdbnearsym((unsigned long)ktep->val[4], &symtab)) {
+		unsigned long offval;
+
+		offval = (unsigned long)ktep->val[4] - symtab.sym_start;
+
+		if (offval)
+			kdb_printf("  ra = %s+0x%lx", symtab.sym_name, offval);
+		else
+			kdb_printf("  ra = %s", symtab.sym_name);
+	} else
+		kdb_printf("  ra = ?? 0x%p", (void *)ktep->val[4]);
+
+	return 1;
+}
+
+
+/*
+ * Print out the trace buffer attached to the given vnode.
+ */
+static int	kdbm_vntrace(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	int		diag;
+	int		nextarg = 1;
+	long		offset = 0;
+	unsigned long	addr;
+	vnode_t		*vp;
+	ktrace_entry_t	*ktep;
+	ktrace_snap_t	kts;
+
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+
+	if (diag)
+		return diag;
+
+	vp = (vnode_t *)addr;
+
+	if (vp->v_trace == NULL) {
+		kdb_printf("The vnode trace buffer is not initialized\n");
+
+		return 0;
+	}
+
+	kdb_printf("vntrace vp 0x%p\n", vp);
+
+	ktep = ktrace_first(vp->v_trace, &kts);
+
+	while (ktep != NULL) {
+		if (vn_trace_pr_entry(ktep))
+			kdb_printf("\n");
+
+		ktep = ktrace_next(vp->v_trace, &kts);
+	}
+
+	return 0;
+}
+/*
+ * Print out the trace buffer attached to the given vnode.
+ */
+static int	kdbm_vntraceaddr(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	int		diag;
+	int		nextarg = 1;
+	long		offset = 0;
+	unsigned long	addr;
+	struct ktrace	*kt;
+	ktrace_entry_t	*ktep;
+	ktrace_snap_t	kts;
+
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+
+	if (diag)
+		return diag;
+
+	kt = (struct ktrace *)addr;
+
+	kdb_printf("vntraceaddr kt 0x%p\n", kt);
+
+	ktep = ktrace_first(kt, &kts);
+
+	while (ktep != NULL) {
+		if (vn_trace_pr_entry(ktep))
+			kdb_printf("\n");
+
+		ktep = ktrace_next(kt, &kts);
+	}
+
+	return 0;
+}
+#endif	/* CONFIG_XFS_VNODE_TRACING */
+
+
+static void	printinode(struct inode *ip)
+{
+	unsigned long	addr;
+
+
+	if (ip == NULL)
+		return;
+
+	kdb_printf(" i_ino = %lu i_count = %u i_dev = 0x%x i_size %Ld\n",
+					ip->i_ino, atomic_read(&ip->i_count),
+					kdev_t_to_nr(ip->i_dev), ip->i_size);
+
+	kdb_printf(
+		" i_mode = 0x%x	 i_nlink = %d  i_rdev = 0x%x i_state = 0x%lx\n",
+					ip->i_mode, ip->i_nlink,
+					kdev_t_to_nr(ip->i_rdev), ip->i_state);
+
+	kdb_printf(" i_hash.nxt = 0x%p i_hash.prv = 0x%p\n",
+					ip->i_hash.next, ip->i_hash.prev);
+	kdb_printf(" i_list.nxt = 0x%p i_list.prv = 0x%p\n",
+					ip->i_list.next, ip->i_list.prev);
+	kdb_printf(" i_dentry.nxt = 0x%p i_dentry.prv = 0x%p\n",
+					ip->i_dentry.next,
+					ip->i_dentry.prev);
+
+	addr = (unsigned long)ip;
+
+	kdb_printf(" i_sb = 0x%p i_op = 0x%p i_data = 0x%lx nrpages = %lu\n",
+					ip->i_sb, ip->i_op,
+					addr + offsetof(struct inode, i_data),
+					ip->i_data.nrpages);
+
+	kdb_printf("  vnode ptr 0x%p\n", LINVFS_GET_VP(ip));
+}
+
+
+static int	kdbm_vn(
+	int	argc,
+	const char **argv,
+	const char **envp,
+	struct pt_regs *regs)
+{
+	int		diag;
+	int		nextarg = 1;
+/*	char		*symname; */
+	long		offset = 0;
+	unsigned long	addr;
+	struct inode	*ip;
+/*	bhv_desc_t	*bh; */
+#ifdef	CONFIG_XFS_VNODE_TRACING
+	ktrace_entry_t	*ktep;
+	ktrace_snap_t	kts;
+#endif
+	vnode_t		*vp;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+
+	if (diag)
+		return diag;
+
+	vp = (vnode_t *)addr;
+
+	ip = LINVFS_GET_IP(vp);
+
+	kdb_printf("--> Inode @ 0x%p\n", ip);
+	printinode(ip);
+
+	kdb_printf("--> Vnode @ 0x%p\n", vp);
+	printvnode(vp);
+
+#ifdef	CONFIG_XFS_VNODE_TRACING
+
+	kdb_printf("--> Vntrace @ 0x%p/0x%p\n", vp, vp->v_trace);
+
+	if (vp->v_trace == NULL)
+		return 0;
+
+	ktep = ktrace_first(vp->v_trace, &kts);
+
+	while (ktep != NULL) {
+		if (vn_trace_pr_entry(ktep))
+			kdb_printf("\n");
+
+		ktep = ktrace_next(vp->v_trace, &kts);
+	}
+#endif	/* CONFIG_XFS_VNODE_TRACING */
+
+	return 0;
+}
+
+
+/* pagebuf stuff */
+
+static char	*pb_flag_vals[] = {
+	"READ", "WRITE", "MAPPED", "PARTIAL",
+	"ASYNC", "NONE", "DELWRI", "FREED", "SYNC",
+	"MAPPABLE", "STALE", "FS_MANAGED", "RELEASE",
+	"LOCK", "TRYLOCK", "ALLOCATE", "FILE_ALLOCATE", "DONT_BLOCK",
+	"DIRECT", "LOCKABLE", "NEXT_KEY", "ENTER_PAGES",
+	"ALL_PAGES_MAPPED", "SOME_INVALID_PAGES", "ADDR_ALLOCATED",
+	"MEM_ALLOCATED", "GRIO", "FORCEIO", "SHUTDOWN",
+	NULL };
+
+static char	*pbm_flag_vals[] = {
+	"EOF", "HOLE", "DELAY", "FLUSH_OVERLAPS",
+	"READAHEAD", "UNWRITTEN", "DONTALLOC", "NEW",
+	NULL };
+
+
+static char	*map_flags(unsigned long flags, char *mapping[])
+{
+	static	char	buffer[256];
+	int	index;
+	int	offset = 12;
+
+	buffer[0] = '\0';
+
+	for (index = 0; flags && mapping[index]; flags >>= 1, index++) {
+		if (flags & 1) {
+			if ((offset + strlen(mapping[index]) + 1) >= 80) {
+				strcat(buffer, "\n	      ");
+				offset = 12;
+			} else if (offset > 12) {
+				strcat(buffer, " ");
+				offset++;
+			}
+			strcat(buffer, mapping[index]);
+			offset += strlen(mapping[index]);
+		}
+	}
+
+	return (buffer);
+}
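+/*
+ * Note that map_flags() formats into a static buffer and so is not
+ * reentrant; that is fine here because kdb runs single-threaded
+ * with the machine stopped.
+ */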
+
+static char	*pb_flags(page_buf_flags_t pb_flag)
+{
+	return(map_flags((unsigned long) pb_flag, pb_flag_vals));
+}
+
+static int
+kdbm_pb_flags(int argc, const char **argv, const char **envp, struct pt_regs *regs)
+{
+	unsigned long flags;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+
+	diag = kdbgetularg(argv[1], &flags);
+	if (diag)
+		return diag;
+
+	kdb_printf("pb flags 0x%lx = %s\n", flags, pb_flags(flags));
+
+	return 0;
+}
+
+static int
+kdbm_pb(int argc, const char **argv, const char **envp, struct pt_regs *regs)
+{
+	page_buf_private_t bp;
+	unsigned long addr;
+	long	offset=0;
+	int nextarg;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+
+	nextarg = 1;
+	if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs)) ||
+	    (diag = kdb_getarea(bp, addr)))
+		return diag;
+
+	kdb_printf("page_buf_t at 0x%lx\n", addr);
+	kdb_printf("  pb_flags %s\n", pb_flags(bp.pb_common.pb_flags));
+	kdb_printf("  pb_target 0x%p pb_hold %d pb_next 0x%p pb_prev 0x%p\n",
+		   bp.pb_common.pb_target, bp.pb_common.pb_hold.counter,
+		   bp.pb_common.pb_list.next, bp.pb_common.pb_list.prev);
+	kdb_printf("  pb_hash_index %d pb_hash_next 0x%p pb_hash_prev 0x%p\n",
+		   bp.pb_common.pb_hash_index,
+		   bp.pb_common.pb_hash_list.next,
+		   bp.pb_common.pb_hash_list.prev);
+	kdb_printf("  pb_file_offset 0x%llx pb_buffer_length 0x%llx pb_addr 0x%p\n",
+		   (unsigned long long) bp.pb_common.pb_file_offset,
+		   (unsigned long long) bp.pb_common.pb_buffer_length,
+		   bp.pb_common.pb_addr);
+	kdb_printf("  pb_bn 0x%Lx pb_count_desired 0x%lx\n",
+		   bp.pb_common.pb_bn,
+		   (unsigned long) bp.pb_common.pb_count_desired);
+	kdb_printf("  pb_io_remaining %d pb_error %u\n",
+		   bp.pb_io_remaining.counter, bp.pb_common.pb_error);
+	kdb_printf("  pb_page_count %u pb_offset 0x%x pb_pages 0x%p\n",
+		bp.pb_common.pb_page_count, bp.pb_common.pb_offset,
+		bp.pb_common.pb_pages);
+#ifdef PAGEBUF_LOCK_TRACKING
+	kdb_printf("  pb_iodonesema (%d,%d) pb_sema (%d,%d) pincount (%d) last holder %d\n",
+		   bp.pb_common.pb_iodonesema.count.counter,
+		   bp.pb_common.pb_iodonesema.sleepers,
+		   bp.pb_sema.count.counter, bp.pb_sema.sleepers,
+		   bp.pb_pin_count.counter, bp.pb_last_holder);
+#else
+	kdb_printf("  pb_iodonesema (%d,%d) pb_sema (%d,%d) pincount (%d)\n",
+		   bp.pb_common.pb_iodonesema.count.counter,
+		   bp.pb_common.pb_iodonesema.sleepers,
+		   bp.pb_sema.count.counter, bp.pb_sema.sleepers,
+		   bp.pb_pin_count.counter);
+#endif
+	if (bp.pb_common.pb_fspriv || bp.pb_common.pb_fspriv2) {
+		kdb_printf("  pb_fspriv 0x%p pb_fspriv2 0x%p\n",
+			   bp.pb_common.pb_fspriv, bp.pb_common.pb_fspriv2);
+	}
+
+	return 0;
+}
+
+/* XXXXXXXXXXXXXXXXXXXXXX */
+/* The start of this deliberately looks like a read_descriptor_t in layout */
+typedef struct {
+	read_descriptor_t io_rdesc;
+
+	/* 0x10 */
+	page_buf_rw_t io_dir;	/* read or write */
+	loff_t io_offset;	/* Starting offset of I/O */
+	int io_iovec_nr;	/* Number of entries in iovec */
+
+	/* 0x20 */
+	struct iovec **io_iovec;	/* iovec list indexed by iovec_index */
+	loff_t io_iovec_offset; /* offset into current iovec. */
+	int io_iovec_index;	/* current iovec being processed */
+	unsigned int io_sshift; /* sector bit shift */
+	loff_t io_i_size;	/* size of the file */
+} pb_io_desc_t;
+
+static int
+kdbm_pbiodesc(int argc, const char **argv, const char **envp,
+	struct pt_regs *regs)
+{
+	pb_io_desc_t	pbio;
+	unsigned long addr;
+	long	offset=0;
+	int nextarg;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+
+	nextarg = 1;
+	if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs)) ||
+	    (diag = kdb_getarea(pbio, addr)))
+		return diag;
+
+	kdb_printf("pb_io_desc_t at 0x%lx\n", addr);
+	kdb_printf("  io_rdesc [ written 0x%lx count 0x%lx buf 0x%p error %d ]\n",
+			(unsigned long) pbio.io_rdesc.written,
+			(unsigned long) pbio.io_rdesc.count,
+			pbio.io_rdesc.buf, pbio.io_rdesc.error);
+
+	kdb_printf("  io_dir %d io_offset 0x%Lx io_iovec_nr 0x%d\n",
+			pbio.io_dir, pbio.io_offset, pbio.io_iovec_nr);
+
+	kdb_printf("  io_iovec 0x%p io_iovec_offset 0x%Lx io_iovec_index 0x%d\n",
+		pbio.io_iovec, pbio.io_iovec_offset, pbio.io_iovec_index);
+
+	kdb_printf("  io_sshift 0x%d io_i_size 0x%Lx\n",
+		pbio.io_sshift, pbio.io_i_size);
+
+	return 0;
+}
+
+static int
+kdbm_pbmap(int argc, const char **argv, const char **envp,
+	struct pt_regs *regs)
+{
+	page_buf_bmap_t pbm;
+	unsigned long addr;
+	long	offset=0;
+	int nextarg;
+	int diag;
+
+	if (argc != 1)
+		return KDB_ARGCOUNT;
+
+	nextarg = 1;
+	if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs)) ||
+	    (diag = kdb_getarea(pbm, addr)))
+		return diag;
+
+	kdb_printf("page_buf_bmap_t at 0x%lx\n", addr);
+	kdb_printf("  pbm_bn 0x%llx pbm_offset 0x%Lx pbm_delta 0x%lx pbm_bsize 0x%lx\n",
+		(long long) pbm.pbm_bn, pbm.pbm_offset,
+		(unsigned long) pbm.pbm_delta, (unsigned long) pbm.pbm_bsize);
+
+	kdb_printf("  pbm_flags %s\n", map_flags(pbm.pbm_flags, pbm_flag_vals));
+
+	return 0;
+}
+
+#ifdef PAGEBUF_TRACE
+# ifdef __PAGEBUF_TRACE__
+# undef __PAGEBUF_TRACE__
+# undef PB_DEFINE_TRACES
+# undef PB_TRACE_START
+# undef PB_TRACE_REC
+# undef PB_TRACE_END
+# endif
+#include "pagebuf/page_buf_trace.h"
+
+#define EV_SIZE (sizeof(event_names)/sizeof(char *))
+
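+/*
+ * Walk the circular pagebuf trace buffer from oldest to newest entry.
+ * "match" is overloaded: a value smaller than PB_TRACE_BUFSIZE means
+ * "print only the last <match> valid entries"; anything larger is taken
+ * as a page_buf_t address to filter on.  A non-NULL event_match selects
+ * entries by event name, and a nonzero offset selects entries whose
+ * trace offset equals it once the mask bits are cleared.
+ */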
+void
+pb_trace_core(
+	unsigned long match,
+	char	*event_match,
+	unsigned long long offset,
+	long long mask)
+{
+	extern struct pagebuf_trace_buf pb_trace;
+	int i, total, end;
+	pagebuf_trace_t *trace;
+	char	*event;
+	char	value[10];
+
+	end = pb_trace.start - 1;
+	if (end < 0)
+		end = PB_TRACE_BUFSIZE - 1;
+
+	if (match && (match < PB_TRACE_BUFSIZE)) {
+		for (i = pb_trace.start, total = 0; i != end; i = CIRC_INC(i)) {
+			trace = &pb_trace.buf[i];
+			if (trace->pb == 0)
+				continue;
+			total++;
+		}
+		total = total - match;
+		for (i = pb_trace.start; i != end && total; i = CIRC_INC(i)) {
+			trace = &pb_trace.buf[i];
+			if (trace->pb == 0)
+				continue;
+			total--;
+		}
+		match = 0;
+	} else
+		i = pb_trace.start;
+	for ( ; i != end; i = CIRC_INC(i)) {
+		trace = &pb_trace.buf[i];
+
+		if (offset) {
+			if ((trace->offset & ~mask) != offset)
+				continue;
+		}
+
+		if (trace->pb == 0)
+			continue;
+
+		if ((match != 0) && (trace->pb != match))
+			continue;
+
+		if ((trace->event < EV_SIZE-1) && event_names[trace->event]) {
+			event = event_names[trace->event];
+		} else if (trace->event == EV_SIZE) {
+			event = (char *)trace->misc;
+		} else {
+			event = value;
+			sprintf(value, "%8d", trace->event);
+		}
+
+		if (event_match && strcmp(event, event_match)) {
+			continue;
+		}
+
+		kdb_printf("pb 0x%lx [%s] (hold %u lock %d) misc 0x%p",
+			   trace->pb, event,
+			   trace->hold, trace->lock_value,
+			   trace->misc);
+		kdb_symbol_print((unsigned int)trace->ra, NULL,
+			KDB_SP_SPACEB|KDB_SP_PAREN|KDB_SP_NEWLINE);
+		kdb_printf("	offset 0x%Lx size 0x%x task 0x%p\n",
+			   trace->offset, trace->size, trace->task);
+		kdb_printf("	flags: %s\n",
+			   pb_flags(trace->flags));
+	}
+}
+
+
+static int
+kdbm_pbtrace_offset(int argc, const char **argv, const char **envp,
+	struct pt_regs *regs)
+{
+	long mask = 0;
+	unsigned long offset = 0;
+	int diag;
+
+	if (argc > 2)
+		return KDB_ARGCOUNT;
+
+	if (argc > 0) {
+		diag = kdbgetularg(argv[1], &offset);
+		if (diag)
+			return diag;
+	}
+
+	if (argc > 1) {
+		diag = kdbgetularg(argv[2], &mask);
+		if (diag)
+			return diag;
+	}
+
+	pb_trace_core(0, NULL, (unsigned long long)offset,
+			       (long long)mask);	/* sign-extend the mask */
+	return 0;
+}
+
+static int
+kdbm_pbtrace(int argc, const char **argv, const char **envp,
+	struct pt_regs *regs)
+{
+	unsigned long addr = 0;
+	int diag, nextarg;
+	long offset = 0;
+	char	*event_match = NULL;
+
+	if (argc > 1)
+		return KDB_ARGCOUNT;
+
+	if (argc == 1) {
+		if (isalpha(argv[1][0])) {
+			event_match = (char *)argv[1];
+			printk("event match on \"%s\"\n", event_match);
+			argc = 0;
+		} else {
+			nextarg = 1;
+			diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs);
+			if (diag) {
+				printk("failed to parse %s as a number\n",
+								argv[1]);
+				return diag;
+			}
+		}
+	}
+
+	pb_trace_core(addr, event_match, 0LL, 0LL);
+	return 0;
+}
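+
+/*
+ * Illustrative usage from the kdb prompt (the address and event name
+ * below are examples only):
+ *   kdb> pbtrace                  dump every valid trace entry
+ *   kdb> pbtrace 0xc409bd40       entries for one page_buf_t address
+ *   kdb> pbtrace 50               only the last 50 entries
+ *   kdb> pbtrace get              entries whose event name is "get"
+ */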
+
+#else	/* PAGEBUF_TRACE */
+static int
+kdbm_pbtrace(int argc, const char **argv, const char **envp,
+	struct pt_regs *regs)
+{
+	kdb_printf("pagebuf tracing not compiled in\n");
+
+	return 0;
+}
+#endif	/* PAGEBUF_TRACE */
+
+static struct xif {
+	char	*name;
+	int	(*func)(int, const char **, const char **, struct pt_regs *);
+	char	*args;
+	char	*help;
+} xfsidbg_funcs[] = {
+  {  "vn",	kdbm_vn,	"<vnode>", "Dump inode/vnode/trace"},
+  {  "vnode",	kdbm_vnode,	"<vnode>", "Dump vnode"},
+#ifdef	CONFIG_XFS_VNODE_TRACING
+  {  "vntrace", kdbm_vntrace,	"<vntrace>", "Dump vnode Trace"},
+  {  "vntraceaddr",	kdbm_vntraceaddr, "<vntrace>", "Dump vnode Trace by Address"},
+#endif	/* CONFIG_XFS_VNODE_TRACING */
+  {  "xagf",	kdbm_xfs_xagf,	"<agf>",
+				"Dump XFS allocation group freespace" },
+  {  "xagi",	kdbm_xfs_xagi,	"<agi>",
+				"Dump XFS allocation group inode" },
+  {  "xail",	kdbm_xfs_xaildump,	"<xfs_mount_t>",
+				"Dump XFS AIL for a mountpoint" },
+  {  "xalloc",	kdbm_xfs_xalloc,	"<xfs_alloc_arg_t>",
+				"Dump XFS allocation args structure" },
+  {  "xattrcx", kdbm_xfs_xattrcontext,	"<xfs_attr_list_context_t>",
+				"Dump XFS attr_list context struct"},
+  {  "xattrlf", kdbm_xfs_xattrleaf,	"<xfs_attr_leafblock_t>",
+				"Dump XFS attribute leaf block"},
+  {  "xattrsf", kdbm_xfs_xattrsf,	"<xfs_attr_shortform_t>",
+				"Dump XFS attribute shortform"},
+  {  "xbirec",	kdbm_xfs_xbirec,	"<xfs_bmbt_irec_t",
+				"Dump XFS bmap incore record"},
+  {  "xbmalla", kdbm_xfs_xbmalla,	"<xfs_bmalloca_t>",
+				"Dump XFS bmalloc args structure"},
+  {  "xbrec",	kdbm_xfs_xbrec,		"<xfs_bmbt_rec_64_t",
+				"Dump XFS bmap record"},
+  {  "xbroot",	kdbm_xfs_xbroot,	"<xfs_inode_t>",
+				"Dump XFS bmap btree root (data)"},
+  {  "xbroota", kdbm_xfs_xbroota,	"<xfs_inode_t>",
+				"Dump XFS bmap btree root (attr)"},
+  {  "xbtcur",	kdbm_xfs_xbtcur,	"<xfs_btree_cur_t>",
+				"Dump XFS btree cursor"},
+  {  "xbuf",	kdbm_xfs_xbuf,		"<xfs_buf_t>",
+				"Dump XFS data from a buffer"},
+  {  "xchash",	kdbm_xfs_xchash,	"<xfs_mount_t>",
+				"Dump XFS cluster hash"},
+  {  "xchlist", kdbm_xfs_xchashlist,	"<xfs_chashlist_t>",
+				"Dump XFS cluster hash list"},
+  {  "xd2free", kdbm_xfs_xdir2free,	"<xfs_dir2_free_t>",
+				"Dump XFS directory v2 freemap"},
+  {  "xdaargs", kdbm_xfs_xdaargs,	"<xfs_da_args_t>",
+				"Dump XFS dir/attr args structure"},
+  {  "xdabuf",	kdbm_xfs_xdabuf,	"<xfs_dabuf_t>",
+				"Dump XFS dir/attr buf structure"},
+  {  "xdanode", kdbm_xfs_xdanode,	"<xfs_da_intnode_t>",
+				"Dump XFS dir/attr node block"},
+  {  "xdastat", kdbm_xfs_xdastate,	"<xfs_da_state_t>",
+				"Dump XFS dir/attr state_blk struct"},
+  {  "xdelay",	kdbm_xfs_delayed_blocks,	"<xfs_mount_t>",
+				"Dump delayed block totals"},
+  {  "xdirlf",	kdbm_xfs_xdirleaf,	"<xfs_dir_leafblock_t>",
+				"Dump XFS directory leaf block"},
+  {  "xdirsf",	kdbm_xfs_xdirsf,	"<xfs_dir_shortform_t>",
+				"Dump XFS directory shortform"},
+  {  "xdir2sf", kdbm_xfs_xdir2sf,	"<xfs_dir2_sf_t>",
+				"Dump XFS directory v2 shortform"},
+  {  "xdiskdq", kdbm_xfs_xqm_diskdq,	"<xfs_disk_dquot_t>",
+				"Dump XFS ondisk dquot (quota) struct"},
+  {  "xdqatt",	kdbm_xfs_xqm_dqattached_inos,	"<xfs_mount_t>",
+				 "All incore inodes with dquots"},
+  {  "xdqinfo", kdbm_xfs_xqm_tpdqinfo,	"<xfs_trans_t>",
+				"Dump dqinfo structure of a trans"},
+  {  "xdquot",	kdbm_xfs_xqm_dquot,	"<xfs_dquot_t>",
+				"Dump XFS dquot (quota) structure"},
+  {  "xexlist", kdbm_xfs_xexlist,	"<xfs_inode_t>",
+				"Dump XFS bmap extents in inode"},
+  {  "xflist",	kdbm_xfs_xflist,	"<xfs_bmap_free_t>",
+				"Dump XFS to-be-freed extent list"},
+  {  "xhelp",	kdbm_xfs_xhelp,		"",
+				"Print idbg-xfs help"},
+  {  "xicall",	kdbm_xfs_xiclogall,	"<xlog_in_core_t>",
+				"Dump All XFS in-core logs"},
+  {  "xiclog",	kdbm_xfs_xiclog,	"<xlog_in_core_t>",
+				"Dump XFS in-core log"},
+  {  "xihash",	kdbm_xfs_xihash,	"<xfs_mount_t>",
+				"Dump XFS inode hash statistics"},
+  {  "xinodes", kdbm_xfs_xinodes,	"<xfs_mount_t>",
+				"Dump XFS inodes per mount"},
+  {  "xquiesce",kdbm_xfs_xinodes_quiesce, "<xfs_mount_t>",
+				"Dump non-quiesced XFS inodes per mount"},
+  {  "xl_rcit", kdbm_xfs_xlog_ritem,	"<xlog_recover_item_t>",
+				"Dump XFS recovery item"},
+  {  "xl_rctr", kdbm_xfs_xlog_rtrans,	"<xlog_recover_t>",
+				"Dump XFS recovery transaction"},
+  {  "xl_rctr2",kdbm_xfs_xlog_rtrans_entire,	"<xlog_recover_t>",
+				"Dump entire recovery transaction"},
+  {  "xl_tic",	kdbm_xfs_xlog_tic,	"<xlog_ticket_t>",
+				"Dump XFS log ticket"},
+  {  "xlog",	kdbm_xfs_xlog,	"<xlog_t>",
+				"Dump XFS log"},
+  {  "xlogcb",	kdbm_xfs_xiclogcb,	"<xlog_in_core_t>",
+				"Dump XFS in-core log callbacks"},
+  {  "xlogitm", kdbm_xfs_xlogitem,	"<xfs_log_item_t>",
+				"Dump XFS log item structure"},
+  {  "xmount",	kdbm_xfs_xmount,	"<xfs_mount_t>",
+				"Dump XFS mount structure"},
+  {  "xnode",	kdbm_xfs_xnode,		"<xfs_inode_t>",
+				"Dump XFS inode"},
+  {  "xiocore", kdbm_xfs_xcore,		"<xfs_iocore_t>",
+				"Dump XFS iocore"},
+  {  "xperag",	kdbm_xfs_xperag,	"<xfs_mount_t>",
+				"Dump XFS per-allocation group data"},
+  {  "xqinfo",	kdbm_xfs_xqm_qinfo,	"<xfs_mount_t>",
+				"Dump mount->m_quotainfo structure"},
+#ifdef	CONFIG_XFS_QUOTA
+  {  "xqm",	kdbm_xfs_xqm,		"",
+				"Dump XFS quota manager structure"},
+  {  "xqmfree", kdbm_xfs_xqm_freelist,	"",
+				"Dump XFS global freelist of dquots"},
+  {  "xqmhtab", kdbm_xfs_xqm_htab,	"",
+				"Dump XFS hashtable of dquots"},
+#endif	/* CONFIG_XFS_QUOTA */
+  {  "xqmplist",kdbm_xfs_xqm_mplist,	"<xfs_mount_t>",
+				"Dump XFS all dquots of a f/s"},
+  {  "xsb",	kdbm_xfs_xsb,		"<xfs_sb_t> <cnv>",
+				"Dump XFS superblock"},
+  {  "xtp",	kdbm_xfs_xtp,		"<xfs_trans_t>",
+				"Dump XFS transaction structure"},
+  {  "xtrres",	kdbm_xfs_xtrans_res,	"<xfs_mount_t>",
+				"Dump XFS reservation values"},
+  {  0,		0,	0 }
+};
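+
+/*
+ * Each entry above is registered as a kdb command by xfsidbg_init()
+ * below.  Illustrative session (the address is an example only):
+ *   kdb> xmount 0xf7c2e000       dump an xfs_mount_t
+ *   kdb> xhelp                   list all xfs debugger commands
+ */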
+
+static int
+__init xfsidbg_init(void)
+{
+	struct xif	*p;
+
+	for (p = xfsidbg_funcs; p->name; p++)
+		kdb_register(p->name, p->func, p->args, p->help, 0);
+
+	kdb_register("pb", kdbm_pb, "<vaddr>", "Display page_buf_t", 0);
+	kdb_register("pbflags", kdbm_pb_flags, "<flags>",
+					"Display page buf flags", 0);
+	kdb_register("pbiodesc", kdbm_pbiodesc, "<pb_io_desc_t *>",
+					"Display I/O Descriptor", 0);
+	kdb_register("pbmap", kdbm_pbmap, "<page_buf_bmap_t *>",
+					"Display Bmap", 0);
+	kdb_register("pbtrace", kdbm_pbtrace, "<vaddr>|<count>",
+					"page_buf_t trace", 0);
+#ifdef PAGEBUF_TRACE
+	kdb_register("pboffset", kdbm_pbtrace_offset, "<addr> [<mask>]",
+					"page_buf_t trace", 0);
+#endif
+	return 0;
+}
+
+static void
+__exit xfsidbg_exit(void)
+{
+	struct xif	*p;
+
+	for (p = xfsidbg_funcs; p->name; p++)
+		kdb_unregister(p->name);
+
+	kdb_unregister("pb");
+	kdb_unregister("pbflags");
+	kdb_unregister("pbmap");
+	kdb_unregister("pbiodesc");
+	kdb_unregister("pbtrace");
+#ifdef PAGEBUF_TRACE
+	kdb_unregister("pboffset");
+#endif
+
+}
+
+/*
+ * Argument to xfs_alloc routines, for allocation type.
+ */
+static char *xfs_alloctype[] = {
+	"any_ag", "first_ag", "start_ag", "this_ag",
+	"start_bno", "near_bno", "this_bno"
+};
+
+
+/*
+ * Prototypes for static functions.
+ */
+static void xfs_broot(xfs_inode_t *ip, xfs_ifork_t *f);
+static void xfs_btalloc(xfs_alloc_block_t *bt, int bsz);
+static void xfs_btbmap(xfs_bmbt_block_t *bt, int bsz);
+static void xfs_btino(xfs_inobt_block_t *bt, int bsz);
+static void xfs_buf_item_print(xfs_buf_log_item_t *blip, int summary);
+static void xfs_convert_extent(xfs_bmbt_rec_64_t *rp, xfs_dfiloff_t *op,
+			xfs_dfsbno_t *sp, xfs_dfilblks_t *cp, int *fp);
+static void xfs_dastate_path(xfs_da_state_path_t *p);
+static void xfs_dir2data(void *addr, int size);
+static void xfs_dir2leaf(xfs_dir2_leaf_t *leaf, int size);
+static void xfs_dquot_item_print(xfs_dq_logitem_t *lip, int summary);
+static void xfs_efd_item_print(xfs_efd_log_item_t *efdp, int summary);
+static void xfs_efi_item_print(xfs_efi_log_item_t *efip, int summary);
+static char *xfs_fmtformat(xfs_dinode_fmt_t f);
+static char *xfs_fmtfsblock(xfs_fsblock_t bno, xfs_mount_t *mp);
+static char *xfs_fmtino(xfs_ino_t ino, xfs_mount_t *mp);
+static char *xfs_fmtlsn(xfs_lsn_t *lsnp);
+static char *xfs_fmtmode(int m);
+static char *xfs_fmtsize(size_t i);
+static char *xfs_fmtuuid(uuid_t *);
+static void xfs_inode_item_print(xfs_inode_log_item_t *ilip, int summary);
+static void xfs_inodebuf(xfs_buf_t *bp);
+static void xfs_prdinode(xfs_dinode_t *di, int coreonly, int convert);
+static void xfs_prdinode_core(xfs_dinode_core_t *dip, int convert);
+static void xfs_qoff_item_print(xfs_qoff_logitem_t *lip, int summary);
+static void xfs_xexlist_fork(xfs_inode_t *ip, int whichfork);
+static void xfs_xnode_fork(char *name, xfs_ifork_t *f);
+
+/*
+ * Static functions.
+ */
+
+
+/*
+ * Print an xfs in-inode bmap btree root.
+ */
+static void
+xfs_broot(xfs_inode_t *ip, xfs_ifork_t *f)
+{
+	xfs_bmbt_block_t	*broot;
+	int			format;
+	int			i;
+	xfs_bmbt_key_t		*kp;
+	xfs_bmbt_ptr_t		*pp;
+
+	format = f == &ip->i_df ? ip->i_d.di_format : ip->i_d.di_aformat;
+	if ((f->if_flags & XFS_IFBROOT) == 0 ||
+	    format != XFS_DINODE_FMT_BTREE) {
+		kdb_printf("inode 0x%p not btree format\n", ip);
+		return;
+	}
+	broot = f->if_broot;
+	kdb_printf("block @0x%p magic %x level %d numrecs %d\n",
+		broot, INT_GET(broot->bb_magic, ARCH_CONVERT), INT_GET(broot->bb_level, ARCH_CONVERT), INT_GET(broot->bb_numrecs, ARCH_CONVERT));
+	kp = XFS_BMAP_BROOT_KEY_ADDR(broot, 1, f->if_broot_bytes);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(broot, 1, f->if_broot_bytes);
+	for (i = 1; i <= INT_GET(broot->bb_numrecs, ARCH_CONVERT); i++)
+		kdb_printf("\t%d: startoff %Ld ptr %Lx %s\n",
+			i, INT_GET(kp[i - 1].br_startoff, ARCH_CONVERT), INT_GET(pp[i - 1], ARCH_CONVERT),
+			xfs_fmtfsblock(INT_GET(pp[i - 1], ARCH_CONVERT), ip->i_mount));
+}
+
+/*
+ * Print allocation btree block.
+ */
+static void
+xfs_btalloc(xfs_alloc_block_t *bt, int bsz)
+{
+	int i;
+
+	kdb_printf("magic 0x%x level %d numrecs %d leftsib 0x%x rightsib 0x%x\n",
+		INT_GET(bt->bb_magic, ARCH_CONVERT), INT_GET(bt->bb_level, ARCH_CONVERT), INT_GET(bt->bb_numrecs, ARCH_CONVERT),
+		INT_GET(bt->bb_leftsib, ARCH_CONVERT), INT_GET(bt->bb_rightsib, ARCH_CONVERT));
+	if (INT_ISZERO(bt->bb_level, ARCH_CONVERT)) {
+		for (i = 1; i <= INT_GET(bt->bb_numrecs, ARCH_CONVERT); i++) {
+			xfs_alloc_rec_t *r;
+
+			r = XFS_BTREE_REC_ADDR(bsz, xfs_alloc, bt, i, 0);
+			kdb_printf("rec %d startblock 0x%x blockcount %d\n",
+				i, INT_GET(r->ar_startblock, ARCH_CONVERT), INT_GET(r->ar_blockcount, ARCH_CONVERT));
+		}
+	} else {
+		int mxr;
+
+		mxr = XFS_BTREE_BLOCK_MAXRECS(bsz, xfs_alloc, 0);
+		for (i = 1; i <= INT_GET(bt->bb_numrecs, ARCH_CONVERT); i++) {
+			xfs_alloc_key_t *k;
+			xfs_alloc_ptr_t *p;
+
+			k = XFS_BTREE_KEY_ADDR(bsz, xfs_alloc, bt, i, mxr);
+			p = XFS_BTREE_PTR_ADDR(bsz, xfs_alloc, bt, i, mxr);
+			kdb_printf("key %d startblock 0x%x blockcount %d ptr 0x%x\n",
+				i, INT_GET(k->ar_startblock, ARCH_CONVERT), INT_GET(k->ar_blockcount, ARCH_CONVERT), *p);
+		}
+	}
+}
+
+/*
+ * Print a bmap btree block.
+ */
+static void
+xfs_btbmap(xfs_bmbt_block_t *bt, int bsz)
+{
+	int i;
+
+	kdb_printf("magic 0x%x level %d numrecs %d leftsib %Lx ",
+		INT_GET(bt->bb_magic, ARCH_CONVERT),
+		INT_GET(bt->bb_level, ARCH_CONVERT),
+		INT_GET(bt->bb_numrecs, ARCH_CONVERT),
+		INT_GET(bt->bb_leftsib, ARCH_CONVERT));
+	kdb_printf("rightsib %Lx\n", INT_GET(bt->bb_rightsib, ARCH_CONVERT));
+	if (INT_ISZERO(bt->bb_level, ARCH_CONVERT)) {
+		for (i = 1; i <= INT_GET(bt->bb_numrecs, ARCH_CONVERT); i++) {
+			xfs_bmbt_rec_64_t *r;
+			xfs_dfiloff_t o;
+			xfs_dfsbno_t s;
+			xfs_dfilblks_t c;
+			int fl;
+
+			r = (xfs_bmbt_rec_64_t *)XFS_BTREE_REC_ADDR(bsz,
+				xfs_bmbt, bt, i, 0);
+			xfs_convert_extent(r, &o, &s, &c, &fl);
+			kdb_printf("rec %d startoff %Ld ", i, o);
+			kdb_printf("startblock %Lx ", s);
+			kdb_printf("blockcount %Ld flag %d\n", c, fl);
+		}
+	} else {
+		int mxr;
+
+		mxr = XFS_BTREE_BLOCK_MAXRECS(bsz, xfs_bmbt, 0);
+		for (i = 1; i <= INT_GET(bt->bb_numrecs, ARCH_CONVERT); i++) {
+			xfs_bmbt_key_t *k;
+			xfs_bmbt_ptr_t *p;
+
+			k = XFS_BTREE_KEY_ADDR(bsz, xfs_bmbt, bt, i, mxr);
+			p = XFS_BTREE_PTR_ADDR(bsz, xfs_bmbt, bt, i, mxr);
+			kdb_printf("key %d startoff %Ld ",
+				i, INT_GET(k->br_startoff, ARCH_CONVERT));
+			kdb_printf("ptr %Lx\n", INT_GET(*p, ARCH_CONVERT));
+		}
+	}
+}
+
+/*
+ * Print an inode btree block.
+ */
+static void
+xfs_btino(xfs_inobt_block_t *bt, int bsz)
+{
+	int i;
+
+	kdb_printf("magic 0x%x level %d numrecs %d leftsib 0x%x rightsib 0x%x\n",
+		INT_GET(bt->bb_magic, ARCH_CONVERT), INT_GET(bt->bb_level, ARCH_CONVERT), INT_GET(bt->bb_numrecs, ARCH_CONVERT),
+		INT_GET(bt->bb_leftsib, ARCH_CONVERT), INT_GET(bt->bb_rightsib, ARCH_CONVERT));
+	if (INT_ISZERO(bt->bb_level, ARCH_CONVERT)) {
+		for (i = 1; i <= INT_GET(bt->bb_numrecs, ARCH_CONVERT); i++) {
+			xfs_inobt_rec_t *r;
+
+			r = XFS_BTREE_REC_ADDR(bsz, xfs_inobt, bt, i, 0);
+			kdb_printf("rec %d startino 0x%x freecount %d, free %Lx\n",
+				i, INT_GET(r->ir_startino, ARCH_CONVERT), INT_GET(r->ir_freecount, ARCH_CONVERT),
+				INT_GET(r->ir_free, ARCH_CONVERT));
+		}
+	} else {
+		int mxr;
+
+		mxr = XFS_BTREE_BLOCK_MAXRECS(bsz, xfs_inobt, 0);
+		for (i = 1; i <= INT_GET(bt->bb_numrecs, ARCH_CONVERT); i++) {
+			xfs_inobt_key_t *k;
+			xfs_inobt_ptr_t *p;
+
+			k = XFS_BTREE_KEY_ADDR(bsz, xfs_inobt, bt, i, mxr);
+			p = XFS_BTREE_PTR_ADDR(bsz, xfs_inobt, bt, i, mxr);
+			kdb_printf("key %d startino 0x%x ptr 0x%x\n",
+				i, INT_GET(k->ir_startino, ARCH_CONVERT), INT_GET(*p, ARCH_CONVERT));
+		}
+	}
+}
+
+/*
+ * Print a buf log item.
+ */
+static void
+xfs_buf_item_print(xfs_buf_log_item_t *blip, int summary)
+{
+	static char *bli_flags[] = {
+		"hold",		/* 0x1 */
+		"dirty",	/* 0x2 */
+		"stale",	/* 0x4 */
+		"logged",	/* 0x8 */
+		"ialloc",	/* 0x10 */
+		0
+		};
+	static char *blf_flags[] = {
+		"inode",	/* 0x1 */
+		"cancel",	/* 0x2 */
+		0
+		};
+
+	if (summary) {
+		kdb_printf("buf 0x%p blkno 0x%Lx ", blip->bli_buf,
+			     blip->bli_format.blf_blkno);
+		printflags(blip->bli_flags, bli_flags, "flags:");
+		kdb_printf("\n	 ");
+		xfsidbg_xbuf_real(blip->bli_buf, 1);
+		return;
+	}
+	kdb_printf("buf 0x%p recur %d refcount %d flags:",
+		blip->bli_buf, blip->bli_recur,
+		atomic_read(&blip->bli_refcount));
+	printflags(blip->bli_flags, bli_flags, NULL);
+	kdb_printf("\n");
+	kdb_printf("size %d blkno 0x%Lx len 0x%x map size %d map 0x%p\n",
+		blip->bli_format.blf_size, blip->bli_format.blf_blkno,
+		(uint) blip->bli_format.blf_len, blip->bli_format.blf_map_size,
+		&(blip->bli_format.blf_data_map[0]));
+	kdb_printf("blf flags: ");
+	printflags((uint)blip->bli_format.blf_flags, blf_flags, NULL);
+#ifdef XFS_TRANS_DEBUG
+	kdb_printf("orig 0x%x logged 0x%x",
+		blip->bli_orig, blip->bli_logged);
+#endif
+	kdb_printf("\n");
+}
+
+/*
+ * Convert an external extent descriptor to internal form.
+ */
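+/*
+ * On-disk bmbt extent records pack four fields into two 64-bit words
+ * (l0 and l1), which the shifts and masks below unpack:
+ *   l0 bit  63      extent flag
+ *   l0 bits 62..9   startoff (54 bits)
+ *   l0 bits 8..0    startblock, high 9 bits
+ *   l1 bits 63..21  startblock, low 43 bits (52 bits in total)
+ *   l1 bits 20..0   blockcount (21 bits)
+ */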
+static void
+xfs_convert_extent(xfs_bmbt_rec_64_t *rp, xfs_dfiloff_t *op, xfs_dfsbno_t *sp,
+		   xfs_dfilblks_t *cp, int *fp)
+{
+	xfs_dfiloff_t o;
+	xfs_dfsbno_t s;
+	xfs_dfilblks_t c;
+	int flag;
+
+	flag = (int)(INT_GET(rp->l0, ARCH_CONVERT) >> (64 - 1));
+	o = ((xfs_fileoff_t)INT_GET(rp->l0, ARCH_CONVERT) &
+		(((__uint64_t)1 << (64 - 1)) - 1)) >> 9;
+	s = (((xfs_fsblock_t)INT_GET(rp->l0, ARCH_CONVERT) &
+		(((__uint64_t)1 << 9) - 1)) << 43) |
+	    ((xfs_fsblock_t)INT_GET(rp->l1, ARCH_CONVERT) >> 21);
+	c = (xfs_filblks_t)(INT_GET(rp->l1, ARCH_CONVERT) &
+		(((__uint64_t)1 << 21) - 1));
+	*op = o;
+	*sp = s;
+	*cp = c;
+	*fp = flag;
+}
+
+
+/*
+ * Print an xfs_da_state_path structure.
+ */
+static void
+xfs_dastate_path(xfs_da_state_path_t *p)
+{
+	int i;
+
+	kdb_printf("active %d\n", p->active);
+	for (i = 0; i < XFS_DA_NODE_MAXDEPTH; i++) {
+		kdb_printf(" blk %d bp 0x%p blkno 0x%x",
+			i, p->blk[i].bp, p->blk[i].blkno);
+		kdb_printf(" index %d hashval 0x%x ",
+			p->blk[i].index, (uint_t)p->blk[i].hashval);
+		switch(p->blk[i].magic) {
+		case XFS_DA_NODE_MAGIC:		kdb_printf("NODE\n");	break;
+		case XFS_DIR_LEAF_MAGIC:	kdb_printf("DIR\n");	break;
+		case XFS_ATTR_LEAF_MAGIC:	kdb_printf("ATTR\n");	break;
+		case XFS_DIR2_LEAFN_MAGIC:	kdb_printf("DIR2\n");	break;
+		default:			kdb_printf("type ??\n");	break;
+		}
+	}
+}
+
+
+/*
+ * Print an efd log item.
+ */
+static void
+xfs_efd_item_print(xfs_efd_log_item_t *efdp, int summary)
+{
+	int		i;
+	xfs_extent_t	*ep;
+
+	if (summary) {
+		kdb_printf("Extent Free Done: ID 0x%Lx nextents %d (at 0x%p)\n",
+				efdp->efd_format.efd_efi_id,
+				efdp->efd_format.efd_nextents, efdp);
+		return;
+	}
+	kdb_printf("size %d nextents %d next extent %d efip 0x%p\n",
+		efdp->efd_format.efd_size, efdp->efd_format.efd_nextents,
+		efdp->efd_next_extent, efdp->efd_efip);
+	kdb_printf("efi_id 0x%Lx\n", efdp->efd_format.efd_efi_id);
+	kdb_printf("efd extents:\n");
+	ep = &(efdp->efd_format.efd_extents[0]);
+	for (i = 0; i < efdp->efd_next_extent; i++, ep++) {
+		kdb_printf("	block %Lx len %d\n",
+			ep->ext_start, ep->ext_len);
+	}
+}
+
+/*
+ * Print an efi log item.
+ */
+static void
+xfs_efi_item_print(xfs_efi_log_item_t *efip, int summary)
+{
+	int		i;
+	xfs_extent_t	*ep;
+	static char *efi_flags[] = {
+		"recovered",	/* 0x1 */
+		"committed",	/* 0x2 */
+		"cancelled",	/* 0x4 */
+		0,
+		};
+
+	if (summary) {
+		kdb_printf("Extent Free Intention: ID 0x%Lx nextents %d (at 0x%p)\n",
+				efip->efi_format.efi_id,
+				efip->efi_format.efi_nextents, efip);
+		return;
+	}
+	kdb_printf("size %d nextents %d next extent %d\n",
+		efip->efi_format.efi_size, efip->efi_format.efi_nextents,
+		efip->efi_next_extent);
+	kdb_printf("id %Lx", efip->efi_format.efi_id);
+	printflags(efip->efi_flags, efi_flags, "flags :");
+	kdb_printf("\n");
+	kdb_printf("efi extents:\n");
+	ep = &(efip->efi_format.efi_extents[0]);
+	for (i = 0; i < efip->efi_next_extent; i++, ep++) {
+		kdb_printf("	block %Lx len %d\n",
+			ep->ext_start, ep->ext_len);
+	}
+}
+
+/*
+ * Format inode "format" into a static buffer & return it.
+ */
+static char *
+xfs_fmtformat(xfs_dinode_fmt_t f)
+{
+	static char *t[] = {
+		"dev",
+		"local",
+		"extents",
+		"btree",
+		"uuid"
+	};
+
+	return t[f];
+}
+
+/*
+ * Format fsblock number into a static buffer & return it.
+ */
+static char *
+xfs_fmtfsblock(xfs_fsblock_t bno, xfs_mount_t *mp)
+{
+	static char rval[50];
+
+	if (bno == NULLFSBLOCK)
+		sprintf(rval, "NULLFSBLOCK");
+	else if (ISNULLSTARTBLOCK(bno))
+		sprintf(rval, "NULLSTARTBLOCK(%Ld)", STARTBLOCKVAL(bno));
+	else if (mp)
+		sprintf(rval, "%Ld[%x:%x]", (xfs_dfsbno_t)bno,
+			XFS_FSB_TO_AGNO(mp, bno), XFS_FSB_TO_AGBNO(mp, bno));
+	else
+		sprintf(rval, "%Ld", (xfs_dfsbno_t)bno);
+	return rval;
+}
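+
+/*
+ * Example outputs: "NULLFSBLOCK" for a null block, "NULLSTARTBLOCK(7)"
+ * for a delayed-allocation placeholder, and "1234[2:56]" (fsbno, then
+ * [agno:agbno]) when a mount point is supplied.  Like the other xfs_fmt*
+ * helpers this returns a static buffer and is not reentrant.
+ */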
+
+/*
+ * Format inode number into a static buffer & return it.
+ */
+static char *
+xfs_fmtino(xfs_ino_t ino, xfs_mount_t *mp)
+{
+	static char rval[50];
+
+	if (mp)
+		sprintf(rval, "%llu[%x:%x:%x]",
+			(unsigned long long) ino,
+			XFS_INO_TO_AGNO(mp, ino),
+			XFS_INO_TO_AGBNO(mp, ino),
+			XFS_INO_TO_OFFSET(mp, ino));
+	else
+		sprintf(rval, "%llu", (unsigned long long) ino);
+	return rval;
+}
+
+/*
+ * Format an lsn for printing into a static buffer & return it.
+ */
+static char *
+xfs_fmtlsn(xfs_lsn_t *lsnp)
+{
+	uint		*wordp;
+	uint		*word2p;
+	static char	buf[24];	/* "[%u:%u]" worst case */
+
+	wordp = (uint *)lsnp;
+	word2p = wordp++;
+	sprintf(buf, "[%u:%u]", *wordp, *word2p);
+
+	return buf;
+}
+
+/*
+ * Format file mode into a static buffer & return it.
+ */
+static char *
+xfs_fmtmode(int m)
+{
+	static char rval[16];
+
+	sprintf(rval, "%c%c%c%c%c%c%c%c%c%c%c%c%c",
+		"?fc?dxb?r?l?S?m?"[(m & IFMT) >> 12],
+		m & ISUID ? 'u' : '-',
+		m & ISGID ? 'g' : '-',
+		m & ISVTX ? 'v' : '-',
+		m & IREAD ? 'r' : '-',
+		m & IWRITE ? 'w' : '-',
+		m & IEXEC ? 'x' : '-',
+		m & (IREAD >> 3) ? 'r' : '-',
+		m & (IWRITE >> 3) ? 'w' : '-',
+		m & (IEXEC >> 3) ? 'x' : '-',
+		m & (IREAD >> 6) ? 'r' : '-',
+		m & (IWRITE >> 6) ? 'w' : '-',
+		m & (IEXEC >> 6) ? 'x' : '-');
+	return rval;
+}
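+
+/*
+ * Example: a regular file with mode 0100644 formats as "r---rw-r--r--"
+ * (type letter, setuid/setgid/sticky, then owner/group/other rwx).
+ */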
+
+/*
+ * Format a size into a static buffer & return it.
+ */
+static char *
+xfs_fmtsize(size_t i)
+{
+	static char rval[20];
+
+	/* size_t is 32 bits in 32-bit kernel, 64 bits in 64-bit kernel */
+	sprintf(rval, "0x%lx", (unsigned long) i);
+	return rval;
+}
+
+/*
+ * Format a uuid into a static buffer & return it.
+ */
+static char *
+xfs_fmtuuid(uuid_t *uu)
+{
+	static char rval[40];
+	char	    *o		= rval;
+	unsigned char *i	= (unsigned char *)uu;
+	int	    b;
+
+	for (b = 0; b < 16; b++) {
+		o += sprintf(o, "%02x", *i++);
+		if (b == 3 || b == 5 || b == 7 || b == 9)
+			*o++ = '-';
+	}
+	*o = '\0';
+
+	return rval;
+}
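+
+/*
+ * Example: a zeroed uuid formats in the usual 8-4-4-4-12 grouping,
+ * "00000000-0000-0000-0000-000000000000".
+ */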
+
+/*
+ * Print an inode log item.
+ */
+static void
+xfs_inode_item_print(xfs_inode_log_item_t *ilip, int summary)
+{
+	static char *ili_flags[] = {
+		"hold",		/* 0x1 */
+		"iolock excl",	/* 0x2 */
+		"iolock shrd",	/* 0x4 */
+		0
+		};
+	static char *ilf_fields[] = {
+		"core",		/* 0x001 */
+		"ddata",	/* 0x002 */
+		"dexts",	/* 0x004 */
+		"dbroot",	/* 0x008 */
+		"dev",		/* 0x010 */
+		"uuid",		/* 0x020 */
+		"adata",	/* 0x040 */
+		"aext",		/* 0x080 */
+		"abroot",	/* 0x100 */
+		0
+		};
+
+	if (summary) {
+		kdb_printf("inode 0x%p logged %d ",
+			ilip->ili_inode, ilip->ili_logged);
+		printflags(ilip->ili_flags, ili_flags, "flags:");
+		printflags(ilip->ili_format.ilf_fields, ilf_fields, "format:");
+		printflags(ilip->ili_last_fields, ilf_fields, "lastfield:");
+		kdb_printf("\n");
+		return;
+	}
+	kdb_printf("inode 0x%p ino 0x%llu logged %d flags: ",
+		ilip->ili_inode, (unsigned long long) ilip->ili_format.ilf_ino,
+		ilip->ili_logged);
+	printflags(ilip->ili_flags, ili_flags, NULL);
+	kdb_printf("\n");
+	kdb_printf("ilock recur %d iolock recur %d ext buf 0x%p\n",
+		ilip->ili_ilock_recur, ilip->ili_iolock_recur,
+		ilip->ili_extents_buf);
+#ifdef XFS_TRANS_DEBUG
+	kdb_printf("root bytes %d root orig 0x%x\n",
+		ilip->ili_root_size, ilip->ili_orig_root);
+#endif
+	kdb_printf("size %d fields: ", ilip->ili_format.ilf_size);
+	printflags(ilip->ili_format.ilf_fields, ilf_fields, "formatfield");
+	kdb_printf(" last fields: ");
+	printflags(ilip->ili_last_fields, ilf_fields, "lastfield");
+	kdb_printf("\n");
+	kdb_printf(" flush lsn %s last lsn %s\n",
+		xfs_fmtlsn(&(ilip->ili_flush_lsn)),
+		xfs_fmtlsn(&(ilip->ili_last_lsn)));
+	kdb_printf("dsize %d, asize %d, rdev 0x%x\n",
+		ilip->ili_format.ilf_dsize,
+		ilip->ili_format.ilf_asize,
+		ilip->ili_format.ilf_u.ilfu_rdev);
+	kdb_printf("blkno 0x%Lx len 0x%x boffset 0x%x\n",
+		ilip->ili_format.ilf_blkno,
+		ilip->ili_format.ilf_len,
+		ilip->ili_format.ilf_boffset);
+}
+
+/*
+ * Print a dquot log item.
+ */
+/* ARGSUSED */
+static void
+xfs_dquot_item_print(xfs_dq_logitem_t *lip, int summary)
+{
+	kdb_printf("dquot 0x%p\n",
+		lip->qli_dquot);
+
+}
+
+/*
+ * Print a quotaoff log item.
+ */
+/* ARGSUSED */
+static void
+xfs_qoff_item_print(xfs_qoff_logitem_t *lip, int summary)
+{
+	kdb_printf("start qoff item 0x%p flags 0x%x\n",
+		lip->qql_start_lip, lip->qql_format.qf_flags);
+
+}
+
+/*
+ * Print buffer full of inodes.
+ */
+static void
+xfs_inodebuf(xfs_buf_t *bp)
+{
+	xfs_dinode_t *di;
+	int n, i;
+
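+	/* assumes the default 256-byte on-disk inode size */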
+	n = XFS_BUF_COUNT(bp) >> 8;
+	for (i = 0; i < n; i++) {
+		di = (xfs_dinode_t *)xfs_buf_offset(bp,
+					i * 256);
+		xfs_prdinode(di, 0, ARCH_CONVERT);
+	}
+}
+
+
+/*
+ * Print disk inode.
+ */
+static void
+xfs_prdinode(xfs_dinode_t *di, int coreonly, int convert)
+{
+	xfs_prdinode_core(&di->di_core, convert);
+	if (!coreonly)
+		kdb_printf("next_unlinked 0x%x u@0x%p\n",
+			INT_GET(di->di_next_unlinked, convert),
+			&di->di_u);
+}
+
+/*
+ * Print disk inode core.
+ */
+static void
+xfs_prdinode_core(xfs_dinode_core_t *dip, int convert)
+{
+	static char *diflags[] = {
+		"realtime",		/* XFS_DIFLAG_REALTIME */
+		"prealloc",		/* XFS_DIFLAG_PREALLOC */
+		NULL
+	};
+
+	kdb_printf("magic 0x%x mode 0%o (%s) version 0x%x format 0x%x (%s)\n",
+		INT_GET(dip->di_magic, convert),
+		INT_GET(dip->di_mode, convert),
+		xfs_fmtmode(INT_GET(dip->di_mode, convert)),
+		INT_GET(dip->di_version, convert),
+		INT_GET(dip->di_format, convert),
+		xfs_fmtformat(
+		    (xfs_dinode_fmt_t)INT_GET(dip->di_format, convert)));
+	kdb_printf("nlink 0x%x uid 0x%x gid 0x%x projid 0x%x\n",
+		INT_GET(dip->di_nlink, convert),
+		INT_GET(dip->di_uid, convert),
+		INT_GET(dip->di_gid, convert),
+		(uint)INT_GET(dip->di_projid, convert));
+	kdb_printf("atime 0x%x:%x mtime 0x%x:%x ctime 0x%x:%x\n",
+		INT_GET(dip->di_atime.t_sec, convert),
+		INT_GET(dip->di_atime.t_nsec, convert),
+		INT_GET(dip->di_mtime.t_sec, convert),
+		INT_GET(dip->di_mtime.t_nsec, convert),
+		INT_GET(dip->di_ctime.t_sec, convert),
+		INT_GET(dip->di_ctime.t_nsec, convert));
+	kdb_printf("size 0x%Lx ", INT_GET(dip->di_size, convert));
+	kdb_printf("nblocks %Ld extsize 0x%x nextents 0x%x anextents 0x%x\n",
+		INT_GET(dip->di_nblocks, convert),
+		INT_GET(dip->di_extsize, convert),
+		INT_GET(dip->di_nextents, convert),
+		INT_GET(dip->di_anextents, convert));
+	kdb_printf("forkoff %d aformat 0x%x (%s) dmevmask 0x%x dmstate 0x%x ",
+		INT_GET(dip->di_forkoff, convert),
+		INT_GET(dip->di_aformat, convert),
+		xfs_fmtformat(
+		    (xfs_dinode_fmt_t)INT_GET(dip->di_aformat, convert)),
+		INT_GET(dip->di_dmevmask, convert),
+		INT_GET(dip->di_dmstate, convert));
+	printflags(INT_GET(dip->di_flags, convert), diflags, "flags");
+	kdb_printf("gen 0x%x\n", INT_GET(dip->di_gen, convert));
+}
+
+/*
+ * Print xfs extent list for a fork.
+ */
+static void
+xfs_xexlist_fork(xfs_inode_t *ip, int whichfork)
+{
+	int nextents, i;
+	xfs_dfiloff_t o;
+	xfs_dfsbno_t s;
+	xfs_dfilblks_t c;
+	int flag;
+	xfs_ifork_t *ifp;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (ifp->if_flags & XFS_IFEXTENTS) {
+		nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_64_t);
+		kdb_printf("inode 0x%p %cf extents 0x%p nextents 0x%x\n",
+			ip, "da"[whichfork], ifp->if_u1.if_extents, nextents);
+		for (i = 0; i < nextents; i++) {
+			xfs_convert_extent(
+				(xfs_bmbt_rec_64_t *)&ifp->if_u1.if_extents[i],
+				&o, &s, &c, &flag);
+			kdb_printf(
+		"%d: startoff %Ld startblock %s blockcount %Ld flag %d\n",
+				i, o, xfs_fmtfsblock(s, ip->i_mount), c, flag);
+		}
+	}
+}
+
+static void
+xfs_xnode_fork(char *name, xfs_ifork_t *f)
+{
+	static char *tab_flags[] = {
+		"inline",	/* XFS_IFINLINE */
+		"extents",	/* XFS_IFEXTENTS */
+		"broot",	/* XFS_IFBROOT */
+		NULL
+	};
+	int *p;
+
+	kdb_printf("%s fork", name);
+	if (f == NULL) {
+		kdb_printf(" empty\n");
+		return;
+	} else
+		kdb_printf("\n");
+	kdb_printf(" bytes %s ", xfs_fmtsize(f->if_bytes));
+	kdb_printf("real_bytes %s lastex 0x%x u1:%s 0x%p\n",
+		xfs_fmtsize(f->if_real_bytes), f->if_lastex,
+		f->if_flags & XFS_IFINLINE ? "data" : "extents",
+		f->if_flags & XFS_IFINLINE ?
+			f->if_u1.if_data :
+			(char *)f->if_u1.if_extents);
+	kdb_printf(" broot 0x%p broot_bytes %s ext_max %d ",
+		f->if_broot, xfs_fmtsize(f->if_broot_bytes), f->if_ext_max);
+	printflags(f->if_flags, tab_flags, "flags");
+	kdb_printf("\n");
+	kdb_printf(" u2");
+	for (p = (int *)&f->if_u2;
+	     p < (int *)((char *)&f->if_u2 + XFS_INLINE_DATA);
+	     p++)
+		kdb_printf(" 0x%x", *p);
+	kdb_printf("\n");
+}
+
+/*
+ * Command-level xfs-idbg functions.
+ */
+
+/*
+ * Print xfs allocation group freespace header.
+ */
+static void
+xfsidbg_xagf(xfs_agf_t *agf)
+{
+	kdb_printf("magicnum 0x%x versionnum 0x%x seqno 0x%x length 0x%x\n",
+		INT_GET(agf->agf_magicnum, ARCH_CONVERT),
+		INT_GET(agf->agf_versionnum, ARCH_CONVERT),
+		INT_GET(agf->agf_seqno, ARCH_CONVERT),
+		INT_GET(agf->agf_length, ARCH_CONVERT));
+	kdb_printf("roots b 0x%x c 0x%x levels b %d c %d\n",
+		INT_GET(agf->agf_roots[XFS_BTNUM_BNO], ARCH_CONVERT),
+		INT_GET(agf->agf_roots[XFS_BTNUM_CNT], ARCH_CONVERT),
+		INT_GET(agf->agf_levels[XFS_BTNUM_BNO], ARCH_CONVERT),
+		INT_GET(agf->agf_levels[XFS_BTNUM_CNT], ARCH_CONVERT));
+	kdb_printf("flfirst %d fllast %d flcount %d freeblks %d longest %d\n",
+		INT_GET(agf->agf_flfirst, ARCH_CONVERT),
+		INT_GET(agf->agf_fllast, ARCH_CONVERT),
+		INT_GET(agf->agf_flcount, ARCH_CONVERT),
+		INT_GET(agf->agf_freeblks, ARCH_CONVERT),
+		INT_GET(agf->agf_longest, ARCH_CONVERT));
+}
+
+/*
+ * Print xfs allocation group inode header.
+ */
+static void
+xfsidbg_xagi(xfs_agi_t *agi)
+{
+	int	i;
+	int	j;
+
+	kdb_printf("magicnum 0x%x versionnum 0x%x seqno 0x%x length 0x%x\n",
+		INT_GET(agi->agi_magicnum, ARCH_CONVERT),
+		INT_GET(agi->agi_versionnum, ARCH_CONVERT),
+		INT_GET(agi->agi_seqno, ARCH_CONVERT),
+		INT_GET(agi->agi_length, ARCH_CONVERT));
+	kdb_printf("count 0x%x root 0x%x level 0x%x\n",
+		INT_GET(agi->agi_count, ARCH_CONVERT),
+		INT_GET(agi->agi_root, ARCH_CONVERT),
+		INT_GET(agi->agi_level, ARCH_CONVERT));
+	kdb_printf("freecount 0x%x newino 0x%x dirino 0x%x\n",
+		INT_GET(agi->agi_freecount, ARCH_CONVERT),
+		INT_GET(agi->agi_newino, ARCH_CONVERT),
+		INT_GET(agi->agi_dirino, ARCH_CONVERT));
+
+	kdb_printf("unlinked buckets\n");
+	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; ) {	/* i advances below */
+		for (j = 0; j < 4; j++, i++) {
+			kdb_printf("0x%08x ",
+				INT_GET(agi->agi_unlinked[i], ARCH_CONVERT));
+		}
+		kdb_printf("\n");
+	}
+}
+
+
+/*
+ * Print an allocation argument structure for XFS.
+ */
+static void
+xfsidbg_xalloc(xfs_alloc_arg_t *args)
+{
+	kdb_printf("tp 0x%p mp 0x%p agbp 0x%p pag 0x%p fsbno %s\n",
+		args->tp, args->mp, args->agbp, args->pag,
+		xfs_fmtfsblock(args->fsbno, args->mp));
+	kdb_printf("agno 0x%x agbno 0x%x minlen 0x%x maxlen 0x%x mod 0x%x\n",
+		args->agno, args->agbno, args->minlen, args->maxlen, args->mod);
+	kdb_printf("prod 0x%x minleft 0x%x total 0x%x alignment 0x%x\n",
+		args->prod, args->minleft, args->total, args->alignment);
+	kdb_printf("minalignslop 0x%x len 0x%x type %s otype %s wasdel %d\n",
+		args->minalignslop, args->len, xfs_alloctype[args->type],
+		xfs_alloctype[args->otype], args->wasdel);
+	kdb_printf("wasfromfl %d isfl %d userdata %d\n",
+		args->wasfromfl, args->isfl, args->userdata);
+}
+
+
+
+/*
+ * Print an attr_list() context structure.
+ */
+static void
+xfsidbg_xattrcontext(xfs_attr_list_context_t *context)
+{
+	static char *attr_arg_flags[] = {
+		"DONTFOLLOW",	/* 0x0001 */
+		"?",		/* 0x0002 */
+		"?",		/* 0x0004 */
+		"?",		/* 0x0008 */
+		"CREATE",	/* 0x0010 */
+		"?",		/* 0x0020 */
+		"?",		/* 0x0040 */
+		"?",		/* 0x0080 */
+		"?",		/* 0x0100 */
+		"?",		/* 0x0200 */
+		"?",		/* 0x0400 */
+		"?",		/* 0x0800 */
+		"KERNOTIME",	/* 0x1000 */
+		NULL
+	};
+
+	kdb_printf("dp 0x%p, dupcnt %d, resynch %d",
+		    context->dp, context->dupcnt, context->resynch);
+	printflags((__psunsigned_t)context->flags, attr_arg_flags, ", flags");
+	kdb_printf("\ncursor h/b/o 0x%x/0x%x/%d -- p/p/i 0x%x/0x%x/0x%x\n",
+			  context->cursor->hashval, context->cursor->blkno,
+			  context->cursor->offset, context->cursor->pad1,
+			  context->cursor->pad2, context->cursor->initted);
+	kdb_printf("alist 0x%p, bufsize 0x%x, count %d, firstu 0x%x\n",
+		       context->alist, context->bufsize, context->count,
+		       context->firstu);
+}
+
+/*
+ * Print attribute leaf block.
+ */
+static void
+xfsidbg_xattrleaf(xfs_attr_leafblock_t *leaf)
+{
+	xfs_attr_leaf_hdr_t *h;
+	xfs_da_blkinfo_t *i;
+	xfs_attr_leaf_map_t *m;
+	xfs_attr_leaf_entry_t *e;
+	xfs_attr_leaf_name_local_t *l;
+	xfs_attr_leaf_name_remote_t *r;
+	int j, k;
+
+	h = &leaf->hdr;
+	i = &h->info;
+	kdb_printf("hdr info forw 0x%x back 0x%x magic 0x%x\n",
+		i->forw, i->back, i->magic);
+	kdb_printf("hdr count %d usedbytes %d firstused %d holes %d\n",
+		INT_GET(h->count, ARCH_CONVERT),
+		INT_GET(h->usedbytes, ARCH_CONVERT),
+		INT_GET(h->firstused, ARCH_CONVERT), h->holes);
+	for (j = 0, m = h->freemap; j < XFS_ATTR_LEAF_MAPSIZE; j++, m++) {
+		kdb_printf("hdr freemap %d base %d size %d\n",
+			j, INT_GET(m->base, ARCH_CONVERT),
+			INT_GET(m->size, ARCH_CONVERT));
+	}
+	for (j = 0, e = leaf->entries; j < INT_GET(h->count, ARCH_CONVERT); j++, e++) {
+		kdb_printf("[%2d] hash 0x%x nameidx %d flags 0x%x",
+			j, INT_GET(e->hashval, ARCH_CONVERT),
+			INT_GET(e->nameidx, ARCH_CONVERT), e->flags);
+		if (e->flags & XFS_ATTR_LOCAL)
+			kdb_printf("LOCAL ");
+		if (e->flags & XFS_ATTR_ROOT)
+			kdb_printf("ROOT ");
+		if (e->flags & XFS_ATTR_INCOMPLETE)
+			kdb_printf("INCOMPLETE ");
+		k = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_INCOMPLETE);
+		if ((e->flags & k) != 0)
+			kdb_printf("0x%x", e->flags & k);
+		kdb_printf(">\n	    name \"");
+		if (e->flags & XFS_ATTR_LOCAL) {
+			l = XFS_ATTR_LEAF_NAME_LOCAL(leaf, j);
+			for (k = 0; k < l->namelen; k++)
+				kdb_printf("%c", l->nameval[k]);
+			kdb_printf("\"(%d) value \"", l->namelen);
+			for (k = 0; (k < INT_GET(l->valuelen, ARCH_CONVERT)) && (k < 32); k++)
+				kdb_printf("%c", l->nameval[l->namelen + k]);
+			if (k == 32)
+				kdb_printf("...");
+			kdb_printf("\"(%d)\n",
+				INT_GET(l->valuelen, ARCH_CONVERT));
+		} else {
+			r = XFS_ATTR_LEAF_NAME_REMOTE(leaf, j);
+			for (k = 0; k < r->namelen; k++)
+				kdb_printf("%c", r->name[k]);
+			kdb_printf("\"(%d) value blk 0x%x len %d\n",
+				    r->namelen,
+				    INT_GET(r->valueblk, ARCH_CONVERT),
+				    INT_GET(r->valuelen, ARCH_CONVERT));
+		}
+	}
+}
+
+/*
+ * Print a shortform attribute list.
+ */
+static void
+xfsidbg_xattrsf(xfs_attr_shortform_t *s)
+{
+	xfs_attr_sf_hdr_t *sfh;
+	xfs_attr_sf_entry_t *sfe;
+	int i, j;
+
+	sfh = &s->hdr;
+	kdb_printf("hdr count %d\n", INT_GET(sfh->count, ARCH_CONVERT));
+	for (i = 0, sfe = s->list; i < INT_GET(sfh->count, ARCH_CONVERT); i++) {
+		kdb_printf("entry %d namelen %d name \"", i, sfe->namelen);
+		for (j = 0; j < sfe->namelen; j++)
+			kdb_printf("%c", sfe->nameval[j]);
+		kdb_printf("\" valuelen %d value \"", INT_GET(sfe->valuelen, ARCH_CONVERT));
+		for (j = 0; (j < INT_GET(sfe->valuelen, ARCH_CONVERT)) && (j < 32); j++)
+			kdb_printf("%c", sfe->nameval[sfe->namelen + j]);
+		if (j == 32)
+			kdb_printf("...");
+		kdb_printf("\"\n");
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+	}
+}
+
+
+/*
+ * Print xfs bmap internal record
+ */
+static void
+xfsidbg_xbirec(xfs_bmbt_irec_t *r)
+{
+	kdb_printf(
+	"startoff %Ld startblock %Lx blockcount %Ld state %Ld\n",
+		(__uint64_t)r->br_startoff,
+		(__uint64_t)r->br_startblock,
+		(__uint64_t)r->br_blockcount,
+		(__uint64_t)r->br_state);
+}
+
+
+/*
+ * Print a bmap alloc argument structure for XFS.
+ */
+static void
+xfsidbg_xbmalla(xfs_bmalloca_t *a)
+{
+	kdb_printf("tp 0x%p ip 0x%p eof %d prevp 0x%p\n",
+		a->tp, a->ip, a->eof, a->prevp);
+	kdb_printf("gotp 0x%p firstblock %s alen %d total %d\n",
+		a->gotp, xfs_fmtfsblock(a->firstblock, a->ip->i_mount),
+		a->alen, a->total);
+	kdb_printf("off %s wasdel %d userdata %d minlen %d\n",
+		xfs_fmtfsblock(a->off, a->ip->i_mount), a->wasdel,
+		a->userdata, a->minlen);
+	kdb_printf("minleft %d low %d rval %s aeof %d\n",
+		a->minleft, a->low, xfs_fmtfsblock(a->rval, a->ip->i_mount),
+		a->aeof);
+}
+
+
+/*
+ * Print xfs bmap record
+ */
+static void
+xfsidbg_xbrec(xfs_bmbt_rec_64_t *r)
+{
+	xfs_dfiloff_t o;
+	xfs_dfsbno_t s;
+	xfs_dfilblks_t c;
+	int flag;
+
+	xfs_convert_extent(r, &o, &s, &c, &flag);
+	kdb_printf("startoff %Ld startblock %Lx blockcount %Ld flag %d\n",
+		o, s, c, flag);
+}
+
+/*
+ * Print an xfs in-inode bmap btree root (data fork).
+ */
+static void
+xfsidbg_xbroot(xfs_inode_t *ip)
+{
+	xfs_broot(ip, &ip->i_df);
+}
+
+/*
+ * Print an xfs in-inode bmap btree root (attribute fork).
+ */
+static void
+xfsidbg_xbroota(xfs_inode_t *ip)
+{
+	if (ip->i_afp)
+		xfs_broot(ip, ip->i_afp);
+}
+
+/*
+ * Print xfs btree cursor.
+ */
+static void
+xfsidbg_xbtcur(xfs_btree_cur_t *c)
+{
+	int l;
+
+	kdb_printf("tp 0x%p mp 0x%p\n",
+		c->bc_tp,
+		c->bc_mp);
+	if (c->bc_btnum == XFS_BTNUM_BMAP) {
+		kdb_printf("rec.b ");
+		xfsidbg_xbirec(&c->bc_rec.b);
+	} else if (c->bc_btnum == XFS_BTNUM_INO) {
+		kdb_printf("rec.i startino 0x%x freecount 0x%x free %Lx\n",
+			c->bc_rec.i.ir_startino, c->bc_rec.i.ir_freecount,
+			c->bc_rec.i.ir_free);
+	} else {
+		kdb_printf("rec.a startblock 0x%x blockcount 0x%x\n",
+			c->bc_rec.a.ar_startblock,
+			c->bc_rec.a.ar_blockcount);
+	}
+	kdb_printf("bufs");
+	for (l = 0; l < c->bc_nlevels; l++)
+		kdb_printf(" 0x%p", c->bc_bufs[l]);
+	kdb_printf("\n");
+	kdb_printf("ptrs");
+	for (l = 0; l < c->bc_nlevels; l++)
+		kdb_printf(" 0x%x", c->bc_ptrs[l]);
+	kdb_printf("  ra");
+	for (l = 0; l < c->bc_nlevels; l++)
+		kdb_printf(" %d", c->bc_ra[l]);
+	kdb_printf("\n");
+	kdb_printf("nlevels %d btnum %s blocklog %d\n",
+		c->bc_nlevels,
+		c->bc_btnum == XFS_BTNUM_BNO ? "bno" :
+		(c->bc_btnum == XFS_BTNUM_CNT ? "cnt" :
+		 (c->bc_btnum == XFS_BTNUM_BMAP ? "bmap" : "ino")),
+		c->bc_blocklog);
+	if (c->bc_btnum == XFS_BTNUM_BMAP) {
+		kdb_printf("private forksize 0x%x whichfork %d ip 0x%p flags %d\n",
+			c->bc_private.b.forksize,
+			c->bc_private.b.whichfork,
+			c->bc_private.b.ip,
+			c->bc_private.b.flags);
+		kdb_printf("private firstblock %s flist 0x%p allocated 0x%x\n",
+			xfs_fmtfsblock(c->bc_private.b.firstblock, c->bc_mp),
+			c->bc_private.b.flist,
+			c->bc_private.b.allocated);
+	} else if (c->bc_btnum == XFS_BTNUM_INO) {
+		kdb_printf("private agbp 0x%p agno 0x%x\n",
+			c->bc_private.i.agbp,
+			c->bc_private.i.agno);
+	} else {
+		kdb_printf("private agbp 0x%p agno 0x%x\n",
+			c->bc_private.a.agbp,
+			c->bc_private.a.agno);
+	}
+}
+
+/*
+ * Figure out what kind of xfs block the buffer contains,
+ * and invoke a print routine.
+ */
+static void
+xfsidbg_xbuf(xfs_buf_t *bp)
+{
+	xfsidbg_xbuf_real(bp, 0);
+}
+
+/*
+ * Figure out what kind of xfs block the buffer contains,
+ * and invoke a print routine (if asked to).
+ */
+static void
+xfsidbg_xbuf_real(xfs_buf_t *bp, int summary)
+{
+	void *d;
+	xfs_agf_t *agf;
+	xfs_agi_t *agi;
+	xfs_sb_t *sb;
+	xfs_alloc_block_t *bta;
+	xfs_bmbt_block_t *btb;
+	xfs_inobt_block_t *bti;
+	xfs_attr_leafblock_t *aleaf;
+	xfs_dir_leafblock_t *dleaf;
+	xfs_da_intnode_t *node;
+	xfs_dinode_t *di;
+	xfs_disk_dquot_t *dqb;
+	xfs_dir2_block_t *d2block;
+	xfs_dir2_data_t *d2data;
+	xfs_dir2_leaf_t *d2leaf;
+	xfs_dir2_free_t *d2free;
+
+	d = XFS_BUF_PTR(bp);
+	if (INT_GET((agf = d)->agf_magicnum, ARCH_CONVERT) == XFS_AGF_MAGIC) {
+		if (summary) {
+			kdb_printf("freespace hdr for AG %d (at 0x%p)\n",
+				INT_GET(agf->agf_seqno, ARCH_CONVERT), agf);
+		} else {
+			kdb_printf("buf 0x%p agf 0x%p\n", bp, agf);
+			xfsidbg_xagf(agf);
+		}
+	} else if (INT_GET((agi = d)->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC) {
+		if (summary) {
+			kdb_printf("Inode hdr for AG %d (at 0x%p)\n",
+			       INT_GET(agi->agi_seqno, ARCH_CONVERT), agi);
+		} else {
+			kdb_printf("buf 0x%p agi 0x%p\n", bp, agi);
+			xfsidbg_xagi(agi);
+		}
+	} else if (INT_GET((bta = d)->bb_magic, ARCH_CONVERT) == XFS_ABTB_MAGIC) {
+		if (summary) {
+			kdb_printf("Alloc BNO Btree blk, level %d (at 0x%p)\n",
+				       INT_GET(bta->bb_level, ARCH_CONVERT), bta);
+		} else {
+			kdb_printf("buf 0x%p abtbno 0x%p\n", bp, bta);
+			xfs_btalloc(bta, XFS_BUF_COUNT(bp));
+		}
+	} else if (INT_GET((bta = d)->bb_magic, ARCH_CONVERT) == XFS_ABTC_MAGIC) {
+		if (summary) {
+			kdb_printf("Alloc COUNT Btree blk, level %d (at 0x%p)\n",
+				       INT_GET(bta->bb_level, ARCH_CONVERT), bta);
+		} else {
+			kdb_printf("buf 0x%p abtcnt 0x%p\n", bp, bta);
+			xfs_btalloc(bta, XFS_BUF_COUNT(bp));
+		}
+	} else if (INT_GET((btb = d)->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC) {
+		if (summary) {
+			kdb_printf("Bmap Btree blk, level %d (at 0x%p)\n",
+				      INT_GET(btb->bb_level, ARCH_CONVERT), btb);
+		} else {
+			kdb_printf("buf 0x%p bmapbt 0x%p\n", bp, btb);
+			xfs_btbmap(btb, XFS_BUF_COUNT(bp));
+		}
+	} else if (INT_GET((bti = d)->bb_magic, ARCH_CONVERT) == XFS_IBT_MAGIC) {
+		if (summary) {
+			kdb_printf("Inode Btree blk, level %d (at 0x%p)\n",
+				       INT_GET(bti->bb_level, ARCH_CONVERT), bti);
+		} else {
+			kdb_printf("buf 0x%p inobt 0x%p\n", bp, bti);
+			xfs_btino(bti, XFS_BUF_COUNT(bp));
+		}
+	} else if (INT_GET((aleaf = d)->hdr.info.magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
+		if (summary) {
+			kdb_printf("Attr Leaf, 1st hash 0x%x (at 0x%p)\n",
+				      INT_GET(aleaf->entries[0].hashval, ARCH_CONVERT), aleaf);
+		} else {
+			kdb_printf("buf 0x%p attr leaf 0x%p\n", bp, aleaf);
+			xfsidbg_xattrleaf(aleaf);
+		}
+	} else if (INT_GET((dleaf = d)->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
+		if (summary) {
+			kdb_printf("Dir Leaf, 1st hash 0x%x (at 0x%p)\n",
+				     dleaf->entries[0].hashval, dleaf);
+		} else {
+			kdb_printf("buf 0x%p dir leaf 0x%p\n", bp, dleaf);
+			xfsidbg_xdirleaf(dleaf);
+		}
+	} else if (INT_GET((node = d)->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+		if (summary) {
+			kdb_printf("Dir/Attr Node, level %d, 1st hash 0x%x (at 0x%p)\n",
+			      node->hdr.level, node->btree[0].hashval, node);
+		} else {
+			kdb_printf("buf 0x%p dir/attr node 0x%p\n", bp, node);
+			xfsidbg_xdanode(node);
+		}
+	} else if (INT_GET((di = d)->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC) {
+		if (summary) {
+			kdb_printf("Disk Inode (at 0x%p)\n", di);
+		} else {
+			kdb_printf("buf 0x%p dinode 0x%p\n", bp, di);
+			xfs_inodebuf(bp);
+		}
+	} else if (INT_GET((sb = d)->sb_magicnum, ARCH_CONVERT) == XFS_SB_MAGIC) {
+		if (summary) {
+			kdb_printf("Superblock (at 0x%p)\n", sb);
+		} else {
+			kdb_printf("buf 0x%p sb 0x%p\n", bp, sb);
+			/* SB in a buffer - we need to convert */
+			xfsidbg_xsb(sb, 1);
+		}
+	} else if (INT_GET((dqb = d)->d_magic, ARCH_CONVERT) == XFS_DQUOT_MAGIC) {
+#define XFSIDBG_DQTYPESTR(d)	 \
+	((INT_GET((d)->d_flags, ARCH_CONVERT) & XFS_DQ_USER) ? "USR" : \
+	((INT_GET((d)->d_flags, ARCH_CONVERT) & XFS_DQ_GROUP) ? "GRP" : "???"))
+		kdb_printf("Quota blk starting ID [%d], type %s at 0x%p\n",
+			INT_GET(dqb->d_id, ARCH_CONVERT), XFSIDBG_DQTYPESTR(dqb), dqb);
+
+	} else if (INT_GET((d2block = d)->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
+		if (summary) {
+			kdb_printf("Dir2 block (at 0x%p)\n", d2block);
+		} else {
+			kdb_printf("buf 0x%p dir2 block 0x%p\n", bp, d2block);
+			xfs_dir2data((void *)d2block, XFS_BUF_COUNT(bp));
+		}
+	} else if (INT_GET((d2data = d)->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC) {
+		if (summary) {
+			kdb_printf("Dir2 data (at 0x%p)\n", d2data);
+		} else {
+			kdb_printf("buf 0x%p dir2 data 0x%p\n", bp, d2data);
+			xfs_dir2data((void *)d2data, XFS_BUF_COUNT(bp));
+		}
+	} else if (INT_GET((d2leaf = d)->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC) {
+		if (summary) {
+			kdb_printf("Dir2 leaf(1) (at 0x%p)\n", d2leaf);
+		} else {
+			kdb_printf("buf 0x%p dir2 leaf 0x%p\n", bp, d2leaf);
+			xfs_dir2leaf(d2leaf, XFS_BUF_COUNT(bp));
+		}
+	} else if (INT_GET(d2leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
+		if (summary) {
+			kdb_printf("Dir2 leaf(n) (at 0x%p)\n", d2leaf);
+		} else {
+			kdb_printf("buf 0x%p dir2 leaf 0x%p\n", bp, d2leaf);
+			xfs_dir2leaf(d2leaf, XFS_BUF_COUNT(bp));
+		}
+	} else if (INT_GET((d2free = d)->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC) {
+		if (summary) {
+			kdb_printf("Dir2 free (at 0x%p)\n", d2free);
+		} else {
+			kdb_printf("buf 0x%p dir2 free 0x%p\n", bp, d2free);
+			xfsidbg_xdir2free(d2free);
+		}
+	} else {
+		kdb_printf("buf 0x%p unknown 0x%p\n", bp, d);
+	}
+}
+
+
+/*
+ * Print an xfs_da_args structure.
+ */
+static void
+xfsidbg_xdaargs(xfs_da_args_t *n)
+{
+	char *ch;
+	int i;
+
+	kdb_printf(" name \"");
+	for (i = 0; i < n->namelen; i++) {
+		kdb_printf("%c", n->name[i]);
+	}
+	kdb_printf("\"(%d) value ", n->namelen);
+	if (n->value) {
+		kdb_printf("\"");
+		ch = n->value;
+		for (i = 0; (i < n->valuelen) && (i < 32); ch++, i++) {
+			switch(*ch) {
+			case '\n':	kdb_printf("\\n");	break;
+			case '\b':	kdb_printf("\\b");	break;
+			case '\t':	kdb_printf("\\t");	break;
+			default:	kdb_printf("%c", *ch);	break;
+			}
+		}
+		if (i == 32)
+			kdb_printf("...");
+		kdb_printf("\"(%d)\n", n->valuelen);
+	} else {
+		kdb_printf("(NULL)(%d)\n", n->valuelen);
+	}
+	kdb_printf(" hashval 0x%x whichfork %d flags <",
+		  (uint_t)n->hashval, n->whichfork);
+	if (n->flags & ATTR_ROOT)
+		kdb_printf("ROOT ");
+	if (n->flags & ATTR_CREATE)
+		kdb_printf("CREATE ");
+	if (n->flags & ATTR_REPLACE)
+		kdb_printf("REPLACE ");
+	if (n->flags & XFS_ATTR_INCOMPLETE)
+		kdb_printf("INCOMPLETE ");
+	i = ~(ATTR_ROOT | ATTR_CREATE | ATTR_REPLACE | XFS_ATTR_INCOMPLETE);
+	if ((n->flags & i) != 0)
+		kdb_printf("0x%x", n->flags & i);
+	kdb_printf(">\n");
+	kdb_printf(" rename %d justcheck %d addname %d oknoent %d\n",
+		  n->rename, n->justcheck, n->addname, n->oknoent);
+	kdb_printf(" leaf: blkno %d index %d rmtblkno %d rmtblkcnt %d\n",
+		  n->blkno, n->index, n->rmtblkno, n->rmtblkcnt);
+	kdb_printf(" leaf2: blkno %d index %d rmtblkno %d rmtblkcnt %d\n",
+		  n->blkno2, n->index2, n->rmtblkno2, n->rmtblkcnt2);
+	kdb_printf(" inumber %llu dp 0x%p firstblock 0x%p flist 0x%p\n",
+		  (unsigned long long) n->inumber,
+		  n->dp, n->firstblock, n->flist);
+	kdb_printf(" trans 0x%p total %d\n",
+		  n->trans, n->total);
+}
+
+/*
+ * Print a da buffer structure.
+ */
+static void
+xfsidbg_xdabuf(xfs_dabuf_t *dabuf)
+{
+	int	i;
+
+	kdb_printf("nbuf %d dirty %d bbcount %d data 0x%p bps",
+		dabuf->nbuf, dabuf->dirty, dabuf->bbcount, dabuf->data);
+	for (i = 0; i < dabuf->nbuf; i++)
+		kdb_printf(" %d:0x%p", i, dabuf->bps[i]);
+	kdb_printf("\n");
+#ifdef XFS_DABUF_DEBUG
+	kdb_printf(" ra 0x%x prev 0x%x next 0x%x dev 0x%x blkno 0x%x\n",
+		dabuf->ra, dabuf->prev, dabuf->next, dabuf->dev, dabuf->blkno);
+#endif
+}
+
+/*
+ * Print a directory/attribute internal node block.
+ */
+static void
+xfsidbg_xdanode(xfs_da_intnode_t *node)
+{
+	xfs_da_node_hdr_t *h;
+	xfs_da_blkinfo_t *i;
+	xfs_da_node_entry_t *e;
+	int j;
+
+	h = &node->hdr;
+	i = &h->info;
+	kdb_printf("hdr info forw 0x%x back 0x%x magic 0x%x\n",
+		INT_GET(i->forw, ARCH_CONVERT), INT_GET(i->back, ARCH_CONVERT), INT_GET(i->magic, ARCH_CONVERT));
+	kdb_printf("hdr count %d level %d\n",
+		INT_GET(h->count, ARCH_CONVERT), INT_GET(h->level, ARCH_CONVERT));
+	for (j = 0, e = node->btree; j < INT_GET(h->count, ARCH_CONVERT); j++, e++) {
+		kdb_printf("btree %d hashval 0x%x before 0x%x\n",
+			j, (uint_t)INT_GET(e->hashval, ARCH_CONVERT), INT_GET(e->before, ARCH_CONVERT));
+	}
+}
+
+/*
+ * Print an xfs_da_state_blk structure.
+ */
+static void
+xfsidbg_xdastate(xfs_da_state_t *s)
+{
+	xfs_da_state_blk_t *eblk;
+
+	kdb_printf("args 0x%p mp 0x%p blocksize %d inleaf %d\n",
+		s->args, s->mp, s->blocksize, s->inleaf);
+	if (s->args)
+		xfsidbg_xdaargs(s->args);
+
+	kdb_printf("path:  ");
+	xfs_dastate_path(&s->path);
+
+	kdb_printf("altpath:  ");
+	xfs_dastate_path(&s->altpath);
+
+	eblk = &s->extrablk;
+	kdb_printf("extra: valid %d, after %d\n", s->extravalid, s->extraafter);
+	kdb_printf(" bp 0x%p blkno 0x%x ", eblk->bp, eblk->blkno);
+	kdb_printf("index %d hashval 0x%x\n", eblk->index, (uint_t)eblk->hashval);
+}
+
+/*
+ * Print a directory leaf block.
+ */
+static void
+xfsidbg_xdirleaf(xfs_dir_leafblock_t *leaf)
+{
+	xfs_dir_leaf_hdr_t *h;
+	xfs_da_blkinfo_t *i;
+	xfs_dir_leaf_map_t *m;
+	xfs_dir_leaf_entry_t *e;
+	xfs_dir_leaf_name_t *n;
+	int j, k;
+	xfs_ino_t ino;
+
+	h = &leaf->hdr;
+	i = &h->info;
+	kdb_printf("hdr info forw 0x%x back 0x%x magic 0x%x\n",
+		INT_GET(i->forw, ARCH_CONVERT), INT_GET(i->back, ARCH_CONVERT), INT_GET(i->magic, ARCH_CONVERT));
+	kdb_printf("hdr count %d namebytes %d firstused %d holes %d\n",
+		INT_GET(h->count, ARCH_CONVERT), INT_GET(h->namebytes, ARCH_CONVERT), INT_GET(h->firstused, ARCH_CONVERT), h->holes);
+	for (j = 0, m = h->freemap; j < XFS_DIR_LEAF_MAPSIZE; j++, m++) {
+		kdb_printf("hdr freemap %d base %d size %d\n",
+			j, INT_GET(m->base, ARCH_CONVERT), INT_GET(m->size, ARCH_CONVERT));
+	}
+	for (j = 0, e = leaf->entries; j < INT_GET(h->count, ARCH_CONVERT); j++, e++) {
+		n = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(e->nameidx, ARCH_CONVERT));
+		XFS_DIR_SF_GET_DIRINO_ARCH(&n->inumber, &ino, ARCH_CONVERT);
+		kdb_printf("leaf %d hashval 0x%x nameidx %d inumber %llu ",
+			j, (uint_t)INT_GET(e->hashval, ARCH_CONVERT),
+			INT_GET(e->nameidx, ARCH_CONVERT),
+			(unsigned long long)ino);
+		kdb_printf("namelen %d name \"", e->namelen);
+		for (k = 0; k < e->namelen; k++)
+			kdb_printf("%c", n->name[k]);
+		kdb_printf("\"\n");
+	}
+}
+
+/*
+ * Print a directory v2 data block, single or multiple.
+ */
+static void
+xfs_dir2data(void *addr, int size)
+{
+	xfs_dir2_data_t *db;
+	xfs_dir2_block_t *bb;
+	xfs_dir2_data_hdr_t *h;
+	xfs_dir2_data_free_t *m;
+	xfs_dir2_data_entry_t *e;
+	xfs_dir2_data_unused_t *u;
+	xfs_dir2_leaf_entry_t *l=NULL;
+	int j, k;
+	char *p;
+	char *t;
+	xfs_dir2_block_tail_t *tail=NULL;
+
+	db = (xfs_dir2_data_t *)addr;
+	bb = (xfs_dir2_block_t *)addr;
+	h = &db->hdr;
+	kdb_printf("hdr magic 0x%x (%s)\nhdr bestfree", INT_GET(h->magic, ARCH_CONVERT),
+		INT_GET(h->magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ? "DATA" :
+			(INT_GET(h->magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC ? "BLOCK" : ""));
+	for (j = 0, m = h->bestfree; j < XFS_DIR2_DATA_FD_COUNT; j++, m++) {
+		kdb_printf(" %d: 0x%x@0x%x", j, INT_GET(m->length, ARCH_CONVERT), INT_GET(m->offset, ARCH_CONVERT));
+	}
+	kdb_printf("\n");
+	if (INT_GET(h->magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC)
+		t = (char *)db + size;
+	else {
+		/* XFS_DIR2_BLOCK_TAIL_P */
+		tail = (xfs_dir2_block_tail_t *)
+		       ((char *)bb + size - sizeof(xfs_dir2_block_tail_t));
+		l = XFS_DIR2_BLOCK_LEAF_P_ARCH(tail, ARCH_CONVERT);
+		t = (char *)l;
+	}
+	for (p = (char *)(h + 1); p < t; ) {
+		u = (xfs_dir2_data_unused_t *)p;
+		if (u->freetag == XFS_DIR2_DATA_FREE_TAG) {
+			kdb_printf("0x%lx unused freetag 0x%x length 0x%x tag 0x%x\n",
+				(unsigned long) (p - (char *)addr),
+				INT_GET(u->freetag, ARCH_CONVERT),
+				INT_GET(u->length, ARCH_CONVERT),
+				INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(u, ARCH_CONVERT), ARCH_CONVERT));
+			p += INT_GET(u->length, ARCH_CONVERT);
+			continue;
+		}
+		e = (xfs_dir2_data_entry_t *)p;
+		kdb_printf("0x%lx entry inumber %llu namelen %d name \"",
+			(unsigned long) (p - (char *)addr),
+			(unsigned long long) INT_GET(e->inumber, ARCH_CONVERT),
+			e->namelen);
+		for (k = 0; k < e->namelen; k++)
+			kdb_printf("%c", e->name[k]);
+		kdb_printf("\" tag 0x%x\n", INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(e), ARCH_CONVERT));
+		p += XFS_DIR2_DATA_ENTSIZE(e->namelen);
+	}
+	if (INT_GET(h->magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC)
+		return;
+	for (j = 0; j < INT_GET(tail->count, ARCH_CONVERT); j++, l++) {
+		kdb_printf("0x%lx leaf %d hashval 0x%x address 0x%x (byte 0x%x)\n",
+			(unsigned long) ((char *)l - (char *)addr), j,
+			(uint_t)INT_GET(l->hashval, ARCH_CONVERT),
+			INT_GET(l->address, ARCH_CONVERT),
+			/* XFS_DIR2_DATAPTR_TO_BYTE */
+			INT_GET(l->address, ARCH_CONVERT) << XFS_DIR2_DATA_ALIGN_LOG);
+	}
+	kdb_printf("0x%lx tail count %d\n",
+		(unsigned long) ((char *)tail - (char *)addr),
+		INT_GET(tail->count, ARCH_CONVERT));
+}
+
+static void
+xfs_dir2leaf(xfs_dir2_leaf_t *leaf, int size)
+{
+	xfs_dir2_leaf_hdr_t *h;
+	xfs_da_blkinfo_t *i;
+	xfs_dir2_leaf_entry_t *e;
+	xfs_dir2_data_off_t *b;
+	xfs_dir2_leaf_tail_t *t;
+	int j;
+
+	h = &leaf->hdr;
+	i = &h->info;
+	e = leaf->ents;
+	kdb_printf("hdr info forw 0x%x back 0x%x magic 0x%x\n",
+		INT_GET(i->forw, ARCH_CONVERT), INT_GET(i->back, ARCH_CONVERT), INT_GET(i->magic, ARCH_CONVERT));
+	kdb_printf("hdr count %d stale %d\n", INT_GET(h->count, ARCH_CONVERT), INT_GET(h->stale, ARCH_CONVERT));
+	for (j = 0; j < INT_GET(h->count, ARCH_CONVERT); j++, e++) {
+		kdb_printf("0x%lx ent %d hashval 0x%x address 0x%x (byte 0x%x)\n",
+			(unsigned long) ((char *)e - (char *)leaf), j,
+			(uint_t)INT_GET(e->hashval, ARCH_CONVERT),
+			INT_GET(e->address, ARCH_CONVERT),
+			/* XFS_DIR2_DATAPTR_TO_BYTE */
+			INT_GET(e->address, ARCH_CONVERT) << XFS_DIR2_DATA_ALIGN_LOG);
+	}
+	if (INT_GET(i->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC)
+		return;
+	/* XFS_DIR2_LEAF_TAIL_P */
+	t = (xfs_dir2_leaf_tail_t *)((char *)leaf + size - sizeof(*t));
+	b = XFS_DIR2_LEAF_BESTS_P_ARCH(t, ARCH_CONVERT);
+	for (j = 0; j < INT_GET(t->bestcount, ARCH_CONVERT); j++, b++) {
+		kdb_printf("0x%lx best %d 0x%x\n",
+			(unsigned long) ((char *)b - (char *)leaf), j,
+			INT_GET(*b, ARCH_CONVERT));
+	}
+	kdb_printf("tail bestcount %d\n", INT_GET(t->bestcount, ARCH_CONVERT));
+}
+
+/*
+ * Print a shortform directory.
+ */
+static void
+xfsidbg_xdirsf(xfs_dir_shortform_t *s)
+{
+	xfs_dir_sf_hdr_t *sfh;
+	xfs_dir_sf_entry_t *sfe;
+	xfs_ino_t ino;
+	int i, j;
+
+	sfh = &s->hdr;
+	XFS_DIR_SF_GET_DIRINO_ARCH(&sfh->parent, &ino, ARCH_CONVERT);
+	kdb_printf("hdr parent %llu", (unsigned long long)ino);
+	kdb_printf(" count %d\n", sfh->count);
+	for (i = 0, sfe = s->list; i < sfh->count; i++) {
+		XFS_DIR_SF_GET_DIRINO_ARCH(&sfe->inumber, &ino, ARCH_CONVERT);
+		kdb_printf("entry %d inumber %llu", i, (unsigned long long)ino);
+		kdb_printf(" namelen %d name \"", sfe->namelen);
+		for (j = 0; j < sfe->namelen; j++)
+			kdb_printf("%c", sfe->name[j]);
+		kdb_printf("\"\n");
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+}
+
+/*
+ * Print a shortform v2 directory.
+ */
+static void
+xfsidbg_xdir2sf(xfs_dir2_sf_t *s)
+{
+	xfs_dir2_sf_hdr_t *sfh;
+	xfs_dir2_sf_entry_t *sfe;
+	xfs_ino_t ino;
+	int i, j;
+
+	sfh = &s->hdr;
+	ino = XFS_DIR2_SF_GET_INUMBER_ARCH(s, &sfh->parent, ARCH_CONVERT);
+	kdb_printf("hdr count %d i8count %d parent %llu\n",
+		sfh->count, sfh->i8count, (unsigned long long) ino);
+	for (i = 0, sfe = XFS_DIR2_SF_FIRSTENTRY(s); i < sfh->count; i++) {
+		ino = XFS_DIR2_SF_GET_INUMBER_ARCH(s, XFS_DIR2_SF_INUMBERP(sfe), ARCH_CONVERT);
+		kdb_printf("entry %d inumber %llu offset 0x%x namelen %d name \"",
+			i, (unsigned long long) ino,
+			XFS_DIR2_SF_GET_OFFSET_ARCH(sfe, ARCH_CONVERT),
+			sfe->namelen);
+		for (j = 0; j < sfe->namelen; j++)
+			kdb_printf("%c", sfe->name[j]);
+		kdb_printf("\"\n");
+		sfe = XFS_DIR2_SF_NEXTENTRY(s, sfe);
+	}
+}
+
+/*
+ * Print a node-form v2 directory freemap block.
+ */
+static void
+xfsidbg_xdir2free(xfs_dir2_free_t *f)
+{
+	int	i;
+
+	kdb_printf("hdr magic 0x%x firstdb %d nvalid %d nused %d\n",
+		INT_GET(f->hdr.magic, ARCH_CONVERT), INT_GET(f->hdr.firstdb, ARCH_CONVERT), INT_GET(f->hdr.nvalid, ARCH_CONVERT), INT_GET(f->hdr.nused, ARCH_CONVERT));
+	for (i = 0; i < INT_GET(f->hdr.nvalid, ARCH_CONVERT); i++) {
+		kdb_printf("entry %d db %d count %d\n",
+			i, i + INT_GET(f->hdr.firstdb, ARCH_CONVERT), INT_GET(f->bests[i], ARCH_CONVERT));
+	}
+}
+
+
+/*
+ * Print xfs extent list.
+ */
+static void
+xfsidbg_xexlist(xfs_inode_t *ip)
+{
+	xfs_xexlist_fork(ip, XFS_DATA_FORK);
+	if (XFS_IFORK_Q(ip))
+		xfs_xexlist_fork(ip, XFS_ATTR_FORK);
+}
+
+/*
+ * Print an xfs free-extent list.
+ */
+static void
+xfsidbg_xflist(xfs_bmap_free_t *flist)
+{
+	xfs_bmap_free_item_t	*item;
+
+	kdb_printf("flist@0x%p: first 0x%p count %d low %d\n", flist,
+		flist->xbf_first, flist->xbf_count, flist->xbf_low);
+	for (item = flist->xbf_first; item; item = item->xbfi_next) {
+		kdb_printf("item@0x%p: startblock %Lx blockcount %d\n", item,
+			(xfs_dfsbno_t)item->xbfi_startblock,
+			item->xbfi_blockcount);
+	}
+}
+
+/*
+ * Print out the help messages for these functions.
+ */
+static void
+xfsidbg_xhelp(void)
+{
+	struct xif	*p;
+
+	for (p = xfsidbg_funcs; p->name; p++)
+		kdb_printf("%-16s %s %s\n", p->name, p->args, p->help);
+}
+
+/*
+ * Print out an XFS in-core log structure.
+ */
+static void
+xfsidbg_xiclog(xlog_in_core_t *iclog)
+{
+	int i;
+	static char *ic_flags[] = {
+		"ACTIVE",	/* 0x0001 */
+		"WANT_SYNC",	/* 0x0002 */
+		"SYNCING",	/* 0x0004 */
+		"DONE_SYNC",	/* 0x0008 */
+		"DO_CALLBACK",	/* 0x0010 */
+		"CALLBACK",	/* 0x0020 */
+		"DIRTY",	/* 0x0040 */
+		"IOERROR",	/* 0x0080 */
+		"NOTUSED",	/* 0x8000 */
+		0
+	};
+
+	kdb_printf("xlog_in_core/header at 0x%p/0x%p\n",
+		iclog, iclog->hic_data);
+	kdb_printf("magicno: %x	 cycle: %d  version: %d	 lsn: 0x%Lx\n",
+		INT_GET(iclog->ic_header.h_magicno, ARCH_CONVERT), INT_GET(iclog->ic_header.h_cycle, ARCH_CONVERT),
+		INT_GET(iclog->ic_header.h_version, ARCH_CONVERT), INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT));
+	kdb_printf("tail_lsn: 0x%Lx  len: %d  prev_block: %d  num_ops: %d\n",
+		INT_GET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT), INT_GET(iclog->ic_header.h_len, ARCH_CONVERT),
+		INT_GET(iclog->ic_header.h_prev_block, ARCH_CONVERT), INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT));
+	kdb_printf("cycle_data: ");
+	for (i=0; i<(iclog->ic_size>>BBSHIFT); i++) {
+		kdb_printf("%x	", INT_GET(iclog->ic_header.h_cycle_data[i], ARCH_CONVERT));
+	}
+	kdb_printf("\n");
+	kdb_printf("size: %d\n", INT_GET(iclog->ic_header.h_size, ARCH_CONVERT));
+	kdb_printf("\n");
+	kdb_printf("--------------------------------------------------\n");
+	kdb_printf("data: 0x%p	&forcesema: 0x%p  next: 0x%p bp: 0x%p\n",
+		iclog->ic_datap, &iclog->ic_forcesema, iclog->ic_next,
+		iclog->ic_bp);
+	kdb_printf("log: 0x%p  callb: 0x%p  callb_tail: 0x%p  roundoff: %d\n",
+		iclog->ic_log, iclog->ic_callback, iclog->ic_callback_tail,
+		iclog->ic_roundoff);
+	kdb_printf("size: %d (OFFSET: %d) refcnt: %d  bwritecnt: %d",
+		iclog->ic_size, iclog->ic_offset,
+		iclog->ic_refcnt, iclog->ic_bwritecnt);
+	if (iclog->ic_state & XLOG_STATE_ALL)
+		printflags(iclog->ic_state, ic_flags, "state:");
+	else
+		kdb_printf("state: ILLEGAL 0x%x", iclog->ic_state);
+	kdb_printf("\n");
+}	/* xfsidbg_xiclog */
+
+
+/*
+ * Print all incore logs.
+ */
+static void
+xfsidbg_xiclogall(xlog_in_core_t *iclog)
+{
+    xlog_in_core_t *first_iclog = iclog;
+
+    do {
+	xfsidbg_xiclog(iclog);
+	kdb_printf("=================================================\n");
+	iclog = iclog->ic_next;
+    } while (iclog != first_iclog);
+}	/* xfsidbg_xiclogall */
+
+/*
+ * Print out the callback structures attached to an iclog.
+ */
+static void
+xfsidbg_xiclogcb(xlog_in_core_t *iclog)
+{
+	xfs_log_callback_t	*cb;
+	kdb_symtab_t		 symtab;
+
+	for (cb = iclog->ic_callback; cb != NULL; cb = cb->cb_next) {
+
+		if (kdbnearsym((unsigned long)cb->cb_func, &symtab)) {
+			unsigned long offval;
+
+			offval = (unsigned long)cb->cb_func - symtab.sym_start;
+
+			if (offval)
+				kdb_printf("func = %s+0x%lx",
+							symtab.sym_name,
+							offval);
+			else
+				kdb_printf("func = %s", symtab.sym_name);
+		} else
+			kdb_printf("func = ?? 0x%p", (void *)cb->cb_func);
+
+		kdb_printf(" arg 0x%p next 0x%p\n", cb->cb_arg, cb->cb_next);
+	}
+}
+
+
+/*
+ * Print all of the inodes attached to the given mount structure.
+ */
+static void
+xfsidbg_xinodes(xfs_mount_t *mp)
+{
+	xfs_inode_t	*ip;
+
+	kdb_printf("xfs_mount at 0x%p\n", mp);
+	ip = mp->m_inodes;
+	if (ip != NULL) {
+		do {
+			if (ip->i_mount == NULL) {
+				ip = ip->i_mnext;
+				continue;
+			}
+			kdb_printf("\n");
+			xfsidbg_xnode(ip);
+			ip = ip->i_mnext;
+		} while (ip != mp->m_inodes);
+	}
+	kdb_printf("\nEnd of Inodes\n");
+}
+
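+/*
+ * Sum the delayed-allocation block counts over all in-core inodes
+ * of a mount.
+ */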
+static void
+xfsidbg_delayed_blocks(xfs_mount_t *mp)
+{
+	xfs_inode_t	*ip;
+	unsigned int	total = 0;
+	unsigned int	icount = 0;
+
+	ip = mp->m_inodes;
+	if (ip != NULL) {
+		do {
+			if (ip->i_mount == NULL) {
+				ip = ip->i_mnext;
+				continue;
+			}
+			if (ip->i_delayed_blks) {
+				total += ip->i_delayed_blks;
+				icount++;
+			}
+			ip = ip->i_mnext;
+		} while (ip != mp->m_inodes);
+	}
+	kdb_printf("delayed blocks total: %d in %d inodes\n", total, icount);
+}
+
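+/*
+ * Check that every in-core inode of a mount is quiesced and write
+ * locked, reporting any that are not.
+ */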
+static void
+xfsidbg_xinodes_quiesce(xfs_mount_t *mp)
+{
+	xfs_inode_t	*ip;
+
+	kdb_printf("xfs_mount at 0x%p\n", mp);
+	ip = mp->m_inodes;
+	if (ip != NULL) {
+		do {
+			if (ip->i_mount == NULL) {
+				ip = ip->i_mnext;
+				continue;
+			}
+			if (!(ip->i_flags & XFS_IQUIESCE)) {
+				kdb_printf("ip 0x%p not quiesced\n", ip);
+			} else if (!BHV_IS_WRITE_LOCKED(VN_BHV_HEAD(XFS_ITOV(ip)))) {
+				kdb_printf("ip 0x%p not write locked\n", ip);
+			}
+			ip = ip->i_mnext;
+		} while (ip != mp->m_inodes);
+	}
+	kdb_printf("\nEnd of Inodes\n");
+}
+
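+/*
+ * Return a printable name for a log covering state.
+ */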
+static char *
+xfsidbg_get_cstate(int state)
+{
+	switch(state) {
+	case  XLOG_STATE_COVER_IDLE:
+		return("idle");
+	case  XLOG_STATE_COVER_NEED:
+		return("need");
+	case  XLOG_STATE_COVER_DONE:
+		return("done");
+	case  XLOG_STATE_COVER_NEED2:
+		return("need2");
+	case  XLOG_STATE_COVER_DONE2:
+		return("done2");
+	default:
+		return("unknown");
+	}
+}
+
+/*
+ * Print out an XFS log structure.
+ */
+static void
+xfsidbg_xlog(xlog_t *log)
+{
+	int rbytes;
+	int wbytes;
+	static char *t_flags[] = {
+		"CHKSUM_MISMATCH",	/* 0x01 */
+		"ACTIVE_RECOVERY",	/* 0x02 */
+		"RECOVERY_NEEDED",	/* 0x04 */
+		"IO_ERROR",		/* 0x08 */
+		0
+	};
+
+	kdb_printf("xlog at 0x%p\n", log);
+	kdb_printf("&flushsm: 0x%p  tic_cnt: %d	 tic_tcnt: %d  \n",
+		&log->l_flushsema, log->l_ticket_cnt, log->l_ticket_tcnt);
+	kdb_printf("freelist: 0x%p  tail: 0x%p	ICLOG: 0x%p  \n",
+		log->l_freelist, log->l_tail, log->l_iclog);
+	kdb_printf("&icloglock: 0x%p  tail_lsn: %s  last_sync_lsn: %s \n",
+		&log->l_icloglock, xfs_fmtlsn(&log->l_tail_lsn),
+		xfs_fmtlsn(&log->l_last_sync_lsn));
+	kdb_printf("mp: 0x%p  xbuf: 0x%p  roundoff: %d	l_covered_state: %s \n",
+		log->l_mp, log->l_xbuf, log->l_roundoff,
+			xfsidbg_get_cstate(log->l_covered_state));
+	kdb_printf("flags: ");
+	printflags(log->l_flags, t_flags,"log");
+	kdb_printf("  dev: 0x%x logBBstart: %lld logsize: %d logBBsize: %d\n",
+		log->l_dev, (long long) log->l_logBBstart,
+		log->l_logsize,log->l_logBBsize);
+	kdb_printf("curr_cycle: %d  prev_cycle: %d  curr_block: %d  prev_block: %d\n",
+	     log->l_curr_cycle, log->l_prev_cycle, log->l_curr_block,
+	     log->l_prev_block);
+	kdb_printf("iclog_bak: 0x%p  iclog_size: 0x%x (%d)  num iclogs: %d\n",
+		log->l_iclog_bak, log->l_iclog_size, log->l_iclog_size,
+		log->l_iclog_bufs);
+	kdb_printf("l_iclog_hsize %d l_iclog_heads %d\n",
+		log->l_iclog_hsize, log->l_iclog_heads);
+	kdb_printf("&grant_lock: 0x%p  resHeadQ: 0x%p  wrHeadQ: 0x%p\n",
+		&log->l_grant_lock, log->l_reserve_headq, log->l_write_headq);
+	kdb_printf("GResCycle: %d  GResBytes: %d  GWrCycle: %d	GWrBytes: %d\n",
+		log->l_grant_reserve_cycle, log->l_grant_reserve_bytes,
+		log->l_grant_write_cycle, log->l_grant_write_bytes);
+	rbytes = log->l_grant_reserve_bytes + log->l_roundoff;
+	wbytes = log->l_grant_write_bytes + log->l_roundoff;
+	kdb_printf("GResBlocks: %d  GResRemain: %d  GWrBlocks: %d  GWrRemain: %d\n",
+		rbytes / BBSIZE, rbytes % BBSIZE,
+		wbytes / BBSIZE, wbytes % BBSIZE);
+}	/* xfsidbg_xlog */
+
+
+/*
+ * Print out an XFS recovery item and its log regions
+ */
+static void
+xfsidbg_xlog_ritem(xlog_recover_item_t *item)
+{
+	int i = XLOG_MAX_REGIONS_IN_ITEM;
+
+	kdb_printf("(xlog_recover_item 0x%p) ", item);
+	kdb_printf("next: 0x%p prev: 0x%p type: %d cnt: %d ttl: %d\n",
+		item->ri_next, item->ri_prev, ITEM_TYPE(item), item->ri_cnt,
+		item->ri_total);
+	for ( ; i > 0; i--) {
+		if (!item->ri_buf[XLOG_MAX_REGIONS_IN_ITEM-i].i_addr)
+			break;
+		kdb_printf("a: 0x%p l: %d ",
+			item->ri_buf[XLOG_MAX_REGIONS_IN_ITEM-i].i_addr,
+			item->ri_buf[XLOG_MAX_REGIONS_IN_ITEM-i].i_len);
+	}
+	kdb_printf("\n");
+}	/* xfsidbg_xlog_ritem */
+
+/*
+ * Print out an XFS recovery transaction
+ */
+static void
+xfsidbg_xlog_rtrans(xlog_recover_t *trans)
+{
+	xlog_recover_item_t *rip, *first_rip;
+
+	kdb_printf("(xlog_recover 0x%p) ", trans);
+	kdb_printf("tid: %x type: %d items: %d ttid: 0x%x  ",
+		trans->r_log_tid, trans->r_theader.th_type,
+		trans->r_theader.th_num_items, trans->r_theader.th_tid);
+	kdb_printf("itemq: 0x%p\n", trans->r_itemq);
+	if (trans->r_itemq) {
+		rip = first_rip = trans->r_itemq;
+		do {
+			kdb_printf("(recovery item: 0x%p) ", rip);
+			kdb_printf("type: %d cnt: %d total: %d\n",
+				ITEM_TYPE(rip), rip->ri_cnt, rip->ri_total);
+			rip = rip->ri_next;
+		} while (rip != first_rip);
+	}
+}	/* xfsidbg_xlog_rtrans */
+
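+/*
+ * Print the regions of a recovery buf log item; for regular buffers,
+ * walk the dirty bitmap and dump the start of each logged region.
+ */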
+static void
+xfsidbg_xlog_buf_logitem(xlog_recover_item_t *item)
+{
+	xfs_buf_log_format_t	*buf_f;
+	int			i, j;
+	int			bit;
+	int			nbits;
+	unsigned int		*data_map;
+	unsigned int		map_size;
+	int			size;
+
+	buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
+	if (buf_f->blf_flags & XFS_BLI_INODE_BUF) {
+		kdb_printf("\tINODE BUF <blkno=0x%Lx, len=0x%x>\n",
+			buf_f->blf_blkno, buf_f->blf_len);
+	} else if (buf_f->blf_flags & (XFS_BLI_UDQUOT_BUF | XFS_BLI_GDQUOT_BUF)) {
+		kdb_printf("\tDQUOT BUF <blkno=0x%Lx, len=0x%x>\n",
+			buf_f->blf_blkno, buf_f->blf_len);
+	} else {
+		data_map = buf_f->blf_data_map;
+		map_size = buf_f->blf_map_size;
+		kdb_printf("\tREG BUF <blkno=0x%Lx, len=0x%x map 0x%p size %d>\n",
+			buf_f->blf_blkno, buf_f->blf_len, data_map, map_size);
+		bit = 0;
+		i = 0;  /* 0 is the buf format structure */
+		while (1) {
+			bit = xfs_next_bit(data_map, map_size, bit);
+			if (bit == -1)
+				break;
+			nbits = xfs_contig_bits(data_map, map_size, bit);
+			size = ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT);
+			kdb_printf("\t\tlogbuf.i_addr 0x%p, size 0x%x\n",
+				item->ri_buf[i].i_addr, size);
+			kdb_printf("\t\t\t\"");
+			for (j=0; j<8 && j<size; j++) {
+				kdb_printf("%02x", ((char *)item->ri_buf[i].i_addr)[j]);
+			}
+			kdb_printf("...\"\n");
+			i++;
+			bit += nbits;
+		}
+
+	}
+}
+
+/*
+ * Print out an ENTIRE XFS recovery transaction
+ */
+static void
+xfsidbg_xlog_rtrans_entire(xlog_recover_t *trans)
+{
+	xlog_recover_item_t *item, *first_rip;
+
+	kdb_printf("(Recovering Xact 0x%p) ", trans);
+	kdb_printf("tid: %x type: %d nitems: %d ttid: 0x%x  ",
+		trans->r_log_tid, trans->r_theader.th_type,
+		trans->r_theader.th_num_items, trans->r_theader.th_tid);
+	kdb_printf("itemq: 0x%p\n", trans->r_itemq);
+	if (trans->r_itemq) {
+		item = first_rip = trans->r_itemq;
+		do {
+			/*
+			   kdb_printf("(recovery item: 0x%x) ", item);
+			   kdb_printf("type: %d cnt: %d total: %d\n",
+				   item->ri_type, item->ri_cnt, item->ri_total);
+				   */
+			if ((ITEM_TYPE(item) == XFS_LI_BUF) ||
+			    (ITEM_TYPE(item) == XFS_LI_6_1_BUF) ||
+			    (ITEM_TYPE(item) == XFS_LI_5_3_BUF)) {
+				kdb_printf("BUF:");
+				xfsidbg_xlog_buf_logitem(item);
+			} else if ((ITEM_TYPE(item) == XFS_LI_INODE) ||
+				   (ITEM_TYPE(item) == XFS_LI_6_1_INODE) ||
+				   (ITEM_TYPE(item) == XFS_LI_5_3_INODE)) {
+				kdb_printf("INODE:\n");
+			} else if (ITEM_TYPE(item) == XFS_LI_EFI) {
+				kdb_printf("EFI:\n");
+			} else if (ITEM_TYPE(item) == XFS_LI_EFD) {
+				kdb_printf("EFD:\n");
+			} else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
+				kdb_printf("DQUOT:\n");
+			} else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
+				kdb_printf("QUOTAOFF:\n");
+			} else {
+				kdb_printf("UNKNOWN LOGITEM 0x%x\n", ITEM_TYPE(item));
+			}
+			item = item->ri_next;
+		} while (item != first_rip);
+	}
+}	/* xfsidbg_xlog_rtrans_entire */
+
+/*
+ * Print out an XFS ticket structure.
+ */
+static void
+xfsidbg_xlog_tic(xlog_ticket_t *tic)
+{
+	static char *t_flags[] = {
+		"INIT",		/* 0x1 */
+		"PERM_RES",	/* 0x2 */
+		"IN_Q",		/* 0x4 */
+		0
+	};
+
+	kdb_printf("xlog_ticket at 0x%p\n", tic);
+	kdb_printf("next: 0x%p	prev: 0x%p  tid: 0x%x  \n",
+		tic->t_next, tic->t_prev, tic->t_tid);
+	kdb_printf("curr_res: %d  unit_res: %d	ocnt: %d  cnt: %d\n",
+		tic->t_curr_res, tic->t_unit_res, (int)tic->t_ocnt,
+		(int)tic->t_cnt);
+	kdb_printf("clientid: %c  \n", tic->t_clientid);
+	printflags(tic->t_flags, t_flags,"ticket");
+	kdb_printf("\n");
+}	/* xfsidbg_xlog_tic */
+
+/*
+ * Print out a single log item.
+ */
+static void
+xfsidbg_xlogitem(xfs_log_item_t *lip)
+{
+	xfs_log_item_t	*bio_lip;
+	static char *lid_type[] = {
+		"???",		/* 0 */
+		"5-3-buf",	/* 1 */
+		"5-3-inode",	/* 2 */
+		"efi",		/* 3 */
+		"efd",		/* 4 */
+		"iunlink",	/* 5 */
+		"6-1-inode",	/* 6 */
+		"6-1-buf",	/* 7 */
+		"inode",	/* 8 */
+		"buf",		/* 9 */
+		"dquot",	/* 10 */
+		0
+		};
+	static char *li_flags[] = {
+		"in ail",	/* 0x1 */
+		0
+		};
+
+	kdb_printf("type %s mountp 0x%p flags ",
+		lid_type[lip->li_type - XFS_LI_5_3_BUF + 1],
+		lip->li_mountp);
+	printflags((uint)(lip->li_flags), li_flags,"log");
+	kdb_printf("\n");
+	kdb_printf("ail forw 0x%p ail back 0x%p lsn %s desc %p ops 0x%p\n",
+		lip->li_ail.ail_forw, lip->li_ail.ail_back,
+		xfs_fmtlsn(&(lip->li_lsn)), lip->li_desc, lip->li_ops);
+	kdb_printf("iodonefunc &0x%p\n", lip->li_cb);
+	if (lip->li_type == XFS_LI_BUF) {
+		bio_lip = lip->li_bio_list;
+		if (bio_lip != NULL) {
+			kdb_printf("iodone list:\n");
+		}
+		while (bio_lip != NULL) {
+			kdb_printf("item 0x%p func 0x%p\n",
+				bio_lip, bio_lip->li_cb);
+			bio_lip = bio_lip->li_bio_list;
+		}
+	}
+	switch (lip->li_type) {
+	case XFS_LI_BUF:
+		xfs_buf_item_print((xfs_buf_log_item_t *)lip, 0);
+		break;
+	case XFS_LI_INODE:
+		xfs_inode_item_print((xfs_inode_log_item_t *)lip, 0);
+		break;
+	case XFS_LI_EFI:
+		xfs_efi_item_print((xfs_efi_log_item_t *)lip, 0);
+		break;
+	case XFS_LI_EFD:
+		xfs_efd_item_print((xfs_efd_log_item_t *)lip, 0);
+		break;
+	case XFS_LI_DQUOT:
+		xfs_dquot_item_print((xfs_dq_logitem_t *)lip, 0);
+		break;
+	case XFS_LI_QUOTAOFF:
+		xfs_qoff_item_print((xfs_qoff_logitem_t *)lip, 0);
+		break;
+
+	default:
+		kdb_printf("Unknown item type %d\n", lip->li_type);
+		break;
+	}
+}
+
+/*
+ * Print out a summary of the AIL hanging off of a mount struct.
+ */
+static void
+xfsidbg_xaildump(xfs_mount_t *mp)
+{
+	xfs_log_item_t *lip;
+	static char *lid_type[] = {
+		"???",		/* 0 */
+		"5-3-buf",	/* 1 */
+		"5-3-inode",	/* 2 */
+		"efi",		/* 3 */
+		"efd",		/* 4 */
+		"iunlink",	/* 5 */
+		"6-1-inode",	/* 6 */
+		"6-1-buf",	/* 7 */
+		"inode",	/* 8 */
+		"buf",		/* 9 */
+		"dquot",	/* 10 */
+		0
+		};
+	static char *li_flags[] = {
+		"in ail",	/* 0x1 */
+		0
+		};
+	int count;
+
+	if ((mp->m_ail.ail_forw == NULL) ||
+	    (mp->m_ail.ail_forw == (xfs_log_item_t *)&mp->m_ail)) {
+		kdb_printf("AIL is empty\n");
+		return;
+	}
+	kdb_printf("AIL for mp 0x%p, oldest first\n", mp);
+	lip = (xfs_log_item_t*)mp->m_ail.ail_forw;
+	for (count = 0; lip; count++) {
+		kdb_printf("[%d] type %s ", count,
+			      lid_type[lip->li_type - XFS_LI_5_3_BUF + 1]);
+		printflags((uint)(lip->li_flags), li_flags, "flags:");
+		kdb_printf("  lsn %s\n	 ", xfs_fmtlsn(&(lip->li_lsn)));
+		switch (lip->li_type) {
+		case XFS_LI_BUF:
+			xfs_buf_item_print((xfs_buf_log_item_t *)lip, 1);
+			break;
+		case XFS_LI_INODE:
+			xfs_inode_item_print((xfs_inode_log_item_t *)lip, 1);
+			break;
+		case XFS_LI_EFI:
+			xfs_efi_item_print((xfs_efi_log_item_t *)lip, 1);
+			break;
+		case XFS_LI_EFD:
+			xfs_efd_item_print((xfs_efd_log_item_t *)lip, 1);
+			break;
+		case XFS_LI_DQUOT:
+			xfs_dquot_item_print((xfs_dq_logitem_t *)lip, 1);
+			break;
+		case XFS_LI_QUOTAOFF:
+			xfs_qoff_item_print((xfs_qoff_logitem_t *)lip, 1);
+			break;
+		default:
+			kdb_printf("Unknown item type %d\n", lip->li_type);
+			break;
+		}
+
+		if (lip->li_ail.ail_forw == (xfs_log_item_t*)&mp->m_ail) {
+			lip = NULL;
+		} else {
+			lip = lip->li_ail.ail_forw;
+		}
+	}
+}
+
+/*
+ * Print xfs mount structure.
+ */
+static void
+xfsidbg_xmount(xfs_mount_t *mp)
+{
+	static char *xmount_flags[] = {
+		"WSYNC",	/* 0x0001 */
+		"INO64",	/* 0x0002 */
+		"RQCHK",	/* 0x0004 */
+		"FSCLEAN",	/* 0x0008 */
+		"FSSHUTDN",	/* 0x0010 */
+		"NOATIME",	/* 0x0020 */
+		"RETERR",	/* 0x0040 */
+		"NOALIGN",	/* 0x0080 */
+		"UNSHRD",	/* 0x0100 */
+		"RGSTRD",	/* 0x0200 */
+		"NORECVR",	/* 0x0400 */
+		"SHRD",		/* 0x0800 */
+		"IOSZ",		/* 0x1000 */
+		"OSYNC",	/* 0x2000 */
+		"NOUUID",	/* 0x4000 */
+		"32BIT",	/* 0x8000 */
+		"IRIXSGID",	/* 0x10000 */
+		"NOLOGFLUSH",	/* 0x20000 */
+		0
+	};
+
+	static char *quota_flags[] = {
+		"UQ",		/* 0x0001 */
+		"UQE",		/* 0x0002 */
+		"UQCHKD",	/* 0x0004 */
+		"PQ",		/* 0x0008 (IRIX ondisk) */
+		"GQE",		/* 0x0010 */
+		"GQCHKD",	/* 0x0020 */
+		"GQ",		/* 0x0040 */
+		"UQACTV",	/* 0x0080 */
+		"GQACTV",	/* 0x0100 */
+		"QMAYBE",	/* 0x0200 */
+		0
+	};
+
+	kdb_printf("xfs_mount at 0x%p\n", mp);
+	kdb_printf("vfsp 0x%p tid 0x%x ail_lock 0x%p &ail 0x%p\n",
+		XFS_MTOVFS(mp), mp->m_tid, &mp->m_ail_lock, &mp->m_ail);
+	kdb_printf("ail_gen 0x%x &sb 0x%p\n",
+		mp->m_ail_gen, &mp->m_sb);
+	kdb_printf("sb_lock 0x%p sb_bp 0x%p dev 0x%x logdev 0x%x rtdev 0x%x\n",
+		&mp->m_sb_lock, mp->m_sb_bp,
+		mp->m_ddev_targp->pbr_dev,
+		mp->m_logdev_targp->pbr_dev,
+		mp->m_rtdev_targp->pbr_dev);
+	kdb_printf("bsize %d agfrotor %d agirotor %d ihash 0x%p ihsize %d\n",
+		mp->m_bsize, mp->m_agfrotor, mp->m_agirotor,
+		mp->m_ihash, mp->m_ihsize);
+	kdb_printf("inodes 0x%p ilock 0x%p ireclaims 0x%x\n",
+		mp->m_inodes, &mp->m_ilock, mp->m_ireclaims);
+	kdb_printf("readio_log 0x%x readio_blocks 0x%x ",
+		mp->m_readio_log, mp->m_readio_blocks);
+	kdb_printf("writeio_log 0x%x writeio_blocks 0x%x\n",
+		mp->m_writeio_log, mp->m_writeio_blocks);
+	kdb_printf("logbufs %d logbsize %d LOG 0x%p\n", mp->m_logbufs,
+		mp->m_logbsize, mp->m_log);
+	kdb_printf("rsumlevels 0x%x rsumsize 0x%x rbmip 0x%p rsumip 0x%p\n",
+		mp->m_rsumlevels, mp->m_rsumsize, mp->m_rbmip, mp->m_rsumip);
+	kdb_printf("rootip 0x%p\n", mp->m_rootip);
+	kdb_printf("dircook_elog %d blkbit_log %d blkbb_log %d agno_log %d\n",
+		mp->m_dircook_elog, mp->m_blkbit_log, mp->m_blkbb_log,
+		mp->m_agno_log);
+	kdb_printf("agino_log %d nreadaheads %d inode cluster size %d\n",
+		mp->m_agino_log, mp->m_nreadaheads,
+		mp->m_inode_cluster_size);
+	kdb_printf("blockmask 0x%x blockwsize 0x%x blockwmask 0x%x\n",
+		mp->m_blockmask, mp->m_blockwsize, mp->m_blockwmask);
+	kdb_printf("alloc_mxr[lf,nd] %d %d alloc_mnr[lf,nd] %d %d\n",
+		mp->m_alloc_mxr[0], mp->m_alloc_mxr[1],
+		mp->m_alloc_mnr[0], mp->m_alloc_mnr[1]);
+	kdb_printf("bmap_dmxr[lfnr,ndnr] %d %d bmap_dmnr[lfnr,ndnr] %d %d\n",
+		mp->m_bmap_dmxr[0], mp->m_bmap_dmxr[1],
+		mp->m_bmap_dmnr[0], mp->m_bmap_dmnr[1]);
+	kdb_printf("inobt_mxr[lf,nd] %d %d inobt_mnr[lf,nd] %d %d\n",
+		mp->m_inobt_mxr[0], mp->m_inobt_mxr[1],
+		mp->m_inobt_mnr[0], mp->m_inobt_mnr[1]);
+	kdb_printf("ag_maxlevels %d bm_maxlevels[d,a] %d %d in_maxlevels %d\n",
+		mp->m_ag_maxlevels, mp->m_bm_maxlevels[0],
+		mp->m_bm_maxlevels[1], mp->m_in_maxlevels);
+	kdb_printf("perag 0x%p &peraglock 0x%p &growlock 0x%p\n",
+		mp->m_perag, &mp->m_peraglock, &mp->m_growlock);
+	printflags(mp->m_flags, xmount_flags,"flags");
+	kdb_printf("ialloc_inos %d ialloc_blks %d litino %d\n",
+		mp->m_ialloc_inos, mp->m_ialloc_blks, mp->m_litino);
+	kdb_printf("attroffset %d da_node_ents %d maxicount %Ld inoalign_mask %d\n",
+		mp->m_attroffset, mp->m_da_node_ents, mp->m_maxicount,
+		mp->m_inoalign_mask);
+	kdb_printf("resblks %Ld resblks_avail %Ld\n", mp->m_resblks,
+		mp->m_resblks_avail);
+#if XFS_BIG_FILESYSTEMS
+	kdb_printf(" inoadd %llx\n", (unsigned long long) mp->m_inoadd);
+#else
+	kdb_printf("\n");
+#endif
+	if (mp->m_quotainfo)
+		kdb_printf("quotainfo 0x%p (uqip = 0x%p, gqip = 0x%p)\n",
+			mp->m_quotainfo,
+			mp->m_quotainfo->qi_uquotaip,
+			mp->m_quotainfo->qi_gquotaip);
+	else
+		kdb_printf("quotainfo NULL\n");
+	printflags(mp->m_qflags, quota_flags,"quotaflags");
+	kdb_printf("\n");
+	kdb_printf("dalign %d swidth %d sinoalign %d attr_magicpct %d dir_magicpct %d\n",
+		mp->m_dalign, mp->m_swidth, mp->m_sinoalign,
+		mp->m_attr_magicpct, mp->m_dir_magicpct);
+	kdb_printf("mk_sharedro %d dirversion %d dirblkfsbs %d &dirops 0x%p\n",
+		mp->m_mk_sharedro, mp->m_dirversion, mp->m_dirblkfsbs,
+		&mp->m_dirops);
+	kdb_printf("dirblksize %d dirdatablk 0x%Lx dirleafblk 0x%Lx dirfreeblk 0x%Lx\n",
+		mp->m_dirblksize,
+		(xfs_dfiloff_t)mp->m_dirdatablk,
+		(xfs_dfiloff_t)mp->m_dirleafblk,
+		(xfs_dfiloff_t)mp->m_dirfreeblk);
+	kdb_printf("chsize %d chash 0x%p\n",
+		mp->m_chsize, mp->m_chash);
+	kdb_printf("m_lstripemask %d\n", mp->m_lstripemask);
+	kdb_printf("m_frozen %d m_active_trans %d\n",
+		mp->m_frozen, mp->m_active_trans.counter);
+	if (mp->m_fsname != NULL)
+		kdb_printf("mountpoint \"%s\"\n", mp->m_fsname);
+	else
+		kdb_printf("No name!!!\n");
+
+}
+
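+/*
+ * Print the inode hash table chain lengths for a mount, followed by
+ * a histogram of chain lengths (0 through 20+).
+ */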
+static void
+xfsidbg_xihash(xfs_mount_t *mp)
+{
+	xfs_ihash_t	*ih;
+	int		i;
+	int		j;
+	int		total;
+	int		numzeros;
+	xfs_inode_t	*ip;
+	int		*hist;
+	int		hist_bytes = mp->m_ihsize * sizeof(int);
+	int		hist2[21];
+
+	hist = (int *) kmalloc(hist_bytes, GFP_KERNEL);
+
+	if (hist == NULL) {
+		kdb_printf("xfsidbg_xihash: kmalloc(%d) failed!\n",
+							hist_bytes);
+		return;
+	}
+
+	for (i = 0; i < mp->m_ihsize; i++) {
+		ih = mp->m_ihash + i;
+		j = 0;
+		for (ip = ih->ih_next; ip != NULL; ip = ip->i_next)
+			j++;
+		hist[i] = j;
+	}
+
+	numzeros = total = 0;
+
+	for (i = 0; i < 21; i++)
+		hist2[i] = 0;
+
+	for (i = 0; i < mp->m_ihsize; i++)  {
+		kdb_printf("%d ", hist[i]);
+		total += hist[i];
+		numzeros += hist[i] == 0 ? 1 : 0;
+		if (hist[i] > 20)
+			j = 20;
+		else
+			j = hist[i];
+
+		if (j > 20) {
+			kdb_printf("xfsidbg_xihash: (j > 20)/%d @ line # %d\n",
+							j, __LINE__);
+			kfree(hist);	/* don't leak the histogram on this path */
+			return;
+		}
+
+		hist2[j]++;
+	}
+
+	kdb_printf("\n");
+
+	kdb_printf("total inodes = %d, average length = %d, adjusted average = %d \n",
+		total, total / mp->m_ihsize,
+		total / (mp->m_ihsize - numzeros));
+
+	for (i = 0; i < 21; i++)  {
+		kdb_printf("%d - %d , ", i, hist2[i]);
+	}
+	kdb_printf("\n");
+	kfree(hist);
+}
+
+/*
+ * Command to print xfs inodes: kp xnode <addr>
+ */
+static void
+xfsidbg_xnode(xfs_inode_t *ip)
+{
+	static char *tab_flags[] = {
+		"grio",		/* XFS_IGRIO */
+		"uiosize",	/* XFS_IUIOSZ */
+		"quiesce",	/* XFS_IQUIESCE */
+		"reclaim",	/* XFS_IRECLAIM */
+		NULL
+	};
+
+	kdb_printf("hash 0x%p next 0x%p prevp 0x%p mount 0x%p\n",
+		ip->i_hash,
+		ip->i_next,
+		ip->i_prevp,
+		ip->i_mount);
+	kdb_printf("mnext 0x%p mprev 0x%p vnode 0x%p \n",
+		ip->i_mnext,
+		ip->i_mprev,
+		XFS_ITOV_NULL(ip));
+	kdb_printf("dev %x ino %s\n",
+		ip->i_mount->m_dev,
+		xfs_fmtino(ip->i_ino, ip->i_mount));
+	kdb_printf("blkno 0x%llx len 0x%x boffset 0x%x\n",
+		(long long) ip->i_blkno,
+		ip->i_len,
+		ip->i_boffset);
+	kdb_printf("transp 0x%p &itemp 0x%p\n",
+		ip->i_transp,
+		ip->i_itemp);
+	kdb_printf("&lock 0x%p &iolock 0x%p",
+		&ip->i_lock,
+		&ip->i_iolock);
+	kdb_printf("&flock 0x%p (%d) pincount 0x%x\n",
+		&ip->i_flock, valusema(&ip->i_flock),
+		xfs_ipincount(ip));
+	kdb_printf("udquotp 0x%p gdquotp 0x%p\n",
+		ip->i_udquot, ip->i_gdquot);
+	kdb_printf("new_size %Lx\n", ip->i_iocore.io_new_size);
+	printflags((int)ip->i_flags, tab_flags, "flags");
+	kdb_printf("\n");
+	kdb_printf("update_core 0x%x update size 0x%x\n",
+		(int)(ip->i_update_core), (int) ip->i_update_size);
+	kdb_printf("gen 0x%x delayed blks %d",
+		ip->i_gen,
+		ip->i_delayed_blks);
+	kdb_printf("\n");
+	kdb_printf("chash 0x%p cnext 0x%p cprev 0x%p\n",
+		ip->i_chash,
+		ip->i_cnext,
+		ip->i_cprev);
+	xfs_xnode_fork("data", &ip->i_df);
+	xfs_xnode_fork("attr", ip->i_afp);
+	kdb_printf("\n");
+	xfs_prdinode_core(&ip->i_d, ARCH_NOCONVERT);
+}
+
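+/*
+ * Print an xfs iocore, noting whether it belongs to an XFS inode.
+ */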
+static void
+xfsidbg_xcore(xfs_iocore_t *io)
+{
+	if (IO_IS_XFS(io)) {
+		kdb_printf("io_obj 0x%p (xinode) io_mount 0x%p\n",
+			io->io_obj, io->io_mount);
+	} else {
+		kdb_printf("io_obj 0x%p (dcxvn) io_mount 0x%p\n",
+			io->io_obj, io->io_mount);
+	}
+	kdb_printf("new_size %Lx\n", io->io_new_size);
+}
+
+/*
+ * Command to print xfs inode cluster hash table: kp xchash <addr>
+ */
+static void
+xfsidbg_xchash(xfs_mount_t *mp)
+{
+	int		i;
+	xfs_chash_t	*ch;
+
+	kdb_printf("m_chash 0x%p size %d\n",
+		mp->m_chash, mp->m_chsize);
+	for (i = 0; i < mp->m_chsize; i++) {
+		ch = mp->m_chash + i;
+		kdb_printf("[%3d] ch 0x%p chashlist 0x%p\n", i, ch, ch->ch_list);
+		xfsidbg_xchashlist(ch->ch_list);
+	}
+}
+
+/*
+ * Command to print xfs inode cluster hash list: kp xchashlist <addr>
+ */
+static void
+xfsidbg_xchashlist(xfs_chashlist_t *chl)
+{
+	xfs_inode_t	*ip;
+
+	while (chl != NULL) {
+#ifdef DEBUG
+		kdb_printf("hashlist inode 0x%p blkno %Ld buf 0x%p",
+		       chl->chl_ip, chl->chl_blkno, chl->chl_buf);
+#else
+		kdb_printf("hashlist inode 0x%p blkno %lld",
+		       chl->chl_ip, (long long) chl->chl_blkno);
+#endif
+
+		kdb_printf("\n");
+
+		/* print inodes on chashlist */
+		ip = chl->chl_ip;
+		do {
+			kdb_printf("0x%p ", ip);
+			ip = ip->i_cnext;
+		} while (ip != chl->chl_ip);
+		kdb_printf("\n");
+
+		chl = chl->chl_next;
+	}
+}
+
+/*
+ * Print xfs per-ag data structures for filesystem.
+ */
+static void
+xfsidbg_xperag(xfs_mount_t *mp)
+{
+	xfs_agnumber_t	agno;
+	xfs_perag_t	*pag;
+	int		busy;
+
+	pag = mp->m_perag;
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++, pag++) {
+		kdb_printf("ag %d f_init %d i_init %d\n",
+			agno, pag->pagf_init, pag->pagi_init);
+		if (pag->pagf_init)
+			kdb_printf(
+	"    f_levels[b,c] %d,%d f_flcount %d f_freeblks %d f_longest %d\n"
+	"    f_metadata %d\n",
+				pag->pagf_levels[XFS_BTNUM_BNOi],
+				pag->pagf_levels[XFS_BTNUM_CNTi],
+				pag->pagf_flcount, pag->pagf_freeblks,
+				pag->pagf_longest, pag->pagf_metadata);
+		if (pag->pagi_init)
+			kdb_printf("	i_freecount %d i_inodeok %d\n",
+				pag->pagi_freecount, pag->pagi_inodeok);
+
+		for (busy = 0; busy < XFS_PAGB_NUM_SLOTS; busy++) {
+			kdb_printf("	 %04d: start %d length %d tp 0x%p\n",
+				busy,
+				pag->pagb_list[busy].busy_start,
+				pag->pagb_list[busy].busy_length,
+				pag->pagb_list[busy].busy_tp);
+		}
+	}
+}
+
+#ifdef CONFIG_XFS_QUOTA
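+/*
+ * Print the global XFS quota manager structure.
+ */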
+static void
+xfsidbg_xqm(void)
+{
+	if (xfs_Gqm == NULL) {
+		kdb_printf("NULL XQM!!\n");
+		return;
+	}
+
+	kdb_printf("usrhtab 0x%p\tgrphtab 0x%p\tndqfree 0x%x\thashmask 0x%x\n",
+		xfs_Gqm->qm_usr_dqhtable,
+		xfs_Gqm->qm_grp_dqhtable,
+		xfs_Gqm->qm_dqfreelist.qh_nelems,
+		xfs_Gqm->qm_dqhashmask);
+	kdb_printf("&freelist 0x%p, totaldquots 0x%x nrefs 0x%x\n",
+		&xfs_Gqm->qm_dqfreelist,
+		atomic_read(&xfs_Gqm->qm_totaldquots),
+		xfs_Gqm->qm_nrefs);
+}
+#endif
+
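+/*
+ * Print an on-disk dquot.
+ */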
+static void
+xfsidbg_xqm_diskdq(xfs_disk_dquot_t *d)
+{
+	kdb_printf("magic 0x%x\tversion 0x%x\tID 0x%x (%d)\t\n",
+		INT_GET(d->d_magic, ARCH_CONVERT),
+		INT_GET(d->d_version, ARCH_CONVERT),
+		INT_GET(d->d_id, ARCH_CONVERT),
+		INT_GET(d->d_id, ARCH_CONVERT));
+	kdb_printf("bhard 0x%llx\tbsoft 0x%llx\tihard 0x%llx\tisoft 0x%llx\n",
+		(unsigned long long)INT_GET(d->d_blk_hardlimit, ARCH_CONVERT),
+		(unsigned long long)INT_GET(d->d_blk_softlimit, ARCH_CONVERT),
+		(unsigned long long)INT_GET(d->d_ino_hardlimit, ARCH_CONVERT),
+		(unsigned long long)INT_GET(d->d_ino_softlimit, ARCH_CONVERT));
+	kdb_printf("bcount 0x%llx icount 0x%llx\n",
+		(unsigned long long)INT_GET(d->d_bcount, ARCH_CONVERT),
+		(unsigned long long)INT_GET(d->d_icount, ARCH_CONVERT));
+	kdb_printf("btimer 0x%x itimer 0x%x \n",
+		(int)INT_GET(d->d_btimer, ARCH_CONVERT),
+		(int)INT_GET(d->d_itimer, ARCH_CONVERT));
+}
+
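+/*
+ * Print an in-core dquot, then its on-disk core.
+ */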
+static void
+xfsidbg_xqm_dquot(xfs_dquot_t *dqp)
+{
+	static char *qflags[] = {
+		"USR",
+		"GRP",
+		"LCKD",
+		"FLKD",
+		"DIRTY",
+		"WANT",
+		"INACT",
+		"MARKER",
+		0
+	};
+	kdb_printf("mount 0x%p hash 0x%p gdquotp 0x%p HL_next 0x%p HL_prevp 0x%p\n",
+		dqp->q_mount,
+		dqp->q_hash,
+		dqp->q_gdquot,
+		dqp->HL_NEXT,
+		dqp->HL_PREVP);
+	kdb_printf("MPL_next 0x%p MPL_prevp 0x%p FL_next 0x%p FL_prev 0x%p\n",
+		dqp->MPL_NEXT,
+		dqp->MPL_PREVP,
+		dqp->dq_flnext,
+		dqp->dq_flprev);
+
+	kdb_printf("nrefs 0x%x, res_bcount %d, ",
+		dqp->q_nrefs, (int) dqp->q_res_bcount);
+	printflags(dqp->dq_flags, qflags, "flags:");
+	kdb_printf("\nblkno 0x%llx\tboffset 0x%x\n",
+		(unsigned long long) dqp->q_blkno, (int) dqp->q_bufoffset);
+	kdb_printf("qlock 0x%p	flock 0x%p (%s) pincount 0x%x\n",
+		&dqp->q_qlock,
+		&dqp->q_flock,
+		(valusema(&dqp->q_flock) <= 0) ? "LCK" : "UNLKD",
+		dqp->q_pincount);
+	kdb_printf("disk-dquot 0x%p\n", &dqp->q_core);
+	xfsidbg_xqm_diskdq(&dqp->q_core);
+
+}
+
+
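+/*
+ * Walk a dquot list linked through the given NXT field and print a
+ * one-line summary for each dquot on it.
+ */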
+#define XQMIDBG_LIST_PRINT(l, NXT) \
+{ \
+	  xfs_dquot_t	*dqp;\
+	  int i = 0; \
+	  kdb_printf("[#%d dquots]\n", (int) (l)->qh_nelems); \
+	  for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) {\
+	   kdb_printf( \
+	      "\t%d. [0x%p] \"%d (%s)\"\t blks = %d, inos = %d refs = %d\n", \
+			 ++i, dqp, (int) INT_GET(dqp->q_core.d_id, ARCH_CONVERT), \
+			 DQFLAGTO_TYPESTR(dqp),	     \
+			 (int) INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT), \
+			 (int) INT_GET(dqp->q_core.d_icount, ARCH_CONVERT), \
+			 (int) dqp->q_nrefs); }\
+	  kdb_printf("\n"); \
+}
+
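+/*
+ * Print every in-core inode of a mount that has dquots attached.
+ */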
+static void
+xfsidbg_xqm_dqattached_inos(xfs_mount_t *mp)
+{
+	xfs_inode_t	*ip;
+	int		n = 0;
+
+	ip = mp->m_inodes;
+	if (ip != NULL) {
+		do {
+			if (ip->i_mount == NULL) {
+				ip = ip->i_mnext;
+				continue;
+			}
+			if (ip->i_udquot || ip->i_gdquot) {
+				n++;
+				kdb_printf("inode = 0x%p, ino %d: udq 0x%p, gdq 0x%p\n",
+					ip, (int)ip->i_ino, ip->i_udquot, ip->i_gdquot);
+			}
+			ip = ip->i_mnext;
+		} while (ip != mp->m_inodes);
+	}
+	kdb_printf("\nNumber of inodes with dquots attached: %d\n", n);
+}
+
+#ifdef	CONFIG_XFS_QUOTA
+static void
+xfsidbg_xqm_freelist_print(xfs_frlist_t *qlist, char *title)
+{
+	xfs_dquot_t *dq;
+	int i = 0;
+	kdb_printf("%s (#%d)\n", title, (int) qlist->qh_nelems);
+	FOREACH_DQUOT_IN_FREELIST(dq, qlist) {
+		kdb_printf("\t%d.\t\"%d (%s:0x%p)\"\t bcnt = %d, icnt = %d "
+		       "refs = %d\n",
+		       ++i, (int) INT_GET(dq->q_core.d_id, ARCH_CONVERT),
+		       DQFLAGTO_TYPESTR(dq), dq,
+		       (int) INT_GET(dq->q_core.d_bcount, ARCH_CONVERT),
+		       (int) INT_GET(dq->q_core.d_icount, ARCH_CONVERT),
+		       (int) dq->q_nrefs);
+	}
+}
+
+static void
+xfsidbg_xqm_freelist(void)
+{
+	if (xfs_Gqm) {
+		xfsidbg_xqm_freelist_print(&(xfs_Gqm->qm_dqfreelist), "Freelist");
+	} else
+		kdb_printf("NULL XQM!!\n");
+}
+
+static void
+xfsidbg_xqm_htab(void)
+{
+	int		i;
+	xfs_dqhash_t	*h;
+
+	if (xfs_Gqm == NULL) {
+		kdb_printf("NULL XQM!!\n");
+		return;
+	}
+	for (i = 0; i <= xfs_Gqm->qm_dqhashmask; i++) {
+		h = &xfs_Gqm->qm_usr_dqhtable[i];
+		if (h->qh_next) {
+			kdb_printf("USR %d: ", i);
+			XQMIDBG_LIST_PRINT(h, HL_NEXT);
+		}
+	}
+	for (i = 0; i <= xfs_Gqm->qm_dqhashmask; i++) {
+		h = &xfs_Gqm->qm_grp_dqhtable[i];
+		if (h->qh_next) {
+			kdb_printf("GRP %d: ", i);
+			XQMIDBG_LIST_PRINT(h, HL_NEXT);
+		}
+	}
+}
+#endif
+
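+/*
+ * Print the dquots on a mount's per-mount dquot list.
+ */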
+static void
+xfsidbg_xqm_mplist(xfs_mount_t *mp)
+{
+	if (mp->m_quotainfo == NULL) {
+		kdb_printf("NULL quotainfo\n");
+		return;
+	}
+
+	XQMIDBG_LIST_PRINT(&(mp->m_quotainfo->qi_dqlist), MPL_NEXT);
+
+}
+
+
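+/*
+ * Print the quota information attached to a mount.
+ */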
+static void
+xfsidbg_xqm_qinfo(xfs_mount_t *mp)
+{
+	if (mp == NULL || mp->m_quotainfo == NULL) {
+		kdb_printf("NULL quotainfo\n");
+		return;
+	}
+
+	kdb_printf("uqip 0x%p, gqip 0x%p, &pinlock 0x%p &dqlist 0x%p\n",
+		mp->m_quotainfo->qi_uquotaip,
+		mp->m_quotainfo->qi_gquotaip,
+		&mp->m_quotainfo->qi_pinlock,
+		&mp->m_quotainfo->qi_dqlist);
+
+	kdb_printf("nreclaims %d, btmlimit 0x%x, itmlimit 0x%x, RTbtmlim 0x%x\n",
+		(int)mp->m_quotainfo->qi_dqreclaims,
+		(int)mp->m_quotainfo->qi_btimelimit,
+		(int)mp->m_quotainfo->qi_itimelimit,
+		(int)mp->m_quotainfo->qi_rtbtimelimit);
+
+	kdb_printf("bwarnlim 0x%x, iwarnlim 0x%x, &qofflock 0x%p, "
+		"chunklen 0x%x, dqperchunk 0x%x\n",
+		(int)mp->m_quotainfo->qi_bwarnlimit,
+		(int)mp->m_quotainfo->qi_iwarnlimit,
+		&mp->m_quotainfo->qi_quotaofflock,
+		(int)mp->m_quotainfo->qi_dqchunklen,
+		(int)mp->m_quotainfo->qi_dqperchunk);
+}
+
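+/*
+ * Print the dquot reservations and deltas attached to a transaction.
+ */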
+static void
+xfsidbg_xqm_tpdqinfo(xfs_trans_t *tp)
+{
+	xfs_dqtrx_t	*qa, *q;
+	int		i,j;
+
+	kdb_printf("dqinfo 0x%p\n", tp->t_dqinfo);
+	if (! tp->t_dqinfo)
+		return;
+	kdb_printf("USR: \n");
+	qa = tp->t_dqinfo->dqa_usrdquots;
+	for (j = 0; j < 2; j++) {
+		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			if (qa[i].qt_dquot == NULL)
+				break;
+			q = &qa[i];
+			kdb_printf(
+  "\"%d\"[0x%p]: bres %d, bres-used %d, bdelta %d, del-delta %d, icnt-delta %d\n",
+				(int) q->qt_dquot->q_core.d_id,
+				q->qt_dquot,
+				(int) q->qt_blk_res,
+				(int) q->qt_blk_res_used,
+				(int) q->qt_bcount_delta,
+				(int) q->qt_delbcnt_delta,
+				(int) q->qt_icount_delta);
+		}
+		if (j == 0) {
+			qa = tp->t_dqinfo->dqa_grpdquots;
+			kdb_printf("GRP: \n");
+		}
+	}
+
+}
+
+
+
+/*
+ * Print xfs superblock.
+ */
+static void
+xfsidbg_xsb(xfs_sb_t *sbp, int convert)
+{
+	xfs_arch_t arch = convert ? ARCH_CONVERT : ARCH_NOCONVERT;
+
+	kdb_printf(convert ? "<converted>\n" : "<unconverted>\n");
+
+	kdb_printf("magicnum 0x%x blocksize 0x%x dblocks %Ld rblocks %Ld\n",
+		INT_GET(sbp->sb_magicnum, arch), INT_GET(sbp->sb_blocksize, arch),
+		INT_GET(sbp->sb_dblocks, arch), INT_GET(sbp->sb_rblocks, arch));
+	kdb_printf("rextents %Ld uuid %s logstart %s\n",
+		INT_GET(sbp->sb_rextents, arch),
+		xfs_fmtuuid(&sbp->sb_uuid),
+		xfs_fmtfsblock(INT_GET(sbp->sb_logstart, arch), NULL));
+	kdb_printf("rootino %s ",
+		xfs_fmtino(INT_GET(sbp->sb_rootino, arch), NULL));
+	kdb_printf("rbmino %s ",
+		xfs_fmtino(INT_GET(sbp->sb_rbmino, arch), NULL));
+	kdb_printf("rsumino %s\n",
+		xfs_fmtino(INT_GET(sbp->sb_rsumino, arch), NULL));
+	kdb_printf("rextsize 0x%x agblocks 0x%x agcount 0x%x rbmblocks 0x%x\n",
+		INT_GET(sbp->sb_rextsize, arch),
+		INT_GET(sbp->sb_agblocks, arch),
+		INT_GET(sbp->sb_agcount, arch),
+		INT_GET(sbp->sb_rbmblocks, arch));
+	kdb_printf("logblocks 0x%x versionnum 0x%x sectsize 0x%x inodesize 0x%x\n",
+		INT_GET(sbp->sb_logblocks, arch),
+		INT_GET(sbp->sb_versionnum, arch),
+		INT_GET(sbp->sb_sectsize, arch),
+		INT_GET(sbp->sb_inodesize, arch));
+	kdb_printf("inopblock 0x%x blocklog 0x%x sectlog 0x%x inodelog 0x%x\n",
+		INT_GET(sbp->sb_inopblock, arch),
+		INT_GET(sbp->sb_blocklog, arch),
+		INT_GET(sbp->sb_sectlog, arch),
+		INT_GET(sbp->sb_inodelog, arch));
+	kdb_printf("inopblog %d agblklog %d rextslog %d inprogress %d imax_pct %d\n",
+		INT_GET(sbp->sb_inopblog, arch),
+		INT_GET(sbp->sb_agblklog, arch),
+		INT_GET(sbp->sb_rextslog, arch),
+		INT_GET(sbp->sb_inprogress, arch),
+		INT_GET(sbp->sb_imax_pct, arch));
+	kdb_printf("icount %Lx ifree %Lx fdblocks %Lx frextents %Lx\n",
+		INT_GET(sbp->sb_icount, arch),
+		INT_GET(sbp->sb_ifree, arch),
+		INT_GET(sbp->sb_fdblocks, arch),
+		INT_GET(sbp->sb_frextents, arch));
+	kdb_printf("uquotino %s ", xfs_fmtino(INT_GET(sbp->sb_uquotino, arch), NULL));
+	kdb_printf("gquotino %s ", xfs_fmtino(INT_GET(sbp->sb_gquotino, arch), NULL));
+	kdb_printf("qflags 0x%x flags 0x%x shared_vn %d inoalignmt %d\n",
+		INT_GET(sbp->sb_qflags, arch), INT_GET(sbp->sb_flags, arch), INT_GET(sbp->sb_shared_vn, arch),
+		INT_GET(sbp->sb_inoalignmt, arch));
+	kdb_printf("unit %d width %d dirblklog %d\n",
+		INT_GET(sbp->sb_unit, arch), INT_GET(sbp->sb_width, arch), INT_GET(sbp->sb_dirblklog, arch));
+	kdb_printf("log sunit %d\n", INT_GET(sbp->sb_logsunit, arch));
+}
+
+
+/*
+ * Print out an XFS transaction structure.  Print summaries for
+ * each of the items.
+ */
+static void
+xfsidbg_xtp(xfs_trans_t *tp)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_busy_chunk_t	*lbcp;
+	int			i;
+	int			chunk;
+	static char *xtp_flags[] = {
+		"dirty",	/* 0x1 */
+		"sb_dirty",	/* 0x2 */
+		"perm_log_res", /* 0x4 */
+		"sync",		/* 0x08 */
+		"dq_dirty",	/* 0x10 */
+		0
+		};
+	static char *lid_flags[] = {
+		"dirty",	/* 0x1 */
+		"pinned",	/* 0x2 */
+		"sync unlock",	/* 0x4 */
+		0
+		};
+
+	kdb_printf("tp 0x%p type ", tp);
+	switch (tp->t_type) {
+	case XFS_TRANS_SETATTR_NOT_SIZE: kdb_printf("SETATTR_NOT_SIZE");	break;
+	case XFS_TRANS_SETATTR_SIZE:	kdb_printf("SETATTR_SIZE");	break;
+	case XFS_TRANS_INACTIVE:	kdb_printf("INACTIVE");		break;
+	case XFS_TRANS_CREATE:		kdb_printf("CREATE");		break;
+	case XFS_TRANS_CREATE_TRUNC:	kdb_printf("CREATE_TRUNC");	break;
+	case XFS_TRANS_TRUNCATE_FILE:	kdb_printf("TRUNCATE_FILE");	break;
+	case XFS_TRANS_REMOVE:		kdb_printf("REMOVE");		break;
+	case XFS_TRANS_LINK:		kdb_printf("LINK");		break;
+	case XFS_TRANS_RENAME:		kdb_printf("RENAME");		break;
+	case XFS_TRANS_MKDIR:		kdb_printf("MKDIR");		break;
+	case XFS_TRANS_RMDIR:		kdb_printf("RMDIR");		break;
+	case XFS_TRANS_SYMLINK:		kdb_printf("SYMLINK");		break;
+	case XFS_TRANS_SET_DMATTRS:	kdb_printf("SET_DMATTRS");		break;
+	case XFS_TRANS_GROWFS:		kdb_printf("GROWFS");		break;
+	case XFS_TRANS_STRAT_WRITE:	kdb_printf("STRAT_WRITE");		break;
+	case XFS_TRANS_DIOSTRAT:	kdb_printf("DIOSTRAT");		break;
+	case XFS_TRANS_WRITE_SYNC:	kdb_printf("WRITE_SYNC");		break;
+	case XFS_TRANS_WRITEID:		kdb_printf("WRITEID");		break;
+	case XFS_TRANS_ADDAFORK:	kdb_printf("ADDAFORK");		break;
+	case XFS_TRANS_ATTRINVAL:	kdb_printf("ATTRINVAL");		break;
+	case XFS_TRANS_ATRUNCATE:	kdb_printf("ATRUNCATE");		break;
+	case XFS_TRANS_ATTR_SET:	kdb_printf("ATTR_SET");		break;
+	case XFS_TRANS_ATTR_RM:		kdb_printf("ATTR_RM");		break;
+	case XFS_TRANS_ATTR_FLAG:	kdb_printf("ATTR_FLAG");		break;
+	case XFS_TRANS_CLEAR_AGI_BUCKET:  kdb_printf("CLEAR_AGI_BUCKET");	break;
+	case XFS_TRANS_QM_SBCHANGE:	kdb_printf("QM_SBCHANGE");	break;
+	case XFS_TRANS_QM_QUOTAOFF:	kdb_printf("QM_QUOTAOFF");	break;
+	case XFS_TRANS_QM_DQALLOC:	kdb_printf("QM_DQALLOC");		break;
+	case XFS_TRANS_QM_SETQLIM:	kdb_printf("QM_SETQLIM");		break;
+	case XFS_TRANS_QM_DQCLUSTER:	kdb_printf("QM_DQCLUSTER");	break;
+	case XFS_TRANS_QM_QINOCREATE:	kdb_printf("QM_QINOCREATE");	break;
+	case XFS_TRANS_QM_QUOTAOFF_END: kdb_printf("QM_QOFF_END");		break;
+	case XFS_TRANS_SB_UNIT:		kdb_printf("SB_UNIT");		break;
+	case XFS_TRANS_FSYNC_TS:	kdb_printf("FSYNC_TS");		break;
+	case XFS_TRANS_GROWFSRT_ALLOC:	kdb_printf("GROWFSRT_ALLOC");	break;
+	case XFS_TRANS_GROWFSRT_ZERO:	kdb_printf("GROWFSRT_ZERO");	break;
+	case XFS_TRANS_GROWFSRT_FREE:	kdb_printf("GROWFSRT_FREE");	break;
+
+	default:			kdb_printf("0x%x", tp->t_type); break;
+	}
+	kdb_printf(" mount 0x%p\n", tp->t_mountp);
+	kdb_printf("flags ");
+	printflags(tp->t_flags, xtp_flags,"xtp");
+	kdb_printf("\n");
+	kdb_printf("callback 0x%p forw 0x%p back 0x%p\n",
+		&tp->t_logcb, tp->t_forw, tp->t_back);
+	kdb_printf("log res %d block res %d block res used %d\n",
+		tp->t_log_res, tp->t_blk_res, tp->t_blk_res_used);
+	kdb_printf("rt res %d rt res used %d\n", tp->t_rtx_res,
+		tp->t_rtx_res_used);
+	kdb_printf("ticket 0x%lx lsn %s\n",
+		(unsigned long) tp->t_ticket, xfs_fmtlsn(&tp->t_lsn));
+	kdb_printf("callback 0x%p callarg 0x%p\n",
+		tp->t_callback, tp->t_callarg);
+	kdb_printf("icount delta %ld ifree delta %ld\n",
+		tp->t_icount_delta, tp->t_ifree_delta);
+	kdb_printf("blocks delta %ld res blocks delta %ld\n",
+		tp->t_fdblocks_delta, tp->t_res_fdblocks_delta);
+	kdb_printf("rt delta %ld res rt delta %ld\n",
+		tp->t_frextents_delta, tp->t_res_frextents_delta);
+	kdb_printf("ag freeblks delta %ld ag flist delta %ld ag btree delta %ld\n",
+		tp->t_ag_freeblks_delta, tp->t_ag_flist_delta,
+		tp->t_ag_btree_delta);
+	kdb_printf("dblocks delta %ld agcount delta %ld imaxpct delta %ld\n",
+		tp->t_dblocks_delta, tp->t_agcount_delta, tp->t_imaxpct_delta);
+	kdb_printf("rextsize delta %ld rbmblocks delta %ld\n",
+		tp->t_rextsize_delta, tp->t_rbmblocks_delta);
+	kdb_printf("rblocks delta %ld rextents delta %ld rextslog delta %ld\n",
+		tp->t_rblocks_delta, tp->t_rextents_delta,
+		tp->t_rextslog_delta);
+	kdb_printf("dqinfo 0x%p\n", tp->t_dqinfo);
+	kdb_printf("log items:\n");
+	licp = &tp->t_items;
+	chunk = 0;
+	while (licp != NULL) {
+		if (XFS_LIC_ARE_ALL_FREE(licp)) {
+			licp = licp->lic_next;
+			chunk++;
+			continue;
+		}
+		for (i = 0; i < licp->lic_unused; i++) {
+			if (XFS_LIC_ISFREE(licp, i)) {
+				continue;
+			}
+
+			lidp = XFS_LIC_SLOT(licp, i);
+			kdb_printf("\n");
+			kdb_printf("chunk %d index %d item 0x%p size %d\n",
+				chunk, i, lidp->lid_item, lidp->lid_size);
+			kdb_printf("flags ");
+			printflags(lidp->lid_flags, lid_flags,"lic");
+			kdb_printf("\n");
+			xfsidbg_xlogitem(lidp->lid_item);
+		}
+		chunk++;
+		licp = licp->lic_next;
+	}
+
+	kdb_printf("log busy free %d, list:\n", tp->t_busy_free);
+	lbcp = &tp->t_busy;
+	chunk = 0;
+	while (lbcp != NULL) {
+		kdb_printf("Chunk %d at 0x%p next 0x%p free 0x%08x unused %d\n",
+			chunk, lbcp, lbcp->lbc_next, lbcp->lbc_free,
+			lbcp->lbc_unused);
+		for (i = 0; i < XFS_LBC_NUM_SLOTS; i++) {
+			kdb_printf("  %02d: ag %d idx %d\n",
+				i,
+				lbcp->lbc_busy[i].lbc_ag,
+				lbcp->lbc_busy[i].lbc_idx);
+		}
+		lbcp = lbcp->lbc_next;
+	}
+}
+
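+/*
+ * Print the precomputed transaction reservation sizes for a mount.
+ */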
+static void
+xfsidbg_xtrans_res(
+	xfs_mount_t	*mp)
+{
+	xfs_trans_reservations_t	*xtrp;
+
+	xtrp = &mp->m_reservations;
+	kdb_printf("write: %d\ttruncate: %d\trename: %d\n",
+		xtrp->tr_write, xtrp->tr_itruncate, xtrp->tr_rename);
+	kdb_printf("link: %d\tremove: %d\tsymlink: %d\n",
+		xtrp->tr_link, xtrp->tr_remove, xtrp->tr_symlink);
+	kdb_printf("create: %d\tmkdir: %d\tifree: %d\n",
+		xtrp->tr_create, xtrp->tr_mkdir, xtrp->tr_ifree);
+	kdb_printf("ichange: %d\tgrowdata: %d\tswrite: %d\n",
+		xtrp->tr_ichange, xtrp->tr_growdata, xtrp->tr_swrite);
+	kdb_printf("addafork: %d\twriteid: %d\tattrinval: %d\n",
+		xtrp->tr_addafork, xtrp->tr_writeid, xtrp->tr_attrinval);
+	kdb_printf("attrset: %d\tattrrm: %d\tclearagi: %d\n",
+		xtrp->tr_attrset, xtrp->tr_attrrm, xtrp->tr_clearagi);
+	kdb_printf("growrtalloc: %d\tgrowrtzero: %d\tgrowrtfree: %d\n",
+		xtrp->tr_growrtalloc, xtrp->tr_growrtzero, xtrp->tr_growrtfree);
+}
+
+module_init(xfsidbg_init)
+module_exit(xfsidbg_exit)
diff -Nru linux-2.4.20-pre10-mjc1/include/linux/sysctl.h linux-2.4.20-pre10-mjc2/include/linux/sysctl.h
--- linux-2.4.20-pre10-mjc1/include/linux/sysctl.h	2002-10-23 23:01:18.000000000 -0400
+++ linux-2.4.20-pre10-mjc2/include/linux/sysctl.h	2002-10-25 16:53:10.000000000 -0400
@@ -144,6 +144,7 @@
 	VM_MAX_MAP_COUNT=11,	/* int: Maximum number of active map areas */
 	VM_MIN_READAHEAD=12,    /* Min file readahead */
 	VM_MAX_READAHEAD=13,    /* Max file readahead */
+	VM_PAGEBUF=21,		/* struct: Control pagebuf parameters */
 };
 
 
@@ -549,6 +550,7 @@
 	FS_DIR_NOTIFY=14,	/* int: directory notification enabled */
 	FS_LEASE_TIME=15,	/* int: maximum time to wait for a lease break */
 	FS_DQSTATS=16,	/* dir: disc quota usage statistics */
+	FS_XFS=17,	/* struct: control xfs parameters */
 };
 
 /* /proc/sys/fs/quota/ */
diff -Nru linux-2.4.20-pre10-mjc1/kernel/ksyms.c linux-2.4.20-pre10-mjc2/kernel/ksyms.c
--- linux-2.4.20-pre10-mjc1/kernel/ksyms.c	2002-10-25 16:50:57.000000000 -0400
+++ linux-2.4.20-pre10-mjc2/kernel/ksyms.c	2002-10-25 16:51:26.000000000 -0400
@@ -254,6 +254,7 @@
 EXPORT_SYMBOL(find_inode_number);
 EXPORT_SYMBOL(is_subdir);
 EXPORT_SYMBOL(get_unused_fd);
+EXPORT_SYMBOL(put_unused_fd);
 EXPORT_SYMBOL(vfs_create);
 EXPORT_SYMBOL(vfs_mkdir);
 EXPORT_SYMBOL(vfs_mknod);
diff -Nru linux-2.4.20-pre10-mjc1/mm/filemap.c linux-2.4.20-pre10-mjc2/mm/filemap.c
--- linux-2.4.20-pre10-mjc1/mm/filemap.c	2002-10-25 00:10:34.000000000 -0400
+++ linux-2.4.20-pre10-mjc2/mm/filemap.c	2002-10-25 16:51:26.000000000 -0400
@@ -1468,6 +1468,8 @@
 	return nr_locked + nr_clean + nr_dirty;
 }
 
+EXPORT_SYMBOL(mark_page_accessed);
+
 /*
  * This is a generic file read routine, and uses the
  * inode->i_op->readpage() function for the actual low-level
