/*
 * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
 * 
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * 
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like.  Any license provided herein, whether implied or
 * otherwise, applies only to this software file.  Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 * 
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA  94043, or:
 * 
 * http://www.sgi.com 
 * 
 * For further information regarding this notice, see: 
 * 
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ident "$Id:  $"

/*
 *	page_buf.c
 *
 *	The page_buf module provides an abstract buffer cache model on top of
 *	the Linux page cache.  Cached blocks for a file are hashed to the
 *	inode for that file, and can be held dirty in delayed write mode in
 *	the page cache.  Cached metadata blocks for a file system are hashed
 *	to the inode for the mounted device.  The page_buf module assembles
 *	buffer (page_buf_t) objects on demand to aggregate such cached pages
 *	for I/O.
 *
 *
 *      Written by William J. Earl, Steve Lord, Jim Mostek, Russell Cattelan
 *		    and Rajagopal Ananthanarayanan ("ananth") at SGI.
 *
 *	Added kiobuf-based I/O requests: Chait Tumuluri.
 *
 */

#define _PAGE_BUF_INTERNAL_ 1

#include <linux/config.h>
#include <linux/version.h>

#include <linux/module.h>
#include <linux/stddef.h>

#include <linux/spinlock.h>
#include <linux/page_buf.h>
#include <linux/fs.h>
#include <linux/smp_lock.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
#include <linux/locks.h>
#include <linux/swap.h>
#include <asm/hardirq.h>

#define PB_DEFINE_TRACES
#include <linux/page_buf_trace.h>

/* Debug switch handed to dprintk() in the file-write path. */
int pbfw_debug = 0;
/* Debug switch; not referenced in the part of the file visible here. */
int pbpw_debug = 0;
/*
 * Write statistics: completed async writes, synchronous writes, writes
 * forced synchronous because too many were outstanding, and the high-water
 * mark of pagebuf_asyncs.  The only code updating them in view is currently
 * compiled out with #if 0.
 */
int pagebuf_dones, pagebuf_syncs, pagebuf_toomany, pagebuf_max;

EXPORT_SYMBOL(pbfw_debug);
EXPORT_SYMBOL(pbpw_debug);
/* NOTE(review): delay_alloc is defined outside the visible part of this file. */
EXPORT_SYMBOL(delay_alloc);

/* STATIC expands to `static' unless the build has already defined it. */
#ifndef STATIC
#define STATIC static
#endif

/*
 * Forward declarations of helpers that are referenced before their
 * definitions appear.
 */
STATIC int _pagebuf_set_blocks(struct inode *, struct page *, int);
STATIC int _pagebuf_set_blocks_mp(struct inode *,
		struct page *, page_buf_bmap_t *, int);
STATIC int __pb_block_prepare_write(struct inode *, struct page *,
		unsigned, unsigned, int);
STATIC int __pb_block_commit_write(struct inode *, struct page *,
		unsigned, unsigned);
STATIC void __pb_block_commit_write_async(struct inode *,
		struct page *, page_buf_bmap_t *);
STATIC int pb_delalloc_convert(mem_map_t *, u_long, int);

/*
 * Count of asynchronous pagebuf writes in flight.  pagebuf_flushinval reads
 * it to decide whether to kick tq_disk; the code that increments and
 * decrements it is currently under #if 0.
 */
STATIC atomic_t	pagebuf_asyncs = ATOMIC_INIT(0);


/*
 * The minimum size where we will start using pagebuf structures instead
 * of just working with pages.
 */

#define PAGEBUF_MIN_IOSIZE (4*PAGE_SIZE)
/*
 * Largest pagebuf, in bytes, built per step by the buffered read and
 * write loops; also the size requested from bmap when writing at or
 * beyond end of file.
 */
#define PBF_IO_CHUNKSIZE 65536
/* Number of page_buf_bmap_t entries requested from pagebuf_bmap per call. */
#define PBF_MAX_MAPS	1

/*
 * The following structure is used to communicate between various levels
 * of pagebuf code. It is used by iomove, through segment_apply, into
 * the actual copyfrom or copyto routines.
 */

/*
 * The start of this deliberately looks like a read_descriptor_t in layout:
 * the read path casts a pb_io_desc_t pointer to read_descriptor_t * and the
 * read actor casts it back, so io_rdesc must stay the first member.
 */
typedef struct {
	read_descriptor_t io_rdesc;

	/* 0x10 */
	page_buf_rw_t io_dir;	/* read or write */
	loff_t io_offset;	/* Starting offset of I/O */
	int io_iovec_nr;	/* Number of entries in iovec */

	/* 0x20 */
	struct iovec **io_iovec;	/* iovec list indexed by iovec_index */
	loff_t io_iovec_offset;	/* offset into current iovec. */
	int io_iovec_index;	/* current iovec being processed */
	unsigned int io_sshift;	/* sector bit shift */
	loff_t io_i_size;	/* size of the file */
} pb_io_desc_t;


/* Shorthand for the embedded read_descriptor_t fields. */
#define io_written	io_rdesc.written	/* bytes transferred so far */
#define io_total_count	io_rdesc.count		/* bytes still to transfer */
#define io_error	io_rdesc.error		/* 0 or a negative errno */

/*
 *	_pb_zero_out_delay
 *
 *	Clear all PAGE_CACHE_SIZE bytes of the page through a temporary
 *	kernel mapping, then pass the page, inode and mapping on to
 *	__pb_block_commit_write_async.
 */
void
_pb_zero_out_delay(struct inode *inode, struct page *page, page_buf_bmap_t *mp)
{
	char *mapped;

	mapped = (char *) kmap(page);
	memset(mapped, 0, PAGE_CACHE_SIZE);
	kunmap(page);

	__pb_block_commit_write_async(inode, page, mp);
}

/*
 *	pagebuf_readahead
 *
 *	Hint that the given byte range of the inode is likely to be needed
 *	soon so that it may be read into the page cache.  It is only a
 *	hint: a later pagebuf_get is not guaranteed to find any of the
 *	data cached.
 *
 *	Not implemented yet; the call currently does nothing.
 */

void
pagebuf_readahead(
	struct inode	*ip,		/* inode for buffer (or NULL)	*/
	loff_t		ioff,		/* starting offset of range	*/
	size_t		isize)		/* length of range		*/
{
	/* XXX: readahead is not implemented, the hint is ignored */
}


/*
 *	pagebuf_flush
 *
 *	pagebuf_flush causes any buffered modified storage for the
 *	specified inode, from the page containing ioff to the end of the
 *	file, to be written back to disk.  Delayed-allocate pages in the
 *	range are converted first, then the dirty buffers are written with
 *	generic_buffer_fdatasync.
 *
 *	The interface allows PBF_ASYNC in bflags to request that the call
 *	not wait for the writes; note that this implementation does not
 *	examine bflags at all.
 */

void pagebuf_flush(
    struct inode *ip,		/* inode for range              */
    loff_t ioff,		/* first location in range      */
    page_buf_flags_t bflags)	/* buffer flags, usually        */
				/* PBF_ASYNC                    */
{
	struct list_head *head, *curr;
	struct page *page;
	unsigned long start;
	extern spinlock_t pagecache_lock;

	/* Index of the first page cache page covered by the range. */
	start = ioff >> PAGE_CACHE_SHIFT;

repeat:
	head = &ip->i_mapping->pages;
	spin_lock(&pagecache_lock);
	curr = head->next;
	while (curr != head) {
		unsigned long offset;

		page = list_entry(curr, struct page, list);
		curr = curr->next;

		offset = page->index;
		if ((offset >= start) && test_bit(PG_delalloc, &page->flags)) {
			/*
			 * Pin the page before dropping the list lock so it
			 * cannot go away while we sleep in lock_page().
			 */
			get_page(page);
			spin_unlock(&pagecache_lock);

			lock_page(page);
			/*
			 * Re-test under the page lock: the flag may have been
			 * cleared while we waited.  NOTE(review): the convert
			 * path is presumed to unlock the page itself, since
			 * only the else branch unlocks here -- confirm.
			 */
			if (test_and_clear_bit(PG_delalloc, &page->flags)) {
				pb_delalloc_convert(page, 0, 0);
			} else {
				UnlockPage(page);
			}
			page_cache_release(page);
			/*
			 * The list may have changed while the lock was
			 * dropped, so restart the scan from the head.
			 */
			goto repeat;
		}
	}
	spin_unlock(&pagecache_lock);
	/*
	 * generic_buffer_fdatasync takes page indices, not byte offsets;
	 * passing ioff here would skip every page between the real start
	 * index and the numeric value of ioff.
	 */
	generic_buffer_fdatasync(ip, start, ~0UL);
}


/*
 *	pagebuf_inval
 *
 *	Invalidate the buffered data of the inode from offset ioff onward
 *	by truncating the inode's page cache mapping with TRUNC_TOSS.
 *	bflags is accepted for interface symmetry and is not used here.
 */

void
pagebuf_inval(
	struct inode		*ip,	/* inode for range		*/
	loff_t			ioff,	/* first location in range	*/
	page_buf_flags_t	bflags)	/* buffer flags, usually	*/
					/* PBF_ASYNC			*/
{
	truncate_inode_pages(ip->i_mapping, ioff, TRUNC_TOSS);
}

/*
 *	pagebuf_flushinval
 *
 *	Write back the buffered modified storage of the inode starting at
 *	ioff (via pagebuf_flush, which receives bflags), push the disk task
 *	queue if any asynchronous pagebuf writes are outstanding, and then
 *	drop the pages from the page cache with TRUNC_NO_TOSS.
 */

void
pagebuf_flushinval(
	struct inode		*ip,	/* inode for range		*/
	loff_t			ioff,	/* first location in range	*/
	page_buf_flags_t	bflags)	/* buffer flags, usually	*/
					/* PBF_ASYNC			*/
{
	pagebuf_flush(ip, ioff, bflags);

	if (atomic_read(&pagebuf_asyncs) != 0) {
		run_task_queue(&tq_disk);
	}

	/* Until pages are dirty there is not a lot more we can do here */
	truncate_inode_pages(ip->i_mapping, ioff, TRUNC_NO_TOSS);
}


/*
 *	pagebuf_sethole
 *
 *	Intended to mark any valid in-memory pages in the given range of
 *	the inode as not being backed by storage, which would discard any
 *	cached page_buf_bmap_t information for those pages.
 *
 *	Not implemented yet; the call currently does nothing.
 */

void
pagebuf_sethole(
	struct inode	*ip,		/* inode for pages	*/
	loff_t		ioff,		/* offset in inode	*/
	size_t		isize)		/* size of area		*/
{
	/* XXX: not implemented */
}


/*
 *	_pagebuf_handle_iovecs
 *
 *	_pagebuf_handle_iovecs uses an I/O descriptor structure containing
 *	iovec(s) and copies in/out or zeros the user memory they describe.
 *	For copies the kernel side is the given page, starting at `offset'
 *	bytes into it; for PBRW_ZERO the page may be NULL.  The routine
 *	advances the descriptor (offset, counts, iovec position) by what was
 *	transferred and sets io_error to -EFAULT if a user copy comes up
 *	short.
 *
 *	pb_off is the file offset corresponding to `offset'.  If it lies
 *	before the descriptor's current offset, the leading bytes are
 *	skipped and counted as consumed.
 *
 *	Returns the number of bytes consumed from the segment (skipped
 *	bytes plus bytes transferred), or io_error if an error is recorded
 *	in the descriptor.
 */

int _pagebuf_handle_iovecs(
    pb_io_desc_t * iodp,	/* I/O descriptor */
    struct page *page,		/* Page to move to / from */
    unsigned long offset,	/* start offset into page data */
    loff_t pb_off, size_t len,	/* Total length to copy in this call */
    page_buf_rw_t op)		/* read/write/zero */
{
	int start_off;		/* bytes of the segment before io_offset */
	struct iovec *iovecp;
	void *user_addr;	/* user addr computed from iovec(s) */
	size_t copy_len,	/* Amount per copy to user */
	 left,			/* not copied by __copy_to_user */
	 iov_left_len,		/* amount to do in one iovec entry */
	 copied;		/* amount successfully copied */
	char *kern_addr = NULL;	/* current kernel address within the page */


	/*
	 * If the offsets don't match, move the page offset up
	 * and the length down.
	 */
	if (pb_off != iodp->io_offset) {
		start_off = iodp->io_offset - pb_off;
		offset += start_off;
		len -= start_off;
	} else {
		start_off = 0;
	}

	if (page)
		kern_addr = (char *) kmap(page) + offset;

	copied = start_off;	/* Tell caller we used this + what we copy
				   below. */

	while (len && iodp->io_error == 0 &&
	    iodp->io_iovec_index < iodp->io_iovec_nr &&
	    /* Don't go beyond EOF unless writing */
	    ((op == PBRW_WRITE) || (iodp->io_offset < iodp->io_i_size))) {
		iovecp = iodp->io_iovec[iodp->io_iovec_index];
		user_addr = iovecp->iov_base + iodp->io_iovec_offset;
		iov_left_len = iovecp->iov_len - iodp->io_iovec_offset;
		copy_len = min(iov_left_len, len);

		if (op != PBRW_WRITE) {
			/*
			 * Restrict read/zero size to what is left
			 * in the file.
			 */
			copy_len = min(copy_len,
			    iodp->io_i_size - iodp->io_offset);
		}

		/*
		 * Depending on the op, copy to/from or just zero
		 */
		switch (op) {
		case PBRW_READ:
			left = __copy_to_user(user_addr, kern_addr, copy_len);
			break;

		case PBRW_WRITE:
			left = __copy_from_user(kern_addr, user_addr, copy_len);
			break;

		case PBRW_ZERO:
			/* Use __clear_user since we don't need
			 * to check VERIFY_WRITE, again.
			 */
			left = __clear_user(user_addr, copy_len);
			break;

		default:
			printk("page_buf: bad op code %d\n", op);
			left = copy_len;
			break;
		}

		/*
		 * Anything that landed in the page (write or zero) makes
		 * the corresponding blocks valid.
		 */
		if ((copy_len - left) && page && (op != PBRW_READ)) {
			PageBlockSetRangeValid(page,
			    offset >> iodp->io_sshift,
			    (copy_len - left) >> iodp->io_sshift);
			if (PageBlockAllValid(page)) {
				PageClearPartial(page);
				SetPageUptodate(page);
			} else {
				PageSetPartial(page);
			}
		}

		if (left) {
			/* Short copy: account only for what was moved. */
			copy_len -= left;
			iodp->io_error = -EFAULT;
		}

		/* Move to the next iov or update the offset in the current */
		if (copy_len == iov_left_len) {
			iodp->io_iovec_index++;
			iodp->io_iovec_offset = 0;
		} else {
			iodp->io_iovec_offset += copy_len;
		}

		/* Move along the total offset, length, written, ... */
		iodp->io_written += copy_len;
		iodp->io_total_count -= copy_len;
		iodp->io_offset += copy_len;
		len = len - copy_len;
		copied += copy_len;
		offset += copy_len;
		/*
		 * Keep the kernel address in step with `offset'; without
		 * this a second iovec would be served from the same page
		 * bytes as the first.
		 */
		if (kern_addr)
			kern_addr += copy_len;
	}

	if (page)
		kunmap(page);

	if (iodp->io_error)
		return iodp->io_error;
	else
		return copied;
}

/*
 *	_pagebuf_iomove_apply
 *
 *	Per-segment callback used with pagebuf_segment_apply.  `parm' is the
 *	pb_io_desc_t for the transfer; the segment is handed to
 *	_pagebuf_handle_iovecs, which walks the iovecs (possibly only
 *	partially) until the segment or the iovec list is exhausted.  The
 *	direction comes from the descriptor's io_dir.  `pb' is not used.
 *
 *	Returns whatever _pagebuf_handle_iovecs returns.
 */
int
_pagebuf_iomove_apply(
    void *parm,
    page_buf_t * pb,
    loff_t pb_off,
    struct page *page,
    size_t page_off,
    size_t seg_len)
{
	pb_io_desc_t *desc;
	int rval;

	desc = (pb_io_desc_t *) parm;
	rval = _pagebuf_handle_iovecs(desc, page, page_off,
			pb_off, seg_len, desc->io_dir);

	return rval;
}


/*
 *	pagebuf_iomove
 *
 *	Intended to copy data between a buffer and the user storage
 *	described by the iovecs/niovecs arguments, letting
 *	pagebuf_segment_apply and an actor do the real copies, and to
 *	return 0 on success or an error code on failure.
 *
 *	Not implemented yet: the call unconditionally returns -EFAULT.
 */

int
pagebuf_iomove(
	page_buf_t	*pb,		/* buffer to move		*/
	loff_t		boff,		/* offset in buffer		*/
	size_t		bsize,		/* size of data to move		*/
	struct iovec	**iovecs,	/* user address list		*/
	int		niovecs)	/* number of entries in iovecs	*/
{
	/* XXX: not implemented */
	return -EFAULT;
}


/*
 *	pagebuf_iozero
 *
 *	Zero the byte range [boff, boff + bsize) of the buffer.  Each
 *	segment returned by pagebuf_segment is cleared, and its page is
 *	marked all-valid, not partial and up to date.  Afterwards the
 *	read/write and invalid/partial/none state flags of the buffer are
 *	cleared.
 *
 *	Returns 0 on success, -ENOENT if the range extends past
 *	pb_buffer_length, or -ENOMEM if pagebuf_segment reports a failure
 *	(a missing page is not allocated here).
 *
 *	NOTE(review): the loop relies on pagebuf_segment advancing the
 *	offset passed to it -- confirm against its definition.
 */

int
pagebuf_iozero(
	page_buf_t	*pb,		/* buffer to zero		*/
	loff_t		boff,		/* offset in buffer		*/
	size_t		bsize)		/* size of data to zero		*/
{
	loff_t		cur_off = boff;		/* walks the range	*/
	loff_t		end_off = boff + bsize;	/* one past the range	*/
	size_t		seg_off;		/* offset within page	*/
	size_t		seg_len;		/* bytes in segment	*/
	mem_map_t	*seg_page;

	/* check range */
	if (end_off > pb->pb_buffer_length)
		return -ENOENT;

	for (; cur_off < end_off; ) {
		if (pagebuf_segment(pb, &cur_off, &seg_page,
				    &seg_off, &seg_len, 0)) {
			/* XXX allocate missing page */
			return -ENOMEM;
		}
		assert(((seg_len + seg_off) <= PAGE_SIZE));
		memset((void *) (page_address(seg_page) + seg_off), 0, seg_len);
		PageBlockSetAllValid(seg_page);
		PageClearPartial(seg_page);
		SetPageUptodate(seg_page);
	}

	pb->pb_flags &= ~(PBF_READ | PBF_WRITE |
			  _PBF_SOME_INVALID_PAGES | PBF_PARTIAL | PBF_NONE);

	return 0;
}


/* 
 *	Reading and writing files
 */


/*
 *	_pb_buffered_read
 *
 *	Build a pagebuf over pb_size bytes of the file starting at
 *	rounded_offset, using the disk location from the mapping `mp', read
 *	it in if it is not already fully valid, and copy the data out to
 *	the user through the descriptor `rdp'.
 *
 *	Returns pb_size on success.  On failure returns 0 with
 *	rdp->io_error set (-ENOMEM if no pagebuf could be obtained, or the
 *	error from _pagebuf_lookup_pages).  The pagebuf is released on
 *	every path that obtained one.
 */
size_t
_pb_buffered_read(
	struct inode	*inode,
	loff_t		rounded_offset,
	size_t		pb_size,
	page_buf_bmap_t	*mp,
	pb_io_desc_t	*rdp)
{
	page_buf_t	*pb;

	_pagebuf_get_object(inode, rounded_offset, pb_size, PBF_READ, &pb);
	if (!pb) {
		rdp->io_error = -ENOMEM;
		return 0;
	}
	/* Disk block = start of the mapping plus our distance into it. */
	pb->pb_bn = mp->pbm_bn + (mp->pbm_delta >> PB_SECTOR_BITS(pb));
	rdp->io_error = _pagebuf_lookup_pages(pb, rounded_offset, pb_size,
					_PBF_ENTER_PAGES, mp);

	if (rdp->io_error) {
		printk("_pagebuf_lookup_pages failed (error %d) mp 0x%p\n",
				rdp->io_error, mp);
		/* Drop our hold, as the success path does, so pb is not leaked. */
		pagebuf_rele(pb);
		return 0;
	}
	/* Start a readahead here, if needed */

	/*
	 * If the pagebuf is all valid, just copy out with no I/O.
	 * Otherwise, get the pagebuf read in, then copy.
	 */

	if (PBF_NOT_DONE(pb)) {
		if (mp->pbm_flags & PBMF_DELAY) {
			printk("pb 0x%p not done on delalloc read mp 0x%p\n",
				pb, mp);
		}
		pagebuf_iostart(pb, PBF_READ);
	}
	pagebuf_segment_apply
	    (_pagebuf_iomove_apply, rdp, pb, 0);
	pagebuf_rele(pb);

	return pb_size;
}

/*
 *	_pb_direct_read
 *
 *	O_DIRECT read of pb_size bytes at rounded_offset.  A pagebuf is set
 *	up whose memory is a kiobuf mapped over the user's buffer (the
 *	current position in the current iovec of `rdp'), so the device
 *	transfer lands directly in user memory.  The descriptor is then
 *	advanced by pb_size.
 *
 *	Returns pb_size on success.  On failure returns 0 with
 *	rdp->io_error set: -ENOMEM if no pagebuf could be obtained, or the
 *	error from alloc_kiovec or map_user_kiobuf.
 */
size_t
_pb_direct_read(
	struct inode	*inode,
	loff_t		rounded_offset,
	size_t		pb_size,
	page_buf_bmap_t	*mp,
	pb_io_desc_t	*rdp)
{
	page_buf_t	*pb;
	struct kiobuf	*kp;
	int		rval;
	void		*user_addr;
	struct iovec	*iovecp;

	_pagebuf_get_object(inode, rounded_offset, pb_size,
					PBF_READ | PBF_FORCEIO, &pb);
	if (!pb) {
		rdp->io_error = -ENOMEM;
		return 0;
	}
	/* Disk block = start of the mapping plus our distance into it. */
	pb->pb_bn = mp->pbm_bn + (mp->pbm_delta >> PB_SECTOR_BITS(pb));
	PBP(pb)->pb_mem_single = NULL;
	rval = alloc_kiovec(1, &PBP(pb)->pb_mem_single);
	if (rval) {
		pagebuf_rele(pb);
		rdp->io_error = rval;
		return 0;
	}
	pb->pb_mem.pba_kiovec = &PBP(pb)->pb_mem_single;
	pb->pb_mem.pba_kiocnt = 1;
	pb->pb_mem.pba_direction = PBRW_MODIFY;
	kp = PBP(pb)->pb_mem_single;
	iovecp = rdp->io_iovec[rdp->io_iovec_index];
	user_addr = iovecp->iov_base + rdp->io_iovec_offset;
	rval = map_user_kiobuf(READ, kp, (unsigned long) user_addr, pb_size);
	if (rval) {
		/*
		 * The user buffer could not be mapped: do not start I/O on
		 * an unmapped kiobuf and do not report the bytes as read.
		 * Tear down in the same order as the success path.
		 */
		free_kiovec(1, &kp);
		pagebuf_rele(pb);
		rdp->io_error = rval;
		return 0;
	}
	pagebuf_iostart(pb, PBF_READ | PBF_FORCEIO);
	unmap_kiobuf(kp);
	free_kiovec(1, &kp);

	rdp->io_written += pb_size;
	rdp->io_total_count -= pb_size;
	rdp->io_offset += pb_size;
	rdp->io_iovec_offset += pb_size;

	pagebuf_rele(pb);

	return pb_size;
}

/*
 *	pagebuf_file_read
 *
 *	pagebuf_file_read reads data from the specified file
 *	at the loff_t referenced, updating the loff_t to point after the
 *	data read and updating "rdp" to contain any errors and the bytes
 *	read.
 *
 *	This routine assumes that it is optimal to build page_buf_t's and
 *	that the user is doing larger I/Os.
 */

void pagebuf_file_read(struct file *filp, void * desc)
{
	pb_io_desc_t *rdp = (pb_io_desc_t *)desc;
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	page_buf_bmap_t maps[PBF_MAX_MAPS], *mp;
	int maps_returned, map_entry;
	unsigned long chunksize, map_size, size, readahead;
	int	direct = filp->f_flags & O_DIRECT;
	unsigned int	rounding, mask;
	size_t rounded_isize;

	rounding = direct ? inode->i_sb->s_blocksize : PAGE_CACHE_SIZE;
	mask = ~(rounding - 1);

	/*
	 * while we have data to do, get a bunch of mapping for this
	 * file to blocks on disk (or in delalloc or holes or ...).
	 *  For each map entry,
	 *      get a pagebuf. Note that pagebuf's are limited in size
	 *              so we need to loop again for each chunksize
	 *              within the mapping the file system returned.
	 */

	rounded_isize = (inode->i_size + rounding - 1) & mask;

	while (rdp->io_total_count && !rdp->io_error &&
	    rdp->io_offset < inode->i_size) {
		loff_t rounded_offset;
		size_t rounded_size;

		/*
		 * Let's start by calling bmap.
		 * Back up the offset to start at a page and increase to size
		 * to make it cover an entire page (plus any backing up done).
		 * This will return the on disk representation
		 * (or dealalloc/holes).
		 * There isn't a page for the first part of the I/O
		 * (at least there wasn't before we were called).
		 *
		 * Once we know the on disk and/or in memory representation,
		 * we can better do the I/Os or zero out or ...
		 */

		/*
		 * Make the I/O which will fill pages,
		 * page aligned and complete pages.
		 */

		rounded_offset = rdp->io_offset & mask;
		rounded_size = rdp->io_total_count
		    + (rdp->io_offset - rounded_offset);
		rounded_size =
			(rounded_size + rounding - 1) & mask;

		/*
		 * Truncate the read at the page/block where EOF resides.
		 */
		if (rounded_offset + rounded_size > rounded_isize)
			rounded_size = rounded_isize - rounded_offset;

		rdp->io_error = inode->i_op->pagebuf_bmap(inode,
		    rounded_offset, rounded_size,
		    &maps[0], PBF_MAX_MAPS, &maps_returned, PBF_READ);

		map_entry = 0;

		while (rdp->io_total_count && map_entry < maps_returned &&
		    !rdp->io_error && rdp->io_offset < inode->i_size) {

			/*
			 * Let's look at each maps entry and decide how
			 * to handle things.
			 */

			mp = &maps[map_entry];
			readahead = filp->f_raend;

			/*
			 * First, get the size from this map entry that
			 * applies to this user's I/O. The offset of
			 * the mapping indicates how far from pbm_bn
			 * we need to go (in bytes) to find the block
			 * containing the offset we requested.
			 *
			 * Get the size of this mapping from the offset we
			 * care about to the end of the mapping. Then,
			 * reduce the size to lesser of the user's or the
			 * piece in the map.
			 */

			map_size = mp->pbm_bsize - mp->pbm_delta;
			size = min(map_size, rounded_size);

			if (mp->pbm_flags & (PBMF_HOLE|PBMF_UNWRITTEN|PBMF_NEW)) {

				if (mp->pbm_flags & PBMF_NEW) {
					printk("_pagebuf_file_read with NEW?\n");
				}

				/*
				 * Zero the user's area for the size of
				 * this mapping.
				 */

				_pagebuf_handle_iovecs(rdp, NULL, 0,
				    rounded_offset, size, PBRW_ZERO);
				rounded_offset += size;
			} else {
				size_t pb_size, pb_done;

				/*
				 * build a pagebuf covering some or all of
				 * this mapping, start and wait for the I/O,
				 * then, copyout (maybe also start a
				 * readahead?)
				 */

				chunksize = direct ? size : PBF_IO_CHUNKSIZE;
				pb_size = min(chunksize, size);
				pb_done = 0;

				while (pb_done < size) {
					if (direct) {
						pb_size = _pb_direct_read(inode,
							rounded_offset,
							pb_size, mp, rdp);
					} else {
						pb_size = _pb_buffered_read(inode,
							rounded_offset,
							pb_size, mp, rdp);
					}
					if (rdp->io_error)
						break;
					pb_done += pb_size;
					rounded_offset += pb_size;
					mp->pbm_delta += pb_size;

					/*
					 * Next size is either chunksize or what
					 * is left between pb_done and pb_size.
					 */

					pb_size =
					    min(chunksize, size - pb_done);
				}	/* End of for chunksizes to get/read/copy pbs */
			}	/* End of else we need to do I/O */
			map_entry++;
		}		/* end for all the bmaps */
	}			/* end while we have data to do, bmap */

	/* All the info for callers is in rdp including io_error/io_written */
	return;
}


/*
 *	pagebuf_generic_file_read
 *
 *	pagebuf_generic_file_read reads data from the specified file
 *	at the loff_t referenced, updating the loff_t to point after the
 *	data read and returning the count of bytes read.
 *	The data is read into the supplied buffer, up to a maximum of the
 *	specified size.
 */

/*
 * Read actor passed to do_generic_file_read.  The descriptor is really a
 * pb_io_desc_t (whose first member is the read_descriptor_t), so it is
 * cast back and `count' bytes of the page at `offset' are copied out to
 * the user's iovecs.  The descriptor's own io_offset is passed as the
 * segment's file offset.  Returns what _pagebuf_handle_iovecs returns.
 */
STATIC int
_pagebuf_read_helper(
    read_descriptor_t * desc,
    struct page *page,
    unsigned long offset,
    unsigned long count)
{
	pb_io_desc_t *iodp;
	int rval;

	iodp = (pb_io_desc_t *) desc;
	rval = _pagebuf_handle_iovecs(iodp, page, offset, iodp->io_offset,
			count, PBRW_READ);

	return rval;
}

/*
 * Entry point for read(2) on a pagebuf-backed file.
 *
 * A single-iovec I/O descriptor is built around the user's buffer.
 * Buffered reads go through do_generic_file_read with
 * _pagebuf_read_helper as the actor; O_DIRECT reads are handed to the
 * inode's pagebuf_fileread operation and require the user buffer to be
 * aligned to the file system block size.
 *
 * Returns the number of bytes read, -EFAULT if the buffer is not
 * writable, -EINVAL for a misaligned O_DIRECT buffer, or the error
 * recorded in the descriptor.  *lp is updated from the descriptor's
 * final offset (not on the -EFAULT/-EINVAL early returns).
 */
ssize_t
pagebuf_generic_file_read(
    struct file * filp,		/* file to read                 */
    char *buf,			/* buffer address               */
    size_t len,			/* size of buffer               */
    loff_t * lp)		/* file offset to use and update */
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	pb_io_desc_t iodp, *rdp = &iodp;
	struct iovec iovec, *iovp = &iovec;

	if (!access_ok(VERIFY_WRITE, buf, len)) {
		return -EFAULT;
	}

	/*
	 * Start from a fully zeroed descriptor so that fields not set
	 * explicitly below (e.g. io_sshift) never hold stack garbage.
	 */
	memset(rdp, 0, sizeof(*rdp));

	rdp->io_dir = PBRW_READ;
	rdp->io_offset = *lp;
	rdp->io_written = 0;
	rdp->io_total_count = len;
	rdp->io_iovec_nr = 1;
	rdp->io_iovec = &iovp;
	iovp->iov_base = buf;
	iovp->iov_len = len;
	rdp->io_iovec_offset = 0;
	rdp->io_iovec_index = 0;
	rdp->io_error = 0;
	rdp->io_i_size = inode->i_size;

	if ((filp->f_flags & O_DIRECT) == 0) {
		do_generic_file_read(filp, lp, (read_descriptor_t *) rdp,
		    _pagebuf_read_helper);
	} else {
		/*
		 * Test the full pointer value: casting to unsigned int
		 * would drop the upper address bits on 64-bit platforms.
		 */
		if ((unsigned long) buf & (inode->i_sb->s_blocksize - 1))
			return -EINVAL;

		inode->i_op->pagebuf_fileread(filp, rdp);
	}
	*lp = rdp->io_offset;

	if (!rdp->io_error) {
		return (rdp->io_written);
	}
	return (rdp->io_error);
}

/* Debug switch handed to dprintk() in the delayed-write helper. */
int delwri_debug = 0;

#if 0

/*
 * I/O completion callback installed by _pagebuf_start_write for
 * asynchronous writes: one fewer async write is outstanding, one more
 * write has completed, and the buffer is released.
 */
STATIC void
_pb_write_dn(
	page_buf_t	*pb)
{
	atomic_dec(&pagebuf_asyncs);
	pagebuf_dones += 1;
	pagebuf_rele(pb);
}

/* JIMJIM make this a function of the amount of memory. Or,
      let delwri take care of this functionality (not waiting). */
#define PB_MAX_AWRS		1024	/* async writes allowed in flight  */
#define PB_MAX_PROD_WRITES	8	/* above this, prod the disk queue */

/*
 * Start the write of a pagebuf.
 *
 * The write is issued asynchronously, with _pb_write_dn as completion
 * callback, unless the caller asked for a synchronous write or more than
 * PB_MAX_AWRS asynchronous writes are already outstanding; in those cases
 * it is issued without PBF_ASYNC.  The statistics counters are updated
 * accordingly, and the disk task queue is run once more than
 * PB_MAX_PROD_WRITES asynchronous writes are outstanding.
 */
STATIC void
_pagebuf_start_write(
    page_buf_t	*pb,
    int		sync)
{
	if (!sync && atomic_read(&pagebuf_asyncs) <= PB_MAX_AWRS) {
		/* mark the pb to be released later */
		pagebuf_set_iodone(pb, _pb_write_dn);
		pagebuf_iostart(pb, PBF_WRITE | PBF_ASYNC | PBF_RELEASE);
		atomic_inc(&pagebuf_asyncs);
		/* track the high-water mark of outstanding async writes */
		if (atomic_read(&pagebuf_asyncs) > pagebuf_max) {
			pagebuf_max = atomic_read(&pagebuf_asyncs);
		}
	} else {
		if (sync) {
			pagebuf_syncs++;
		} else {
			pagebuf_toomany++;
		}
		pagebuf_iostart(pb, PBF_WRITE | PBF_RELEASE);
	}

	if (atomic_read(&pagebuf_asyncs) > PB_MAX_PROD_WRITES) {
		run_task_queue(&tq_disk);
	}
}

/* Compile in the delayed-write helper for file data and its call site. */
#define FILE_DATA_DELWRI
#ifdef FILE_DATA_DELWRI

/*
 * Run-time switch: when non-zero, writes on files not opened O_SYNC are
 * routed through __pagebuf_do_delwri.
 */
int do_delwri = 1;


/*
 * Helper for file data delayed write.
 *
 * Copies the user's data into page cache pages one page at a time,
 * starting with the page at rounded_offset, for at most `size' bytes of
 * page range or until the descriptor has no bytes left.  For each page the
 * buffers are set up from the mapping `mp', the write is prepared, the
 * data is copied in and the write is committed.  The descriptor and, when
 * the file grows, inode->i_size are advanced as data is copied.
 *
 * Only a descriptor with a single iovec is supported (BUG() otherwise).
 * On failure wdp->io_error is set (-ENOMEM, the error from
 * __pb_block_prepare_write, or -EFAULT for a faulting user copy) and the
 * loop stops.
 */
void
__pagebuf_do_delwri(
	struct inode	*inode,		/* target 			     */
	pb_io_desc_t 	*wdp,		/* I/O descriptor                    */
	loff_t 		rounded_offset, /* offset in file, page aligned	     */
	unsigned long 	size,		/* size to write, constrained by wdp */
	page_buf_bmap_t *mp)		/* bmap for page		     */
{
	struct page *page;
	unsigned long done;
	int err;
	void *user_addr;	/* user addr computed from iovec(s) */
	char *kaddr;

	if (wdp->io_iovec_index != 0 || wdp->io_iovec_nr != 1)
		BUG();

	user_addr = wdp->io_iovec[wdp->io_iovec_index]->iov_base
			+ wdp->io_iovec_offset;

	dprintk(delwri_debug,
		("DELWRI: ro 0x%Lx wdpo 0x%Lx size 0x%lx base 0x%p mp 0x%p\n",
		rounded_offset, wdp->io_offset, size, user_addr, mp)); 

	for (done = 0; (done < size) && wdp->io_total_count;
		done += PAGE_CACHE_SIZE, rounded_offset += PAGE_CACHE_SIZE)
	{
		int bytes_in_page;
		int at_eof;
		unsigned long offset_in_page;

		page = grab_cache_page(inode->i_mapping, rounded_offset>> PAGE_CACHE_SHIFT);

		if (!page) {
			wdp->io_error = -ENOMEM;
			return;
		}

		if (page->buffers) {
			dprintk(delwri_debug,
				("DELWRI: page 0x%p present\n", page));
		}

		at_eof = wdp->io_offset == inode->i_size;

		/*
		 * Create the buffers now, so the block numbers
		 * can be set with the known map in hand. Avoids
		 * future bmap calls.
		 */
		_pagebuf_set_blocks_mp(inode, page, mp, 1);

		offset_in_page = wdp->io_offset & (PAGE_CACHE_SIZE - 1);
		bytes_in_page = PAGE_CACHE_SIZE - offset_in_page;
		bytes_in_page = min(bytes_in_page, wdp->io_total_count);
		bytes_in_page = min(bytes_in_page, size);

		dprintk(delwri_debug, ("DELWRI: ro 0x%Lx offinp 0x%lx"
			"size 0%lx base 0x%p bip 0x%x done 0x%lx\n",
			rounded_offset, offset_in_page, size, user_addr,
			bytes_in_page, done));

		err = __pb_block_prepare_write(inode, page,
			offset_in_page, offset_in_page + bytes_in_page, at_eof);
		
		if (err) {
			wdp->io_error = err;
			goto unlock;
		}
		/*
		 * Map the page so that every kunmap() below is balanced by
		 * a kmap(); page_address() alone takes no mapping.
		 */
		kaddr = (char *) kmap(page);
		if (copy_from_user(kaddr+offset_in_page, user_addr,
						bytes_in_page)) {
			/*
			 * copy_from_user returns the number of bytes left
			 * uncopied, not an errno; report a fault so the
			 * caller sees a real error and the loop stops.
			 */
			err = -EFAULT;
			wdp->io_error = err;
			ClearPageUptodate(page);
			kunmap(page);
			goto unlock;
		}

		__pb_block_commit_write(inode, page, 
			offset_in_page, offset_in_page + bytes_in_page);
		kunmap(page);
		wdp->io_offset += bytes_in_page;
		wdp->io_total_count -= bytes_in_page;
		wdp->io_iovec_offset += bytes_in_page;
		wdp->io_written += bytes_in_page;
		user_addr = (char *)user_addr + bytes_in_page;
		/* Extend the file if the write went past its current end. */
		if (wdp->io_offset > inode->i_size)
			inode->i_size = wdp->io_offset;

unlock:
		UnlockPage(page);
		page_cache_release(page);
		if (err < 0)
			break;
	}
}
#endif

/*
 * _pagebuf_zero_out is meant to set up zero pages over a new mapping:
 * start_off is the first file offset to zero and the end of the range is
 * the mapping's offset + size, both assumed to lie on page boundaries.
 *
 * The body is currently empty, so the call does nothing.
 */

/* Debug switch for the zero-out path; not referenced in the code visible here. */
int pbzo_debug = 0;

void
_pagebuf_zero_out(
	struct inode	*inode,		/* target			*/
	page_buf_bmap_t	*mp,		/* bmap for range		*/
	loff_t		start_off,	/* start off in file to zero	*/
	pb_io_desc_t	*wdp)		/* I/O descriptor		*/
{
	/* intentionally empty */
}



/*
 *	_pagebuf_file_write
 *
 *	_pagebuf_file_write writes data to the specified file
 *	at the loff_t referenced, updating the loff_t to point after the
 *	data read and updating "wdp" to contain any errors and the bytes
 *	written.
 *
 *	This routine assumes that it is optimal to build page_buf_t's and
 *	that the user is doing larger I/Os.
 */
void _pagebuf_file_write_old(struct file *filp, pb_io_desc_t * wdp)
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	page_buf_bmap_t maps[PBF_MAX_MAPS], *mp;
	int maps_returned, map_entry;
	unsigned long chunksize, map_size, size;
	page_buf_t *pb;
	int sync, error;

	sync = filp->f_flags & O_SYNC;

	/*
	 * while we have data to do, get a bunch of mapping for this
	 * file to blocks on disk (or in delalloc or holes or ...).
	 *  For each map entry,
	 *      get a pagebuf. Note that pagebuf's are limited in size
	 *              so we need to loop again for each chunksize
	 *              within the mapping the file system returned.
	 */

	while (wdp->io_total_count && !wdp->io_error) {
		int pb_flags;
		loff_t rounded_offset;
		size_t rounded_size;

		/*
		 * Let's start by calling bmap for the offset/len we have.
		 * This will return the on disk representation
		 * (or dealalloc/holes).
		 *
		 * Once we know the on disk and/or in memory representation,
		 * we can better do the I/Os in chunks or ...
		 */


		/*
		 * Make the I/O which will fill pages,
		 * page aligned and complete pages.
		 */

		rounded_offset = wdp->io_offset & PAGE_CACHE_MASK;
		rounded_size = wdp->io_total_count +
			(wdp->io_offset - rounded_offset);
		rounded_size =
			(rounded_size + PAGE_CACHE_SIZE - 1) & PAGE_CACHE_MASK;

		/*
		 * round the size up to some minimum value
		 * since this is allocation. 64K for now.
		 *
		 * Don't round up if we are writing within the file
		 * since this probably means seek/write/... and
		 * it isn't sequential. Leave holes.
		 */

		size = rounded_size;
		if (rounded_offset >= inode->i_size)
			size = PBF_IO_CHUNKSIZE;

		if (delay_alloc)
			pb_flags = PBF_WRITE;
		else
			pb_flags = PBF_WRITE|PBF_DIRECT;

		error = inode->i_op->pagebuf_bmap(inode, rounded_offset, size,
				&maps[0], PBF_MAX_MAPS,
				&maps_returned, pb_flags);

		if (error) {
			wdp->io_error = error;
			break;
		}

		if ((maps[0].pbm_flags & PBMF_DELAY) && delay_alloc) {

			error = inode->i_op->pagebuf_bmap(inode, rounded_offset,
					maps[0].pbm_bsize,
					maps, PBF_MAX_MAPS,
					&maps_returned,
					PBF_WRITE|PBF_FILE_ALLOCATE);

			if (error) {
				printk("pbfwa: bmap returned error %d "
				       "ro 0x%Lx size 0x%x\n",
					   error, rounded_offset,
					   maps[0].pbm_bsize);
			} else {
				dprintk(pbfw_debug,
				 ("converted bn:%d off:%d size:%d flags:%d\n",
				     maps[0].pbm_bn, maps[0].pbm_delta,
				     maps[0].pbm_bsize, maps[0].pbm_flags));
			}
		}

		/*
		 * If we allocated NEW space and the I/O didn't
		 * cover it, we need to zero it. Associate pages
		 * with this and zero them out.  This can happen
		 * since the file system can allocate more space
		 * than a page when converting a hole.
		 *
		 * Also, we could find already existing pages
		 * over this range.
		 */

		map_entry = 0;

		while (wdp->io_total_count && map_entry < maps_returned &&
		    !wdp->io_error) {

			/*
			 * Let's look at each maps entry and decide how
			 * to handle things.
			 */

			mp = &maps[map_entry];

			/*
			 * First, get the size from this map entry that
			 * applies to this user's I/O. The offset of
			 * the mapping must be less than or equal the starting
			 * offset or we would be missing a mapping.
			 * Get the size of this mapping from the offset we
			 * care about to the end of the mapping. Then,
			 * reduce the size to lesser of the user's or the
			 * piece in the map.
			 */

			map_size = mp->pbm_bsize - mp->pbm_delta;
			size = min(map_size, rounded_size);

			/*
			 * Holes mean we came to the end of the space returned
			 * from the file system. We need to go back and ask for
			 * more space.
			 */
			if (mp->pbm_flags & (PBMF_HOLE | PBMF_UNWRITTEN)) {
				break;
			} else if (mp->pbm_flags & PBMF_DELAY) {
				/* find the pages, they must be around somewhere. */
				/* Then, just copyout to the user. */
				printk("pfw: bmap returned unexpected DELAY\n");
				wdp->io_error = -EINVAL;
				return;
#ifdef FILE_DATA_DELWRI
			} else if (do_delwri && !sync) {
				/*
				 * Handle delwri before (write-through) sync.
				 */
				if (delay_alloc)
					BUG();
				__pagebuf_do_delwri(inode, wdp,
						rounded_offset,
						map_size, mp);
				rounded_offset += size;
#endif
			} else {
				size_t pb_size, pb_done;

				/*
				 * We are working with one map, here.
				 *
				 * while we have space in the map,
				 *      get a pb,
				 *      allocate pages for it,
				 *      bring in the user's data.
				 *      start I/O on it (if not delalloc).
				 */

				chunksize = PBF_IO_CHUNKSIZE;
				pb_size = min(chunksize, size);
				pb_done = 0;

				while (pb_done < pb_size) {
					int read_pb;

					read_pb = 0;
					_pagebuf_get_object(inode,
					    rounded_offset,
					    pb_size, PBF_WRITE, &pb);
					if (!pb) {
						wdp->io_error = -ENOMEM;
						return;
					}
					PB_TRACE(pb, PB_TRACE_REC(file_write), mp);
					pb->pb_bn = mp->pbm_bn +
					    (mp->pbm_delta >>
					    PB_SECTOR_BITS(pb));
					pb->pb_count_desired = pb_size;
					_pagebuf_lookup_pages(pb,
					    rounded_offset,
					    pb_size, _PBF_ENTER_PAGES, NULL);

					/* Read in any pages and/or sectors that
					   we are partially being changed
					   by the write (if not already read). */

					if (rounded_offset != wdp->io_offset) {
						read_pb = 1;
					}

					if ((pb_size & ~PAGE_CACHE_MASK) &&
					    (pb_size + rounded_offset) <
					    wdp->io_i_size) {
						read_pb = 1;
					}


					if (read_pb) {
						if (mp->pbm_flags & PBMF_NEW) {
						  /*
						   * JIMJIM reduce this to the diff
						   * between pbsize and user's sz.
						   * need to do two calls, though?
						   */
						  pagebuf_iozero(pb, 0, pb_size);
						} else if (PBF_NOT_DONE(pb)) {
							pagebuf_iostart(pb, PBF_READ);
						}
					}

					/* Copy in the user's stuff */
					pagebuf_segment_apply
					    (_pagebuf_iomove_apply, wdp, pb, 0);

					/* Write it out (or mark dirty) */
					_pagebuf_start_write(pb, sync);


					/* Check error code of pb above and bomb */

					if (wdp->io_error)
						break;

					pb_done += pb_size;
					rounded_offset += pb_size;
					mp->pbm_delta += pb_size;

					/*
					 * Next size is either chunksize or what
					 * is left between pb_done and pb_size.
					 */
					pb_size =
					    min(chunksize, size - pb_done);
				}	/* End of for chunksizes to get/read/copy pagebufs */
			}	/* End of else we need to do I/O */
			map_entry++;
		}		/* end for all the bmaps */
	}			/* end while we have data to do, bmap */

	/* All the info for callers is in wdp including io_error/io_written */
	return;
}
/*
 *	pagebuf_generic_file_write_old
 *
 *	Writes up to len bytes from the user buffer buf to the file, starting
 *	at the offset *lp.  Pages that are already in the page cache are
 *	filled one at a time via __pb_block_prepare_write() and
 *	__pb_block_commit_write(); as soon as a page is not found in the
 *	cache the remainder of the request is handed to
 *	_pagebuf_file_write_old() and the loop ends.
 *
 *	*lp is updated to the offset following the data written, except on
 *	the early failure paths (negative offset, pending f_error, offset at
 *	or beyond RLIMIT_FSIZE), which leave it untouched.  inode->i_size is
 *	extended when the write goes past it.
 *
 *	Returns the number of bytes written or, when nothing was written,
 *	the error recorded in the I/O descriptor.
 */

int pgfw_debug = 0;

ssize_t pagebuf_generic_file_write_old(
    struct file * filp,		/* file to write                */
    char *buf,			/* buffer address               */
    size_t len,			/* size of buffer               */
    loff_t * lp)		/* file offset to use and update */
{
	struct inode *inode = filp->f_dentry->d_inode;
	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page *page, **hash;
	int sshift = inode->i_sb->s_blocksize_bits;
	unsigned long index;
	int status = 0;
	pb_io_desc_t iodp, *wdp = &iodp;
	struct iovec iovec, *iovp = &iovec;
	char *kaddr;

	wdp->io_error = -EINVAL;
	wdp->io_written = 0;
	if (*lp < 0)
		goto out;

	/* Report (and clear) an error left over from an earlier operation. */
	wdp->io_error = filp->f_error;
	if (filp->f_error) {
		filp->f_error = 0;
		goto out;
	}

	wdp->io_dir = PBRW_WRITE;
	wdp->io_offset = *lp;
	wdp->io_iovec_nr = 1;
	wdp->io_iovec = &iovp;
	iovp->iov_base = buf;
	wdp->io_iovec_offset = 0;
	wdp->io_iovec_index = 0;
	wdp->io_error = 0;
	wdp->io_i_size = inode->i_size;
	wdp->io_sshift = sshift;

	/*
	 * Check if we've reached the file size limit.  A write starting at
	 * or beyond the limit fails with EFBIG; one that crosses it is
	 * truncated.  SIGXFSZ is sent in both cases.
	 */
	wdp->io_error = -EFBIG;
	if (limit != RLIM_INFINITY) {
		if (wdp->io_offset >= limit) {
			send_sig(SIGXFSZ, current, 0);
			goto out;
		}
		if (len > limit - wdp->io_offset) {
			send_sig(SIGXFSZ, current, 0);
			len = limit - wdp->io_offset;
		}
	}

	wdp->io_total_count = len;
	iovp->iov_len = len;
	wdp->io_error = 0;

	while (wdp->io_total_count && !wdp->io_error) {
		unsigned long bytes, offset;
		int	at_eof, pb_flags;
		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page. If the write is sufficiently big,
		 * use pagebufs with possible delayed allocation.
		 */
		offset = (wdp->io_offset & ~PAGE_CACHE_MASK);
		index = wdp->io_offset >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > wdp->io_total_count)
			bytes = wdp->io_total_count;

		if (&inode->i_data != inode->i_mapping)
			BUG();
		hash = page_hash(&inode->i_data, index);
		page = __find_lock_page(&inode->i_data, index, hash);

		if (!page) {
			/* Not cached: the pagebuf path handles the rest. */
			_pagebuf_file_write_old(filp, wdp);
			if (wdp->io_offset > inode->i_size)
				inode->i_size = wdp->io_offset;
			break;
		}

		/*
		 * Do the real work.. If the writer ends up delaying the write,
		 * the writer needs to increment the page use counts until he
		 * is done with the page.
		 */

		if (!PageLocked(page)) {
			PAGE_BUG(page);
		}

		if (delay_alloc && test_bit(PG_delalloc, &page->flags))
			pb_flags = PBF_WRITE|PBF_FILE_ALLOCATE;
		else
			pb_flags = PBF_WRITE|PBF_DIRECT;

		status = _pagebuf_set_blocks(inode, page, pb_flags);
		if (status) {
			/*
			 * Record the failure so the loop terminates; a
			 * positive status would otherwise retry this page
			 * forever.
			 */
			wdp->io_error = status;
			goto unlock;
		}

		at_eof = wdp->io_offset == inode->i_size;

		status = __pb_block_prepare_write(inode, page,
			offset, offset + bytes, at_eof);

		if (status) {
			/*
			 * __pb_block_prepare_write() kmap()s the page before
			 * doing anything else and never unmaps it, so the
			 * mapping has to be dropped on failure as well.
			 */
			kunmap(page);
			wdp->io_error = status;
			goto unlock;
		}
		kaddr = (char*)page_address(page);
		status = copy_from_user(kaddr+offset, buf, bytes);
		if (status) {
			wdp->io_error = -EFAULT;
			ClearPageUptodate(page);
			kunmap(page);
			goto unlock;
		}

		__pb_block_commit_write(inode, page,
			offset, offset + bytes);
		kunmap(page);

		wdp->io_iovec_offset += bytes;
		wdp->io_written += bytes;
		wdp->io_total_count -= bytes;
		wdp->io_offset += bytes;
		buf += bytes;
		if (wdp->io_offset > inode->i_size)
			inode->i_size = wdp->io_offset;
		if (PageBlockAllValid(page)) {
			PageClearPartial(page);
			SetPageUptodate(page);
		} else {
			PageSetPartial(page);
		}
unlock:
		UnlockPage(page);
		page_cache_release(page);

		if (status < 0) {
			wdp->io_error = status;
			break;
		}
	}
	*lp = wdp->io_offset;

 out:
	return wdp->io_written ? wdp->io_written : wdp->io_error;
}
#endif



int debug_set_blocks = 0;

/*
 * __pb_set_blocks_mp
 *
 * Walks the buffer ring of a page starting at bh and consumes the part of
 * the extent map mp that lies beyond mp->pbm_delta, one blocksize step per
 * buffer.
 *
 * For a real extent each buffer gets its block number (derived from pbm_bn
 * and pbm_delta, scaled down by the difference between block_bits and the
 * filesystem block size bits), is flagged BH_Mapped and, if the buffers were
 * just created and the extent is PBMF_NEW, BH_New; pbm_delta is advanced.
 * For hole, unwritten and delayed-allocation extents the buffers are simply
 * stepped over.
 *
 * Returns 1 when the walk wrapped around to head (every buffer of the page
 * was visited), 0 when the map was used up first.
 *
 * NOTE(review): an identical definition of this function (and of
 * debug_set_blocks) appears a second time later in this file.
 */
int
__pb_set_blocks_mp(
	struct inode	*inode,
	struct page	*page,
	page_buf_bmap_t *mp,
	int		blocksize,
	int		block_bits,
	int		created_buffers,
	struct buffer_head *bh,
	struct buffer_head *head)
{
	size_t	remaining = mp->pbm_bsize - mp->pbm_delta;

	/* From here on block_bits is the shift from fs blocks to buffers. */
	block_bits -= inode->i_sb->s_blocksize_bits;

	while (remaining) {
		if (!(mp->pbm_flags & (PBMF_HOLE|PBMF_UNWRITTEN|PBMF_DELAY))) {
			if (mp->pbm_offset & (blocksize - 1))
				BUG();

			bh->b_blocknr = (long)mp->pbm_bn +
				(mp->pbm_delta >> inode->i_sb->s_blocksize_bits);
			bh->b_blocknr >>= block_bits;
			bh->b_state |= (1UL << BH_Mapped);

			if (created_buffers && (mp->pbm_flags & PBMF_NEW))
				bh->b_state |= (1UL << BH_New);

			mp->pbm_delta += blocksize;
		}

		/* Step to the next buffer; stop once the ring is complete. */
		bh = bh->b_this_page;
		if (bh == head)
			return 1;

		remaining -= blocksize;
	}
	return 0;
}

/*
 * _pagebuf_set_blocks_mp
 *
 * Maps the buffers of a page from the nbmaps extent maps in pbpmap (as
 * returned by pagebuf_bmap), creating empty buffers on the page first if it
 * has none.  Always returns 0.
 *
 * NOTE(review): an identical definition of this function appears a second
 * time later in this file.
 */

int
_pagebuf_set_blocks_mp(
	struct inode	*inode,
	struct page	*page,
	page_buf_bmap_t *pbpmap,
	int		nbmaps)
{
	int blocksize, block_bits, bmaps;
	struct buffer_head *head = page->buffers;
	struct buffer_head *bh = head;
	int created = 0;

	/* We can use a large buffer size:
	 * 1. there is not already more than 1 buffer head on the page
	 * 2. there is contiguous disk space over the whole page
	 * 3. this is a read which covers partially valid space
	 */

	/*
	 * The ring holds more than one buffer exactly when the first buffer
	 * does not link back to itself.  (Comparing bh with head here can
	 * never differ: both are page->buffers.)
	 */
	if (head && head->b_this_page != head) {
		blocksize = inode->i_sb->s_blocksize;
		block_bits = inode->i_sb->s_blocksize_bits;
	} else if (pbpmap->pbm_bsize >= PAGE_SIZE) {
		blocksize = PAGE_SIZE;
		block_bits = PAGE_SHIFT;
	} else {
		blocksize = inode->i_sb->s_blocksize;
		block_bits = inode->i_sb->s_blocksize_bits;
	}

	if (!bh) {
		create_empty_buffers(page, inode, blocksize);
		bh = head = page->buffers;
		created = 1;
	}
	/*
	 * We need to walk the bmaps to cover the case where this page
	 * spans extents.
	 */
	for (bmaps = 0; bmaps < nbmaps; bmaps++, pbpmap++) {
		/*
		 * Number of buffers this map covers.  Taken before the call
		 * because __pb_set_blocks_mp() advances pbm_delta.
		 */
		size_t nbufs = (pbpmap->pbm_bsize - pbpmap->pbm_delta) /
				blocksize;

		if (__pb_set_blocks_mp(inode, page, pbpmap,
			blocksize, block_bits, created, bh, head))
		{
			return 0;
		}

		/*
		 * The helper gets bh by value, so its progress is lost on
		 * return.  Skip the buffers it consumed; otherwise the next
		 * map would overwrite the mappings just set up.  A zero
		 * return means it stepped nbufs times without reaching head.
		 */
		while (nbufs--)
			bh = bh->b_this_page;
	}

	return 0;
}

STATIC int
_pagebuf_set_blocks(
	struct inode	*inode,
	struct page	*page,
	int		flags)
{
	pb_bmap_t	pbmap[PBF_MAX_MAPS];
	int		nbmaps = PBF_MAX_MAPS;
	loff_t		offset = page->index << PAGE_CACHE_SHIFT;
	struct buffer_head *head = page->buffers;
	struct buffer_head *bh = head;
	int status;

	if (bh) {
		do {
			if (!buffer_mapped(bh))
				break;
		} while ((bh = bh->b_this_page) != head);
		/*
		 * All mapped?
		 */
		if (buffer_mapped(bh))
			return 0;
		bh = head = page->buffers;
	}
	status = inode->i_op->pagebuf_bmap(inode, offset,
				PAGE_CACHE_SIZE, pbmap,
				PBF_MAX_MAPS, &nbmaps, flags);

	if ((pbmap->pbm_flags & PBMF_DELAY) && (status == 0)) {
		if (flags & PBF_READ) {
			/*
			 * Don't bother to attach buffers. Just
			 * zero page & mark delay.
			 */
			_pb_zero_out_delay(inode, page, &pbmap[0]);
			return 0;
		} else {
			printk("NO DATA! ip = 0x%p page = 0x%p, map = 0x%p\n",
				inode, page, pbmap);
			BUG();
		}
	}
	if (status == 0) {
		status = _pagebuf_set_blocks_mp(inode, page, pbmap, nbmaps);
	}

	return status;
}



/*
 * __pb_block_prepare_write
 *
 * Gets the buffers of a page ready for a write to the byte range
 * [from, to) of the page.  at_eof is non-zero when the write starts at the
 * current end of file.
 *
 * The page is kmap()ed on entry and is NOT unmapped here, on success or on
 * failure; the callers visible in this file call kunmap() themselves.
 *
 * For every buffer overlapping the range the completion handler and block
 * index are installed, the buffer is marked uptodate if the page's block
 * valid bits already cover it, freshly allocated (BH_New) buffers have the
 * bytes outside the range zeroed, and other partially covered buffers that
 * are not uptodate are read from disk.  An unmapped buffer is a BUG().
 *
 * Returns 0 on success, -EIO if one of the reads did not complete
 * successfully.
 *
 * NOTE(review): an identical definition of this function appears a second
 * time later in this file.
 */
STATIC int __pb_block_prepare_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to, int at_eof)
{
	unsigned block_start, block_end;
	unsigned long block;
	int err = 0;
	unsigned blocksize, bbits;
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
	char *kaddr = (char *)kmap(page);

	bh = head = page->buffers;

	bbits = inode->i_sb->s_blocksize_bits;
	block = 0;

	/*
	 * block_start/block_end are byte offsets within the page, block is
	 * the index of the buffer's first fs block within the page.  The
	 * "|| !block_start" term lets the first iteration run even though
	 * bh == head at that point.
	 */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block+= blocksize >> bbits, block_start=block_end,
	    bh = bh->b_this_page) {
		if (!bh)
			BUG();
		blocksize = bh->b_size;
		block_end = block_start+blocksize;
		/* Entirely before the write range: nothing to do. */
		if (block_end <= from)
			continue;
		/* Past the write range: done. */
		if (block_start >= to)
			break;
		bh->b_end_io = end_pb_buffer_io_sync;
		bh->b_dev_id = (void *) block;
		if (!buffer_mapped(bh)) {
			BUG();
		}
		if (PageBlockTestRangeValid(page, block, blocksize >> bbits))
			bh->b_state |= (1UL << BH_Uptodate);
		if (buffer_new(bh)) {

			clear_bit(BH_New, &bh->b_state);
			/*
			 * NEW buffer can still be uptodate (as checked
			 * by the above RangeValid test) if a HOLE
			 * was just converted: the allocation is new,
			 * but there already exists data covering the HOLE
			 * (usually by mmap-write).
			 */
			if (buffer_uptodate(bh))
				continue;

			/* Zero the tail of the buffer beyond the write. */
			if (block_end > to) {
				dprintk(pbpw_debug,
				     ("pbpw(%d): memset(1) bno %ld page 0x%p "
				      "index 0x%lx from 0x%p to 0x%x\n",
				        current->pid,
					bh->b_blocknr, page, page->index,
					kaddr+to, block_end-to));

				memset(kaddr+to, 0, block_end-to);
			}
			/* Zero the head of the buffer before the write. */
			if (block_start < from) {
				dprintk(pbpw_debug,
				     ("pbpw(%d): memset(2) bno %ld page 0x%p "
				      "index 0x%lx from 0x%p to 0x%x\n",
				        current->pid,
					bh->b_blocknr, page, page->index,
					kaddr+block_start, from-block_start));

				memset(kaddr+block_start, 0, from-block_start);
			}
			continue;
		}

		
	/* this is tricky!!
	 * if we are ever at the end of a file but not at the 
	 * start of a block, or a page in most cases,
	 * it means the page we were using got reclaimed by shrink_mmap.
	 * We have allocated a new page, but we must read the data before the
	 * "from" location from disk.
	 * This would also be true if appending to a file that didn't end
	 * on a page boundary.
	 */
		if ((!at_eof || (block_start < from ))
			&& !buffer_uptodate(bh) &&
		     (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		dprintk(pbpw_debug, ("pbpw: read bno %ld page 0x%p "
		      "index 0x%lx\n", bh->b_blocknr, page, page->index));
		wait_on_buffer(*--wait_bh);
		err = -EIO;
		if (!buffer_uptodate(*wait_bh))
			goto out;
	}
	
	/* this will be wrong if our page size ever changes or
	 * the default block size of a device ever doesn't match 
	 * the page size
	 */
	/* bh is the buffer the scan above stopped on. */
	if (bh->b_size == PAGE_SIZE){
		PageBlockSetAllValid(page);
	} else {
		BUG();
	}

	return 0;
out:
	return err;
}

STATIC int __pb_block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0, need_balance_dirty = 0;
	unsigned block, blocksize, bbits;
	struct buffer_head *bh, *head;

	bbits = inode->i_sb->s_blocksize_bits;

	for(bh = head = page->buffers, block_start = 0, block = 0;
	    bh != head || !block_start;
	    block_start=block_end, block += blocksize >> bbits,
	    bh = bh->b_this_page) {
		blocksize = bh->b_size;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_bit(BH_Uptodate, &bh->b_state);
			PageBlockSetRangeValid(page, block, blocksize >> bbits);

			if (!buffer_dirty(bh))
				need_balance_dirty = 1;
			__mark_buffer_dirty(bh, 0);
		}
	}

	if (need_balance_dirty)
		balance_dirty(bh->b_dev);

	/*
	 * is this a partial write that happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' wether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return 0;
}

/*---------------------------------------------------------------------------*/


/*
 * end_pg_buffer_io_async
 *
 * I/O completion handler installed by pagebuf_read_full_page() on each
 * buffer it submits.  Records the result in the buffer and in the page's
 * block valid bits (or sets the page error flag), then unlocks the buffer.
 * The completion that finds no other buffer of the page still locked with
 * this handler marks the page uptodate (unless an error was flagged) and
 * unlocks the page.
 */
STATIC void end_pg_buffer_io_async(struct buffer_head * bh, int uptodate)
{
	static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
	unsigned long flags;
	struct buffer_head *tmp;
	struct page *page;

	mark_buffer_uptodate(bh, uptodate);

	/* This is a temporary buffer used for page I/O. */
	page = bh->b_page;

	if (!uptodate){
	  printk("end_pg_buffer_io_async not uptodate %d page 0x%p\n",uptodate,page);
		SetPageError(page);
	} else {
		/* b_dev_id carries the block index stored at submit time. */
		if (bh->b_size == PAGE_SIZE)
			PageBlockSetAllValid(page);
		else
			PageBlockSetValid(page, (int) bh->b_dev_id);
	}

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 *
	 * Async buffer_heads are here only as labels for IO, and get
	 * thrown away once the IO for this page is complete.  IO is
	 * deemed complete once all buffers have been visited
	 * (b_count==0) and are now unlocked. We must make sure that
	 * only the _last_ buffer that decrements its count is the one
	 * that unlock the page..
	 */
	spin_lock_irqsave(&page_uptodate_lock, flags);
	unlock_buffer(bh);
	atomic_dec(&bh->b_count);
	/* Look for a sibling buffer whose async read is still in flight. */
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (tmp->b_end_io == end_pg_buffer_io_async &&
						buffer_locked(tmp))
			goto still_busy;
		tmp = tmp->b_this_page;
	}

	/* OK, the async IO on this page is complete. */
	spin_unlock_irqrestore(&page_uptodate_lock, flags);

	/*
	 * if none of the buffers had errors then we can set the
	 * page uptodate:
	 */
	if (!PageError(page))
		SetPageUptodate(page);

	UnlockPage(page);

	return;

still_busy:
	spin_unlock_irqrestore(&page_uptodate_lock, flags);
	return;
}

/*
 * end_pb_buffer_io_async
 *
 * Minimal I/O completion handler (installed by hook_buffers_to_page() and
 * set_buffer_dirty_uptodate()): records the I/O result in the buffer's
 * uptodate state and unlocks the buffer.  The page is not touched.
 */
STATIC void end_pb_buffer_io_async(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);
}

/*
 * end_pb_buffer_io_sync
 *
 * I/O completion handler for buffers whose submitter waits for them (the
 * prepare-write and write-full-page paths in this file install it).
 * Records the result in the buffer, flags the buffer's blocks valid in the
 * page when the I/O succeeded, unlocks the buffer and drops the
 * pagebuf_asyncs count.
 *
 * The valid bits are only set on success: after a failed read the page
 * does not hold the on-disk data, and a stale valid bit would later cause
 * the buffer to be treated as uptodate.
 */
STATIC void end_pb_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct page *page = bh->b_page;

	mark_buffer_uptodate(bh, uptodate);
	if (uptodate) {
		/* b_dev_id carries the block index stored by the submitter. */
		if (bh->b_size == PAGE_SIZE)
			PageBlockSetAllValid(page);
		else
			PageBlockSetValid(page, (int) bh->b_dev_id);
	}
	unlock_buffer(bh);
	atomic_dec(&pagebuf_asyncs);
}

int debug_set_blocks = 0;

/*
 * __pb_set_blocks_mp
 *
 * Walks the buffer ring of a page starting at bh and consumes the part of
 * the extent map mp that lies beyond mp->pbm_delta, one blocksize step per
 * buffer.
 *
 * For a real extent each buffer gets its block number (derived from pbm_bn
 * and pbm_delta, scaled down by the difference between block_bits and the
 * filesystem block size bits), is flagged BH_Mapped and, if the buffers were
 * just created and the extent is PBMF_NEW, BH_New; pbm_delta is advanced.
 * For hole, unwritten and delayed-allocation extents the buffers are simply
 * stepped over.
 *
 * Returns 1 when the walk wrapped around to head (every buffer of the page
 * was visited), 0 when the map was used up first.
 *
 * NOTE(review): this repeats a definition of this function (and of
 * debug_set_blocks) that already appears earlier in this file.
 */
int
__pb_set_blocks_mp(
	struct inode	*inode,
	struct page	*page,
	page_buf_bmap_t *mp,
	int		blocksize,
	int		block_bits,
	int		created_buffers,
	struct buffer_head *bh,
	struct buffer_head *head)
{
	size_t	remaining = mp->pbm_bsize - mp->pbm_delta;

	/* From here on block_bits is the shift from fs blocks to buffers. */
	block_bits -= inode->i_sb->s_blocksize_bits;

	while (remaining) {
		if (!(mp->pbm_flags & (PBMF_HOLE|PBMF_UNWRITTEN|PBMF_DELAY))) {
			if (mp->pbm_offset & (blocksize - 1))
				BUG();

			bh->b_blocknr = (long)mp->pbm_bn +
				(mp->pbm_delta >> inode->i_sb->s_blocksize_bits);
			bh->b_blocknr >>= block_bits;
			bh->b_state |= (1UL << BH_Mapped);

			if (created_buffers && (mp->pbm_flags & PBMF_NEW))
				bh->b_state |= (1UL << BH_New);

			mp->pbm_delta += blocksize;
		}

		/* Step to the next buffer; stop once the ring is complete. */
		bh = bh->b_this_page;
		if (bh == head)
			return 1;

		remaining -= blocksize;
	}
	return 0;
}

/*
 * _pagebuf_set_blocks_mp
 *
 * Maps the buffers of a page from the nbmaps extent maps in pbpmap (as
 * returned by pagebuf_bmap), creating empty buffers on the page first if it
 * has none.  Always returns 0.
 *
 * NOTE(review): this repeats a definition of this function that already
 * appears earlier in this file.
 */

int
_pagebuf_set_blocks_mp(
	struct inode	*inode,
	struct page	*page,
	page_buf_bmap_t *pbpmap,
	int		nbmaps)
{
	int blocksize, block_bits, bmaps;
	struct buffer_head *head = page->buffers;
	struct buffer_head *bh = head;
	int created = 0;

	/* We can use a large buffer size:
	 * 1. there is not already more than 1 buffer head on the page
	 * 2. there is contiguous disk space over the whole page
	 * 3. this is a read which covers partially valid space
	 */

	/*
	 * The ring holds more than one buffer exactly when the first buffer
	 * does not link back to itself.  (Comparing bh with head here can
	 * never differ: both are page->buffers.)
	 */
	if (head && head->b_this_page != head) {
		blocksize = inode->i_sb->s_blocksize;
		block_bits = inode->i_sb->s_blocksize_bits;
	} else if (pbpmap->pbm_bsize >= PAGE_SIZE) {
		blocksize = PAGE_SIZE;
		block_bits = PAGE_SHIFT;
	} else {
		blocksize = inode->i_sb->s_blocksize;
		block_bits = inode->i_sb->s_blocksize_bits;
	}

	if (!bh) {
		create_empty_buffers(page, inode, blocksize);
		bh = head = page->buffers;
		created = 1;
	}
	/*
	 * We need to walk the bmaps to cover the case where this page
	 * spans extents.
	 */
	for (bmaps = 0; bmaps < nbmaps; bmaps++, pbpmap++) {
		/*
		 * Number of buffers this map covers.  Taken before the call
		 * because __pb_set_blocks_mp() advances pbm_delta.
		 */
		size_t nbufs = (pbpmap->pbm_bsize - pbpmap->pbm_delta) /
				blocksize;

		if (__pb_set_blocks_mp(inode, page, pbpmap,
			blocksize, block_bits, created, bh, head))
		{
			return 0;
		}

		/*
		 * The helper gets bh by value, so its progress is lost on
		 * return.  Skip the buffers it consumed; otherwise the next
		 * map would overwrite the mappings just set up.  A zero
		 * return means it stepped nbufs times without reaching head.
		 */
		while (nbufs--)
			bh = bh->b_this_page;
	}

	return 0;
}

STATIC int
_pagebuf_set_blocks(
	struct inode	*inode,
	struct page	*page,
	int		flags)
{
	pb_bmap_t	pbmap[PBF_MAX_MAPS];
	int		nbmaps = PBF_MAX_MAPS;
	loff_t		offset = page->index << PAGE_CACHE_SHIFT;
	struct buffer_head *head = page->buffers;
	struct buffer_head *bh = head;
	int status;

	if (bh) {
		do {
			if (!buffer_mapped(bh))
				break;
		} while ((bh = bh->b_this_page) != head);
		/*
		 * All mapped?
		 */
		if (buffer_mapped(bh))
			return 0;
		bh = head = page->buffers;
	}
	status = inode->i_op->pagebuf_bmap(inode, offset,
				PAGE_CACHE_SIZE, pbmap,
				PBF_MAX_MAPS, &nbmaps, flags);

	if ((pbmap->pbm_flags & PBMF_DELAY) && (status == 0)) {
		if (flags & PBF_READ) {
			/*
			 * Don't bother to attach buffers. Just
			 * zero page & mark delay.
			 */
			_pb_zero_out_delay(inode, page, &pbmap[0]);
			return 0;
		} else {
			printk("NO DATA! ip = 0x%p page = 0x%p, map = 0x%p\n",
				inode, page, pbmap);
			BUG();
		}
	}
	if (status == 0) {
		status = _pagebuf_set_blocks_mp(inode, page, pbmap, nbmaps);
	}

	return status;
}



/*
 * __pb_block_prepare_write
 *
 * Gets the buffers of a page ready for a write to the byte range
 * [from, to) of the page.  at_eof is non-zero when the write starts at the
 * current end of file.
 *
 * The page is kmap()ed on entry and is NOT unmapped here, on success or on
 * failure; the callers visible in this file call kunmap() themselves.
 *
 * For every buffer overlapping the range the completion handler and block
 * index are installed, the buffer is marked uptodate if the page's block
 * valid bits already cover it, freshly allocated (BH_New) buffers have the
 * bytes outside the range zeroed, and other partially covered buffers that
 * are not uptodate are read from disk.  An unmapped buffer is a BUG().
 *
 * Returns 0 on success, -EIO if one of the reads did not complete
 * successfully.
 *
 * NOTE(review): this repeats a definition of this function that already
 * appears earlier in this file.
 */
STATIC int __pb_block_prepare_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to, int at_eof)
{
	unsigned block_start, block_end;
	unsigned long block;
	int err = 0;
	unsigned blocksize, bbits;
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
	char *kaddr = (char *)kmap(page);

	bh = head = page->buffers;

	bbits = inode->i_sb->s_blocksize_bits;
	block = 0;

	/*
	 * block_start/block_end are byte offsets within the page, block is
	 * the index of the buffer's first fs block within the page.  The
	 * "|| !block_start" term lets the first iteration run even though
	 * bh == head at that point.
	 */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block+= blocksize >> bbits, block_start=block_end,
	    bh = bh->b_this_page) {
		if (!bh)
			BUG();
		blocksize = bh->b_size;
		block_end = block_start+blocksize;
		/* Entirely before the write range: nothing to do. */
		if (block_end <= from)
			continue;
		/* Past the write range: done. */
		if (block_start >= to)
			break;
		bh->b_end_io = end_pb_buffer_io_sync;
		bh->b_dev_id = (void *) block;
		if (!buffer_mapped(bh)) {
			BUG();
		}
		if (PageBlockTestRangeValid(page, block, blocksize >> bbits))
			bh->b_state |= (1UL << BH_Uptodate);
		if (buffer_new(bh)) {

			clear_bit(BH_New, &bh->b_state);
			/*
			 * NEW buffer can still be uptodate (as checked
			 * by the above RangeValid test) if a HOLE
			 * was just converted: the allocation is new,
			 * but there already exists data covering the HOLE
			 * (usually by mmap-write).
			 */
			if (buffer_uptodate(bh))
				continue;

			/* Zero the tail of the buffer beyond the write. */
			if (block_end > to) {
				dprintk(pbpw_debug,
				     ("pbpw(%d): memset(1) bno %ld page 0x%p "
				      "index 0x%lx from 0x%p to 0x%x\n",
				        current->pid,
					bh->b_blocknr, page, page->index,
					kaddr+to, block_end-to));

				memset(kaddr+to, 0, block_end-to);
			}
			/* Zero the head of the buffer before the write. */
			if (block_start < from) {
				dprintk(pbpw_debug,
				     ("pbpw(%d): memset(2) bno %ld page 0x%p "
				      "index 0x%lx from 0x%p to 0x%x\n",
				        current->pid,
					bh->b_blocknr, page, page->index,
					kaddr+block_start, from-block_start));

				memset(kaddr+block_start, 0, from-block_start);
			}
			continue;
		}

		
	/* this is tricky!!
	 * if we are ever at the end of a file but not at the 
	 * start of a block, or a page in most cases,
	 * it means the page we were using got reclaimed by shrink_mmap.
	 * We have allocated a new page, but we must read the data before the
	 * "from" location from disk.
	 * This would also be true if appending to a file that didn't end
	 * on a page boundary.
	 */
		if ((!at_eof || (block_start < from ))
			&& !buffer_uptodate(bh) &&
		     (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		dprintk(pbpw_debug, ("pbpw: read bno %ld page 0x%p "
		      "index 0x%lx\n", bh->b_blocknr, page, page->index));
		wait_on_buffer(*--wait_bh);
		err = -EIO;
		if (!buffer_uptodate(*wait_bh))
			goto out;
	}
	
	/* this will be wrong if our page size ever changes or
	 * the default block size of a device ever doesn't match 
	 * the page size
	 */
	/* bh is the buffer the scan above stopped on. */
	if (bh->b_size == PAGE_SIZE){
		PageBlockSetAllValid(page);
	} else {
		BUG();
	}

	return 0;
out:
	return err;
}

STATIC int __pb_block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0, need_balance_dirty = 0;
	unsigned block, blocksize, bbits;
	struct buffer_head *bh, *head;

	bbits = inode->i_sb->s_blocksize_bits;

	for(bh = head = page->buffers, block_start = 0, block = 0;
	    bh != head || !block_start;
	    block_start=block_end, block += blocksize >> bbits,
	    bh = bh->b_this_page) {
		blocksize = bh->b_size;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_bit(BH_Uptodate, &bh->b_state);
			PageBlockSetRangeValid(page, block, blocksize >> bbits);

			if (!buffer_dirty(bh))
				need_balance_dirty = 1;
			__mark_buffer_dirty(bh, 0);
		}
	}

	if (need_balance_dirty)
		balance_dirty(bh->b_dev);

	/*
	 * is this a partial write that happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' wether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return 0;
}

/*
 *	pagebuf_read_full_page
 *
 *	Read-page entry point.  The page must be locked on entry.  The
 *	page's buffers are mapped through _pagebuf_set_blocks(); buffers
 *	that are not yet uptodate are either zero-filled (unmapped ones) or
 *	submitted for asynchronous read.  end_pg_buffer_io_async()
 *	propagates the buffer state into the page and unlocks it once the
 *	I/O has completed; if nothing had to be read the page is marked
 *	uptodate and unlocked right here.
 *
 *	Delayed-allocation pages hold the only copy of their data, so
 *	nothing is read for them.
 *
 *	Returns 0 on success or an error, in which case the page has been
 *	unlocked.
 */

int pbrfp_debug = 0;

int pagebuf_read_full_page(struct dentry *dentry, struct page *page)
{
	struct inode	*inode = dentry->d_inode;
	/* arr is sized for worst case */
	struct buffer_head *bh, *head, *arr[PAGE_CACHE_SIZE / 512];
	int blocksize, bbits = inode->i_sb->s_blocksize_bits;
	unsigned long kaddr = 0;
	int nr, i, status;

	if (!PageLocked(page) || (inode->i_op->pagebuf_bmap == NULL))
		PAGE_BUG(page);

	dprintk(pbrfp_debug,
	     ("pbrfp(%d): ip 0x%p page 0x%p offset 0x%Lx\n",
		current->pid, inode, page, (loff_t) page->index << PAGE_SHIFT));

	status = _pagebuf_set_blocks(inode, page, PBF_READ);
	if (status) {
		/*
		 * Do not leave the page locked on failure: no I/O was
		 * started, so no completion handler will ever unlock it
		 * (the delalloc error path below unlocks as well).
		 */
		UnlockPage(page);
		return status;
	}

	if (test_bit(PG_delalloc, &page->flags)) {
		if (PagePartial(page) || !PageBlockAllValid(page)) {
			printk("Dirty page 0x%p not all valid or partial\n",
				page);

			status = -EIO;
		}
		++current->maj_flt;
		UnlockPage(page);
		return status;
	}

	bh = head = page->buffers;
	nr = 0;
	i = 0;	/* index of the current buffer's first fs block in the page */

	do {
		blocksize = bh->b_size;

		if (PageBlockTestRangeValid(page, i, blocksize >> bbits))
			bh->b_state |= (1UL << BH_Uptodate);

		if (buffer_uptodate(bh))
			continue;

		if (!buffer_mapped(bh)) {
			/* No disk block behind this buffer: it reads as zeroes. */
			if (!kaddr)
				kaddr = kmap(page);
			memset((char *)(kaddr + (i << bbits)), 0, blocksize);

			set_bit(BH_Uptodate, &bh->b_state);
			PageBlockSetRangeValid(page, i, blocksize >> bbits);
			continue;
		}

		/* Queue the buffer; the extra b_count is dropped at end_io. */
		init_buffer(bh, end_pg_buffer_io_async, (void *)i);
		atomic_inc(&bh->b_count);
		arr[nr] = bh;
		nr++;
	}  while (i += blocksize >> bbits, (bh = bh->b_this_page) != head);

	++current->maj_flt;
	if (nr) {
		if (Page_Uptodate(page))
			BUG();
		ll_rw_block(READ, nr, arr);
	} else {
		/*
		 * all buffers are uptodate - we can set the page
		 * uptodate as well.
		 */
		SetPageUptodate(page);
		UnlockPage(page);
	}
	if (kaddr)
		kunmap(page);
	return 0;
}

/*
 * __pb_block_write_full_page
 *
 * Marks every buffer of a (locked) page valid, uptodate and dirty so that
 * the whole page gets written back, installs the synchronous completion
 * handler on each buffer and finally marks the page uptodate.
 * Always returns 0.
 */
STATIC int __pb_block_write_full_page(struct inode *inode, struct page *page)
{
	int i, bbits = inode->i_sb->s_blocksize_bits;
	struct buffer_head *bh, *head;
	int need_balance_dirty = 0;

	if (!PageLocked(page))
		BUG();

	bh = head = page->buffers;
	i = 0;	/* index of the current buffer's first fs block in the page */
	do {
		if (!bh)
			BUG();

		PageBlockSetRangeValid(page, i, bh->b_size >> bbits);

		bh->b_end_io = end_pb_buffer_io_sync;
		bh->b_dev_id = (void *) i;
		set_bit(BH_Uptodate, &bh->b_state);
		if (!buffer_dirty(bh))
			need_balance_dirty = 1;
		__mark_buffer_dirty(bh,0);

		/*
		 * Advance the block index by the size of the buffer just
		 * handled, before moving on to the next one.
		 */
		i += bh->b_size >> bbits;
		bh = bh->b_this_page;
	} while (bh != head);

	if (need_balance_dirty)
		balance_dirty(bh->b_dev);

	SetPageUptodate(page);
	return 0;
}



int pbwfp_debug = 0;

/*
 *	pagebuf_write_full_page
 *
 *	Write-page entry point.  Maps the page's buffers (converting a
 *	delayed allocation if the page is flagged PG_delalloc) and marks
 *	them dirty.  Pages entirely below i_size are handled by
 *	__pb_block_write_full_page(); for the page straddling i_size only
 *	the part below i_size is committed and the rest of the page is
 *	zeroed.
 *
 *	Returns 0 on success, -EIO for a page entirely beyond i_size, or
 *	the error from mapping/preparing the page.
 */

int pagebuf_write_full_page(struct dentry *dentry, struct page *page)
{
	struct inode *inode = dentry->d_inode;
	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;
	int err, pb_flags;

	dprintk(pbwfp_debug,
	     ("pbwfp: ip 0x%p(is 0x%Lx) page 0x%p offset 0x%Lx\n",
		inode, inode->i_size, page, (loff_t) page->index << PAGE_SHIFT));

	if (test_bit(PG_delalloc, &page->flags))
		pb_flags = PBF_WRITE|PBF_FILE_ALLOCATE;
	else
		pb_flags = PBF_WRITE|PBF_DIRECT;

	err = _pagebuf_set_blocks(inode, page, pb_flags);
	if (err) {
		/*
		 * The page has not been kmap()ed at this point, so it must
		 * not be kunmap()ed either.
		 */
		ClearPageUptodate(page);
		return err;
	}

	/* easy case */
	if (page->index < end_index)
		return __pb_block_write_full_page(inode, page);

	/* things got complicated... */
	offset = inode->i_size & (PAGE_CACHE_SIZE-1);
	/* OK, are we completely out? */
	if (page->index >= end_index+1 || !offset)
		return -EIO;

	/* __pb_block_prepare_write() kmap()s the page, even when it fails. */
	err = __pb_block_prepare_write(inode, page, 0, offset, 1);
	if (err) {
		ClearPageUptodate(page);
	} else {
		memset((char *)page_address(page)+offset, 0, PAGE_CACHE_SIZE-offset);
		__pb_block_commit_write(inode, page, 0, offset);
	}
	kunmap(page);
	return err;
}


/*
 * hook_buffers_to_page
 *
 * Points the first buffer of a page at the disk block described by the
 * extent map mp (pbm_bn scaled to buffers of 1 << bshift bytes, plus the
 * offset pbm_delta into the extent), creating PAGE_SIZE buffers on the page
 * if it has none.  The buffer is flagged mapped and gets
 * end_pb_buffer_io_async as completion handler.  A negative pbm_bn is a
 * BUG().
 */
void
hook_buffers_to_page(struct inode *inode,
	struct page *page, page_buf_bmap_t *mp, ulong bshift)
{
	struct buffer_head 	*first;
	page_buf_daddr_t	blkno;

	if (mp->pbm_bn < 0) {
		printk("hook_buffers_to_page: bad bn page 0x%p mp 0x%p\n",
			page, mp);
		BUG();
	}

	/* Extent start in buffer-sized units, then the offset into it. */
	blkno = mp->pbm_bn >>
		(bshift - inode->i_sb->s_blocksize_bits);
	blkno += (mp->pbm_delta >> bshift);

	first = page->buffers;
	if (!first) {
		create_empty_buffers(page, inode, PAGE_SIZE);
		first = page->buffers;
	}

	first->b_blocknr = blkno;
	first->b_state |= (1UL << BH_Mapped);
	first->b_end_io = end_pb_buffer_io_async;
	first->b_dev_id = (void *) 0;

	dprintk(pbpw_debug,
		("pbpw: created buffers page 0x%p index 0x%lx bno 0x%lx\n",
			page, page->index, first->b_blocknr));
}

int pbwcm_debug = 0;

/*
 * __pb_write_or_convert_bmap
 *
 * Looks up the extent backing [offset, offset + size) for writing and
 * stores it in maps[0].  If the extent turns out to be a delayed
 * allocation, a second lookup with PBF_FILE_ALLOCATE is issued over the
 * size of that extent to convert it.
 *
 * Returns 0 on success or the error from pagebuf_bmap.
 */
int
__pb_write_or_convert_bmap(struct inode *inode, loff_t offset, size_t size,
	page_buf_bmap_t *maps)
{
	page_buf_bmap_t *first = &maps[0];
	int nmaps;
	int error;

	error = inode->i_op->pagebuf_bmap(inode, offset, size,
			maps, 1, &nmaps, PBF_WRITE);

	/* Lookup failed, or the extent needs no conversion. */
	if (error != 0 || !(first->pbm_flags & PBMF_DELAY))
		return error;

	error = inode->i_op->pagebuf_bmap(inode, offset,
			first->pbm_bsize, maps, 1,
			&nmaps, PBF_WRITE|PBF_FILE_ALLOCATE);
	if (!error) {
		dprintk(pbwcm_debug,
		 ("converted bn:%d off:%Ld size:%d flags:%d\n",
		     first->pbm_bn, first->pbm_offset,
		     first->pbm_bsize, first->pbm_flags));
		return error;
	}

	printk("pbwcm: bmap error %d ro 0x%Lx size 0x%x\n",
		   error, offset, first->pbm_bsize);
	return error;
}


/*
 * __pb_block_prepare_write_async
 *
 * Gets a page ready for a write to its byte range [from, to) when the page
 * is backed by (at most) a single PAGE_SIZE buffer.
 *
 *   - If the page has no mapped buffer and is not delayed-allocate, the
 *     backing extent is taken from mp, or looked up when mp is NULL, and a
 *     buffer is hooked up if the extent has a block number greater than 0.
 *   - A caller-supplied mp is advanced by one page (pbm_delta).
 *   - Full-page writes and pages that are already valid or
 *     delayed-allocate need nothing more.
 *   - Otherwise the bytes outside [from, to) are either zeroed (no mapped
 *     buffer at EOF, or a delayed/new/unwritten extent) or read from disk.
 *
 * Returns 0 on success, -EIO on a failed lookup or read or when the page
 * carries more than one buffer.
 *
 * NOTE(review): the zeroing path kmap()s the page and nothing in this
 * function unmaps it; confirm that the caller balances this.
 */
STATIC int
__pb_block_prepare_write_async(struct inode *inode, struct page *page,
		unsigned from, unsigned to, int at_eof, page_buf_bmap_t *mp)
{
	struct buffer_head 	*bh;
	int 			err = 0;
	int			nmaps;
	page_buf_bmap_t		map;

	dprintk(pbpw_debug,
		("pbpw: page 0x%p inode 0x%p index 0x%lx from 0x%lx to 0x%lx\n",
		page, inode, page->index,
		page_address(page) + from, page_address(page) + to));

	/*
	 * Create & map buffer.
	 */
	bh = page->buffers;
	if ((!bh || !buffer_mapped(bh)) &&
	    (!test_bit(PG_delalloc, &(page)->flags))) {
		if (!mp) {
			mp = &map;
			/*
			 * Widen the index before shifting; shifting the
			 * unsigned long index overflows on 32-bit for
			 * offsets of 4GB and beyond.
			 */
			err = inode->i_op->pagebuf_bmap(inode,
				(loff_t)page->index << PAGE_SHIFT, PAGE_SIZE,
				mp, 1, &nmaps, PBF_WRITE);
			if (err < 0) {
				printk("Read on Write: bmap failed err %d mp 0x%p\n",
					err, mp);
				err = -EIO;
				goto out;
			}
		}
		if (mp->pbm_bn > 0) {
			hook_buffers_to_page(inode, page, mp, PAGE_SHIFT);
			bh = page->buffers;
		}
	}

	/* Advance through the extent no matter what */
	if (mp)
		mp->pbm_delta += PAGE_SIZE;

	/*
	 * Is the write over the entire page?
	 */
	if (from == 0 && to == PAGE_SIZE) {
		dprintk(pbpw_debug, ("pbpw: full page write\n"));
		goto out;
	}

	/*
	 * Partial write. Is the page valid anyway?
	 */
	if (PageBlockAllValid(page) || test_bit(PG_delalloc, &(page)->flags)) {
		dprintk(pbpw_debug, ("pbpw: page all valid\n"));
		goto out;
	}

	if ((at_eof && !(bh && buffer_mapped(bh))) ||
	    (mp && (mp->pbm_flags & (PBMF_DELAY|PBMF_NEW|PBMF_UNWRITTEN)))) {

		char *kaddr = (char *)kmap(page);

		dprintk(pbpw_debug, ("pbpw: new mapping\n"));
		/*
		 * Zero the parts of the page not covered by this I/O
		 */
		if (PAGE_SIZE > to) {
			dprintk(pbpw_debug,
			     ("pbpw: memset(1) page 0x%p "
			      "index 0x%lx from 0x%p to 0x%p\n",
				page, page->index, kaddr+to, kaddr+PAGE_SIZE));

			memset(kaddr+to, 0, PAGE_SIZE-to);
		}
		if (0 < from) {

			dprintk(pbpw_debug,
			     ("pbpw: memset(2) page 0x%p "
			      "index 0x%lx from 0x%p to 0x%p\n",
				page, page->index, kaddr, kaddr+from));

			memset(kaddr, 0, from);
		}
		goto out;
	}

	/* From here on the old contents must be read, which needs a buffer. */
	if (!bh) {
		printk("page 0x%p inode 0x%p mp 0x%p\n",
			page, inode, mp);
		BUG();
	}

	/*
	 * Ensure only one block allocated.
	 */
	if (bh != bh->b_this_page) {
		printk("bh 0x%p != bh->b_this_page 0x%p\n",bh,bh->b_this_page);
		err = -EIO;
		goto out;
	}
	bh->b_end_io = end_pb_buffer_io_sync;
	bh->b_dev_id = 0;

	dprintk(pbpw_debug,
		("pbpw: reading page 0x%p index 0x%lx bno 0x%lx endio 0x%p\n",
			page, page->index, bh->b_blocknr,
			bh->b_end_io));

	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh))
		err = -EIO;
out:
	return err;
}

/* Gate passed to dprintk() for the commit-write trace messages. */
int pbcw_debug = 0;
/*
 * Set to 1 by page_cleaner_daemon() when it resumes after its sleep and
 * back to 0 when it finishes a conversion pass; the commit-write path
 * skips waking the daemon while this is non-zero.
 */
static int pcd_active;
/*
 * Delalloc page watermark.  This initial value is replaced with
 * max_mapnr >> 4 by _page_cleaner_daemon_start().
 */
int PB_MAX_DIRTY_PAGES = 1024;
/*
 * Once pb_delalloc_pages exceeds PB_MAX_DIRTY_FACTOR * PB_MAX_DIRTY_PAGES,
 * __pb_block_commit_write_async() converts the page immediately instead
 * of leaving it marked delalloc.
 */
int PB_MAX_DIRTY_FACTOR = 4;

/*
 * set_buffer_dirty_uptodate
 *
 *	Flag a buffer head as up to date and dirty.  If the buffer was not
 *	already dirty, its completion handler is switched to
 *	end_pb_buffer_io_async and balance_dirty() is called for the
 *	buffer's device once the buffer has been marked dirty.
 *
 *	A warning is printed (but nothing else changes) when the buffer's
 *	block number fails the "> 0" sanity check.
 */
void
set_buffer_dirty_uptodate(struct buffer_head *bh)
{
	int	was_clean;

	if (bh->b_blocknr <= 0)
		printk("Warning: buffer 0x%p with weird blockno (%ld)\n",
			bh, bh->b_blocknr);

	set_bit(BH_Uptodate, &bh->b_state);

	/* Sample the dirty state before __mark_buffer_dirty() changes it. */
	was_clean = !buffer_dirty(bh);
	if (was_clean)
		bh->b_end_io = end_pb_buffer_io_async;

	__mark_buffer_dirty(bh, 0);

	/* Only a clean -> dirty transition triggers balancing. */
	if (!was_clean)
		return;

	balance_dirty(bh->b_dev);
}

/*
 * Gate passed to dprintk() for the "mapped buffer ... is delalloc"
 * message in __pb_block_commit_write_async().  Note it defaults to 1.
 */
int pbcw_debug2 = 1;

/*
 * __pb_block_commit_write_async
 *
 *	Finish a write into `page' after the data has been copied in.
 *	The page is marked fully valid and up to date, then one of:
 *
 *	  - the page already has a mapped buffer: the buffer is marked
 *	    dirty and up to date;
 *	  - the page was already flagged PG_delalloc: nothing more to do;
 *	  - otherwise the page is newly flagged PG_delalloc and counted in
 *	    pb_delalloc_pages, waking the page cleaner if the count is past
 *	    the watermarks -- unless the count is already above
 *	    PB_MAX_DIRTY_FACTOR * PB_MAX_DIRTY_PAGES, in which case the
 *	    flag is cleared again and the page is converted right away via
 *	    __pb_write_or_convert_bmap() (BUG() if that fails).
 *
 *	`mp' is only used on the immediate-conversion path.
 */
STATIC void
__pb_block_commit_write_async(struct inode 	*inode,
				struct page 	*page,
				page_buf_bmap_t *mp)
{
	struct buffer_head	*bh;
	unsigned int		ndelalloc;

	/*
	 * Prepare write already read in or zeroed whatever part of the
	 * page lies outside from/to, so the whole page is valid now.
	 */
	PageBlockSetAllValid(page);
	PageClearPartial(page);
	SetPageUptodate(page);

	bh = page->buffers;
	if (bh && buffer_mapped(bh)) {
		/* Space is already allocated: just dirty the buffer. */
		if (test_bit(PG_delalloc, &page->flags)) {
			dprintk(pbcw_debug2, ("mapped buffer 0x%p page 0x%p is delalloc\n", bh, page));
		}
		set_buffer_dirty_uptodate(page->buffers);
		dprintk(pbcw_debug, ("pbcw: refiled valid buffer 0x%p\n",
			page->buffers));
		return;
	}

	/* Already a delalloc page -- it has been accounted for before. */
	if (test_and_set_bit(PG_delalloc, &page->flags) != 0)
		return;

	dprintk(pbcw_debug, ("Marking page 0x%p delalloc\n", page));
	ndelalloc = atomic_read(&pb_delalloc_pages);

	if (ndelalloc > PB_MAX_DIRTY_FACTOR * PB_MAX_DIRTY_PAGES) {
		/* Too many delalloc pages outstanding: convert this one now. */
		clear_bit(PG_delalloc, &page->flags);
		if (__pb_write_or_convert_bmap(inode,
				page->index << PAGE_SHIFT, PAGE_SIZE, mp)) {
			BUG();
		}
		hook_buffers_to_page(inode, page, mp, PAGE_SHIFT);
		set_buffer_dirty_uptodate(page->buffers);
		return;
	}

	atomic_inc(&pb_delalloc_pages);

	/* Nudge the cleaner only if it is not already running a pass. */
	if (!pcd_active) {
		if (ndelalloc > 2 * PB_MAX_DIRTY_PAGES)
			wake_up_interruptible_sync(&pcd_waitq);
		else if (ndelalloc > PB_MAX_DIRTY_PAGES)
			wake_up_interruptible(&pcd_waitq);
	}
	balance_dirty(inode->i_rdev);
}

/*
 * __pagebuf_do_delwri_async
 *
 *	Copy user data into the page cache one page at a time, running each
 *	page through __pb_block_prepare_write_async() before the copy and
 *	__pb_block_commit_write_async() after it.  The loop ends once `size'
 *	bytes worth of pages (counted in PAGE_CACHE_SIZE steps) have been
 *	visited, `len' reaches zero, or an error occurs.
 *
 *	*lp supplies the starting file offset and is updated to the offset
 *	just past the last byte copied.  inode->i_size is grown whenever the
 *	copy extends the file.
 *
 *	Returns the number of bytes copied if any were copied; otherwise 0,
 *	or a negative errno (-ENOMEM when no page could be grabbed, -EFAULT
 *	when the user copy faulted, or the error from prepare_write).
 */
int
__pagebuf_do_delwri_async(
	struct inode	*inode,		/* target 			     */
	loff_t 		rounded_offset, /* offset in file, page aligned	     */
	unsigned long 	size,		/* size to write, constrained by wdp */
	char		*user_addr,
	size_t		len,
	loff_t		*lp,
	page_buf_bmap_t *mp)		/* bmap for page		     */
{
	struct page *page;
	unsigned long done;
	int err = 0, written = 0;
	loff_t foff = *lp;
	char *kaddr;

	dprintk(delwri_debug,
		("DELWRI: ro 0x%Lx foff 0x%Lx size 0x%lx base 0x%p mp 0x%p\n",
		rounded_offset, foff, size, user_addr, mp)); 

	for (done = 0; (done < size) && len;
		done += PAGE_CACHE_SIZE, rounded_offset += PAGE_CACHE_SIZE)
	{
		int bytes_in_page;
		int at_eof;
		unsigned long offset_in_page;

		/* Returns the page locked and referenced; both dropped below. */
		page = grab_cache_page(inode->i_mapping,
				rounded_offset >> PAGE_CACHE_SHIFT);

		if (!page) {
			err = -ENOMEM;
			break;
		}

		if (page->buffers) {
			dprintk(delwri_debug,
				("DELWRI: page 0x%p present\n", page));
		}

		at_eof = foff == inode->i_size;

		/* Clamp this step to the rest of the page and of the request. */
		offset_in_page = foff & (PAGE_CACHE_SIZE - 1);
		bytes_in_page = PAGE_CACHE_SIZE - offset_in_page;
		bytes_in_page = min(bytes_in_page, len);
		bytes_in_page = min(bytes_in_page, size);

		dprintk(delwri_debug, ("DELWRI: ro 0x%Lx offinp 0x%lx"
			" size 0x%lx base 0x%p bip 0x%x done 0x%lx\n",
			rounded_offset, offset_in_page, size, user_addr,
			bytes_in_page, done));

		err = __pb_block_prepare_write_async(inode, page,
			offset_in_page, offset_in_page + bytes_in_page,
			at_eof, mp);
		
		if (err)
			goto unlock;

		/*
		 * NOTE(review): no kmap() is visible in this function; the
		 * kunmap() calls below presumably pair with a kmap() done
		 * inside prepare_write -- confirm that every prepare_write
		 * path maps the page.
		 */
		kaddr = (char*)page_address(page);
		if (copy_from_user(kaddr+offset_in_page, user_addr,
						bytes_in_page)) {
			/*
			 * copy_from_user() reports the number of bytes it
			 * could NOT copy, i.e. a positive count.  Turn that
			 * into a real errno so the loop terminates and the
			 * count is never mistaken for bytes written.
			 */
			err = -EFAULT;
			ClearPageUptodate(page);
			kunmap(page);
			goto unlock;
		}

		__pb_block_commit_write_async(inode, page, mp);
		kunmap(page);

		foff += bytes_in_page;
		len -= bytes_in_page;
		written += bytes_in_page;
		user_addr += bytes_in_page;

		if (foff > inode->i_size)
			inode->i_size = foff;

unlock:
		UnlockPage(page);
		page_cache_release(page);
		if (err < 0)
			break;
	}
	*lp = foff;
	return written ? written : err;
}

/*
 * _pagebuf_file_write
 *
 *	Write `len' bytes from `buf' at file offset *lp.  For each pass the
 *	file system's pagebuf_bmap() is asked for one mapping covering the
 *	page-aligned range, and the part of the request that fits in that
 *	mapping is pushed through __pagebuf_do_delwri_async().
 *
 *	*lp is updated to the offset reached.  Returns the number of bytes
 *	written if any were written; otherwise the last status (0 or an
 *	error from bmap / the delwri helper).  A short write is logged.
 */
int
_pagebuf_file_write(
	struct file * filp,	/* file to write                */
	char *buf,		/* buffer address               */
	size_t len,		/* size of buffer               */
	loff_t * lp)		/* file offset to use and update */
{
	struct inode *inode = filp->f_dentry->d_inode;
	page_buf_bmap_t map;
	int maps_returned;
	unsigned long map_size, size;
	int status = 0, written = 0, pb_flags, ilen = len;
	loff_t foff = *lp;

	pb_flags = PBF_WRITE;
	if (filp->f_flags & O_SYNC)
		pb_flags |= PBF_SYNC;

	/*
	 * while we have data to do, get a bunch of mapping for this
	 * file to blocks on disk (or in delalloc or holes or ...).
	 *  For each map entry,
	 *      get a pagebuf. Note that pagebuf's are limited in size
	 *              so we need to loop again for each chunksize
	 *              within the mapping the file system returned.
	 */

	while (len) {

		loff_t rounded_offset;
		size_t rounded_size;

		/*
		 * Let's start by calling bmap for the offset/len we have.
		 * This will return the on disk representation
		 * (or delalloc/holes).
		 *
		 * Once we know the on disk and/or in memory representation,
		 * we can better do the I/Os in chunks or ...
		 */

		/*
		 * Make the I/O which will fill pages,
		 * page aligned and complete pages.
		 *
		 * Round down by subtracting the in-page bits rather than
		 * and-ing with PAGE_CACHE_MASK: that mask is an unsigned
		 * long, so on 32-bit it would also clear the upper half of
		 * the 64-bit offset.
		 */

		rounded_offset = foff - (foff & (PAGE_CACHE_SIZE - 1));
		rounded_size = len + (foff - rounded_offset);
		rounded_size =
			(rounded_size + PAGE_CACHE_SIZE - 1) & PAGE_CACHE_MASK;

		/*
		 * round the size up to some minimum value
		 * since this is allocation. 64K for now.
		 *
		 * Don't round up if we are writing within the file
		 * since this probably means seek/write/... and
		 * it isn't sequential. Leave holes.
		 */

		size = rounded_size;
		if (rounded_offset >= inode->i_size)
			size = PBF_IO_CHUNKSIZE;

		status = inode->i_op->pagebuf_bmap(inode, rounded_offset, size,
				&map, 1, &maps_returned, pb_flags);

		if (status) {
			printk("pbfwa: delay bmap returned error %d ro 0x%Lx size 0x%lx\n",
			status, rounded_offset, size);
			break;
		}

		dprintk(pbpw_debug,
		       ("DELAY returned bn:%d off:%d size:%d flags:%d\n",
			map.pbm_bn, map.pbm_delta, map.pbm_bsize,
			map.pbm_flags));

		/*
		 * Get the size of this mapping from the offset we
		 * care about to the end of the mapping. Then,
		 * reduce the size to lesser of the user's or the
		 * piece in the map.
		 */

		map_size = map.pbm_bsize - map.pbm_delta;
		size = min(map_size, rounded_size);

		/*
		 * Holes mean we came to the end of the space returned
		 * from the file system. We need to go back and ask for
		 * more space.
		 */
		if (map.pbm_flags & PBMF_HOLE) {
			printk("pbfwa: HOLE ro 0x%Lx size 0x%lx mp 0x%p\n",
				rounded_offset, size, &map);
			break;
		}
		/*
		 * Handle delwri before (write-through) sync.
		 */
		status = __pagebuf_do_delwri_async(inode,
				rounded_offset, size, buf,
				len, &foff, &map);
		if (status <= 0)
			break;
		written += status;
		buf += status;
		len -= status;
	}
	*lp = foff;
	if (written != ilen) {
		printk("PBFWA: short write written %d ilen %d status %d\n",
				written, ilen, status);
	}
	return written ? written : status;
}



/*
 * pagebuf_generic_file_write
 *
 *	Write entry point.  Pages that are already in the page cache are
 *	updated in place, one page per loop iteration.  As soon as a page
 *	is found missing from the cache, the remainder of the request is
 *	handed to _pagebuf_file_write() and the loop ends.
 *
 *	The request is refused with -EINVAL for a negative offset, with a
 *	pending filp->f_error (which is then cleared), or with -EFBIG plus
 *	SIGXFSZ when the offset is at or past RLIMIT_FSIZE; a request that
 *	merely crosses the limit is truncated to it (SIGXFSZ is still sent).
 *
 *	*lp is updated to the offset reached.  Returns the number of bytes
 *	written if any were written, otherwise the last status.  A short
 *	write is logged.
 */
ssize_t
pagebuf_generic_file_write(
    struct file * filp,		/* file to write                */
    char *buf,			/* buffer address               */
    size_t len,			/* size of buffer               */
    loff_t * lp)		/* file offset to use and update */
{
	struct inode *inode = filp->f_dentry->d_inode;
	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page *page, **hash;
	unsigned long index;
	int status = 0, written = 0, ilen = len;
	char *kaddr;
	loff_t foff;
	page_buf_bmap_t map;

	foff = *lp;
	if (foff < 0) {
		/*
		 * A negative offset is an invalid request; report it
		 * instead of returning 0 ("nothing written, no error").
		 */
		status = -EINVAL;
		goto out;
	}

	if ((status = filp->f_error) != 0) {
		filp->f_error = 0;
		goto out;
	}

	/*
	 * Check if we've reached the file size limit.
	 */
	if (limit != RLIM_INFINITY) {
		if (foff >= limit) {
			send_sig(SIGXFSZ, current, 0);
			status = -EFBIG;
			goto out;
		}
		if (len > limit - foff) {
			send_sig(SIGXFSZ, current, 0);
			len = limit - foff;
		}
	}

	while (len) {
		unsigned long bytes, offset;
		int	at_eof;
		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page. If the write is sufficiently big,
		 * use pagebufs with possible delayed allocation.
		 */
		offset = (foff & ~PAGE_CACHE_MASK);
		index = foff >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > len)
			bytes = len;

		if (&inode->i_data != inode->i_mapping)
			BUG();

		hash = page_hash(&inode->i_data, index);
		page = __find_lock_page(&inode->i_data, index, hash);

		if (!page) {
			/*
			 * Not cached: let the bmap-driven path take the
			 * whole remainder of the request.
			 */
			status = _pagebuf_file_write(filp,
					buf, len, &foff);
			if (status > 0)
				written += status;

			if (foff > inode->i_size)
				inode->i_size = foff;
			break;
		}

		/*
		 * Do the real work.. If the writer ends up delaying the write,
		 * the writer needs to increment the page use counts until he
		 * is done with the page.
		 */

		if (!PageLocked(page)) {
			PAGE_BUG(page);
		}

		at_eof = foff == inode->i_size;

		status = __pb_block_prepare_write_async(inode, page,
			offset, offset + bytes, at_eof, NULL);
		
		if (status)
			goto unlock;
		kaddr = (char *)page_address(page);
		status = copy_from_user(kaddr+offset, buf, bytes);
		if (status) {
			/* non-zero means some bytes were not copied */
			status = -EFAULT;
			ClearPageUptodate(page);
			kunmap(page);
			goto unlock;
		}

		/*
		 * NOTE(review): `map' has not been filled in at this point;
		 * commit_write only touches it on its immediate-conversion
		 * path, where it is handed to __pb_write_or_convert_bmap()
		 * first -- presumably that call populates it; confirm.
		 */
		__pb_block_commit_write_async(inode, page, &map);
		kunmap(page);

		len -= bytes;
		buf += bytes;
		foff += bytes;
		written += bytes;

		if (foff > inode->i_size)
			inode->i_size = foff;

		if (PageBlockAllValid(page)) {
			PageClearPartial(page);
			SetPageUptodate(page);
		} else {
			PageSetPartial(page);
		}
unlock:
		UnlockPage(page);
		page_cache_release(page);

		if (status < 0)
			break;
	}
	*lp = foff;

out:
	if (written != ilen) {
		printk("PBGFWA: short write written %d ilen %d status %d\n",
				written, ilen, status);
	}
	return written ? written : status;
}

/* Debug knob for the page cleaner daemon. */
int pcd_debug = 0;
/*
 * Pages page_cleaner_daemon() skipped because TryLockPage() failed;
 * zeroed at the top of every pass of the daemon's outer loop.
 */
int pcd_skip_locked = 0;
/*
 * Times pb_delalloc_convert() got EAGAIN back from the bmap call;
 * zeroed at the top of every pass of the daemon's outer loop.
 */
int pcd_ilock_failed = 0;
/*
 * Set to 1 by _page_cleaner_daemon_start() and cleared by the daemon
 * itself just before it exits.
 */
static int page_cleaner_daemon_started = 0;
/* Set by _page_cleaner_daemon_stop() to ask the daemon to exit. */
static int daemon_terminate = 0;

/*
 * Look up the page at `index' in the inode's mapping and claim it if it
 * is a delalloc page.
 *
 * On success the page's PG_delalloc bit has been cleared and the page is
 * returned still locked and still holding the reference taken by the
 * lookup.  If there is no such page, or the page was not delalloc, the
 * lock and reference are dropped again and NULL is returned.
 */
STATIC struct page *
probe_page(struct inode *inode, unsigned long index)
{
	struct page *page = __find_lock_page(inode->i_mapping, index,
					page_hash(inode->i_mapping, index));

	if (page && !test_and_clear_bit(PG_delalloc, &(page)->flags)) {
		/* Cached, but not ours to convert. */
		UnlockPage(page);
		put_page(page);
		page = NULL;
	}
	return page;
}

/*
 * Attach and dirty buffers for a (former) delalloc page that lies inside
 * the extent described by mp, then release the page.
 *
 * mp->pbm_delta is rewritten to the page's byte offset relative to
 * mp->pbm_offset before the buffers are hooked up.  The page is unlocked,
 * one reference is dropped and pb_delalloc_pages is decremented, so the
 * caller must pass in a locked, referenced page (as probe_page() returns).
 *
 * NOTE(review): page->index is shifted before any widening; if pbm_offset
 * is a 64-bit quantity this may truncate on 32-bit -- confirm.
 */
STATIC void
convert_page(struct inode *inode, struct page *page, page_buf_bmap_t *mp)
{
	struct buffer_head *bh;

	/* Position of this page within the extent. */
	mp->pbm_delta = (page->index << PAGE_SHIFT) - mp->pbm_offset;

	hook_buffers_to_page(inode, page, mp, PAGE_SHIFT);
	bh = page->buffers;
	set_buffer_dirty_uptodate(bh);

	/* Done with the page: drop lock, reference and delalloc count. */
	UnlockPage(page);
	put_page(page);
	atomic_dec(&pb_delalloc_pages);
}

/*
 * Convert the delalloc pages that neighbour page `index' and fall inside
 * the extent described by mp.
 *
 * Pages are walked downwards from index-1 and then upwards from index+1.
 * Each walk stops at the extent boundary or at the first page that
 * probe_page() does not hand back (not cached, or not delalloc).  The
 * page at `index' itself is not touched here.
 *
 * Note the downward walk's bound is strict, so a page that starts
 * exactly at mp->pbm_offset is not visited by it.
 */
STATIC void
cluster_write(struct inode *inode,
	      unsigned long index,
	      page_buf_bmap_t *mp)
{
	unsigned long tindex;
	struct page *page;

	/* Walk backwards towards the start of the extent. */
	if (index != 0) {
		tindex = index - 1;
		while (mp->pbm_offset < (tindex << PAGE_SHIFT)) {
			page = probe_page(inode, tindex);
			if (!page)
				break;
			convert_page(inode, page, mp);
			tindex--;
		}
	}

	/* Walk forwards towards the end of the extent. */
	tindex = index + 1;
	while (mp->pbm_offset + mp->pbm_bsize > (tindex << PAGE_SHIFT)) {
		page = probe_page(inode, tindex);
		if (!page)
			break;
		convert_page(inode, page, mp);
		tindex++;
	}
}

/*
 * pb_delalloc_convert
 *
 *	Allocate real space for one delalloc page: ask the file system's
 *	pagebuf_bmap() (with PBF_WRITE | PBF_FILE_ALLOCATE | flags) for the
 *	extent under the page, hook buffers to the page, mark them dirty
 *	and decrement pb_delalloc_pages.  When `cluster' is non-zero the
 *	neighbouring delalloc pages in the same extent are converted too.
 *
 *	The page must be locked on entry; it is unlocked on every return
 *	path.  If bmap reports EAGAIN the page's PG_delalloc bit is set
 *	again so that it is retried later.  A successful bmap that returns
 *	a hole is treated as fatal (BUG()).
 *
 *	Returns 0 on success (and also when the returned mapping is not
 *	page aligned, in which case nothing is converted), otherwise the
 *	error from pagebuf_bmap().
 */
STATIC int
pb_delalloc_convert(
	mem_map_t *mm,		/* delalloc page to convert - locked */
	u_long	flags,		/* flags to pass to bmap call */
	int	cluster)	/* can we cluster conversion? */ 
{
	page_buf_bmap_t maps[PBF_MAX_MAPS];
	struct inode *inode;
	int maps_returned, error;
	u_long pb_flags;
	loff_t rounded_offset;

	/*
	 * Widen the index before shifting so the byte offset does not
	 * wrap at the width of the index on 32-bit machines.
	 */
	rounded_offset = (loff_t)mm->index << PAGE_SHIFT;
	inode = (struct inode *)mm->mapping->host;

	pb_flags = PBF_WRITE | PBF_FILE_ALLOCATE | flags;
	error = inode->i_op->pagebuf_bmap(inode, rounded_offset, PAGE_SIZE,
			&maps[0], PBF_MAX_MAPS, &maps_returned, pb_flags);

	if ((error == 0) && (maps[0].pbm_flags & PBMF_HOLE)) {
		printk("delalloc page 0x%p with no extent\n", mm);
		BUG();
	}

	if (error) {
		/*
		 * NOTE(review): the sign convention of pagebuf_bmap()
		 * errors is not visible here (another caller in this file
		 * tests "err < 0"), so accept either sign.  Missing the
		 * EAGAIN case would leave the page without PG_delalloc
		 * while it is still counted in pb_delalloc_pages.
		 */
		if (error == EAGAIN || error == -EAGAIN) {
			pcd_ilock_failed++;
			set_bit(PG_delalloc, &mm->flags);
		} else {
			printk("PCD: pagebuf_bmap error %d pb_flags 0x%lx\n",
				error, pb_flags);
		}
		UnlockPage(mm);
		return error;
	}
	if (maps[0].pbm_delta % PAGE_SIZE) {
		printk("PCD: pbm_delta not page aligned mp 0x%p\n", &maps[0]);
		UnlockPage(mm);
		return 0;
	}
	hook_buffers_to_page(inode, mm, &maps[0], PAGE_SHIFT);
	set_buffer_dirty_uptodate(mm->buffers);

	UnlockPage(mm);
	atomic_dec(&pb_delalloc_pages);

	/*
	 * Don't do clustering (involving looking up page cache)
	 * if called from flush (which already had page_cache lock).
	 */
	if (cluster)
		cluster_write(inode, mm->index, &maps[0]);

	return 0;
}

/* Second debug knob for the page cleaner daemon. */
int pcd_debug2 = 0;
/*
 * Longest time page_cleaner_daemon() sleeps between passes; converted
 * to jiffies as (pcd_period * HZ) / 1000 at the sleep call.
 */
int pcd_period = 500; /* milli-seconds */

/*
 * page_cleaner_daemon
 *
 *	Body of the page cleaner kernel thread.  Each pass sweeps mem_map
 *	(wrapping around at the end) for pages flagged PG_delalloc and
 *	converts them, continuing for as long as pb_delalloc_pages is above
 *	PB_MIN_DIRTY_PAGES.  It then sleeps on pcd_waitq for at most
 *	pcd_period milliseconds before the next pass.
 *
 *	The thread exits, after clearing page_cleaner_daemon_started and
 *	waking pcd_waitq, once daemon_terminate is seen set at the end of
 *	a pass.  `data' is unused.  Always returns 0.
 */
STATIC int
page_cleaner_daemon(void *data)
{
	mem_map_t *mm = &mem_map[0], *mmlast = &mem_map[max_mapnr];
	u_long flags;
	struct buffer_head *bh;
	/* Low watermark as configured at thread start; restored after a
	 * timeout-driven pass has temporarily forced it to 0. */
	int 	pb_min_save = PB_MIN_DIRTY_PAGES;

	/*  Set up the thread  */
	daemonize();

	/* Discard pending signals and block all of them. */
	spin_lock_irqsave(&current->sigmask_lock, flags);	
	flush_signals(current);
	sigfillset(&current->blocked);
	recalc_sigpending(current);
	spin_unlock_irqrestore(&current->sigmask_lock, flags);

	sprintf(current->comm, "page_daemon");

	/*
	 * If we need more memory to do bmap,
	 * indicate this thread might really need it.
	 */
	current->flags |= PF_MEMALLOC;

	/* Start one entry before mem_map[0]; the sweep pre-increments. */
	mm = &mem_map[0] - 1;
	while (1) {
		/*
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */

		/* Per-pass statistics. */
		pcd_skip_locked = 0;
		pcd_ilock_failed = 0;
		while (atomic_read(&pb_delalloc_pages) > PB_MIN_DIRTY_PAGES) {
			if (current->need_resched)
				schedule();

			/* Advance to the next page frame, wrapping around. */
			if (++mm >= mmlast)
				mm = &mem_map[0];
			/* Unlocked peek; re-checked under the page lock below. */
			if (!test_bit(PG_delalloc, &(mm)->flags))
				continue;
			if (TryLockPage(mm)) {
				/* Someone else holds the page lock: skip it. */
				pcd_skip_locked++;
				continue;
			}
			/* Claim the page by clearing the flag under the lock. */
			if (!test_and_clear_bit(PG_delalloc, &(mm)->flags)) {
				printk("PCD: page 0x%p delalloc bit unset\n",
						mm);
				UnlockPage(mm);
				continue;
			}

			bh = mm->buffers;
			if (bh && buffer_mapped(bh)) {
				/*
				 * delalloc page has buffers refile it.
				 */

				set_buffer_dirty_uptodate(bh);

				UnlockPage(mm);
				atomic_dec(&pb_delalloc_pages);
				continue;
			}

/*---------------- DELALLOC CONVERT --------------------------------*/
/* since bmap can block, this should be in a different daemon       */
/*---------------- DELALLOC CONVERT --------------------------------*/

			/*
			 * Unlocks the page on every path.  The return value
			 * is ignored; on EAGAIN pb_delalloc_convert() sets
			 * PG_delalloc again, so the page is found again on a
			 * later sweep.
			 */
			pb_delalloc_convert(mm, PBF_BMAP_TRY_ILOCK, 1);

		}
		/* Pass finished: writers may wake us again. */
		pcd_active = 0;

		if (daemon_terminate) {
			/* Tell _page_cleaner_daemon_stop() we are gone. */
			page_cleaner_daemon_started = 0;
			wake_up_interruptible(&pcd_waitq);
			break;
		}

		/*
		 * if woken up periodically (nothing else to do)
		 * convert all the pages, else convert only
		 * to keep watermarks happy.
		 */
		if (interruptible_sleep_on_timeout(&pcd_waitq,
				(pcd_period*HZ)/1000) == 0)
		{
			PB_MIN_DIRTY_PAGES = 0;
		} else
			PB_MIN_DIRTY_PAGES = pb_min_save;
		pcd_active = 1;
	}
	return 0;
}

/*
 * _page_cleaner_daemon_start
 *
 *	Start the page cleaner kernel thread unless it is already marked
 *	as started, after sizing the delalloc watermarks from max_mapnr.
 *
 *	Returns 0 on success (including "already started") and -1 if the
 *	thread could not be created.
 */
int
_page_cleaner_daemon_start(void)
{
	if (page_cleaner_daemon_started)
		return 0; /* success: nothing to do */

	page_cleaner_daemon_started = 1;

	/*
	 * watermarks: at 1/16 of total mem start waking
	 * the daemon to convert ... at 1/8th kick the
	 * daemon synchronously ... at 1/4th stop generating
	 * any more delay pages. Low water before daemon
	 * normally stops is 1/4th of when the daemon is
	 * activated.
	 */
	PB_MAX_DIRTY_PAGES = max_mapnr >> 4;
	PB_MIN_DIRTY_PAGES = PB_MAX_DIRTY_PAGES >> 2;

	if (0 > kernel_thread(page_cleaner_daemon, (void *)0,0))
	{
		printk("Can't start page cleaner daemon\n");
		/*
		 * No thread exists, so do not leave the flag set:
		 * otherwise later start calls would silently succeed and
		 * _page_cleaner_daemon_stop() would wait forever for a
		 * daemon that never ran.
		 */
		page_cleaner_daemon_started = 0;
		return -1; /* error */
	}
	return 0; /* success */
}

/*
 * _page_cleaner_daemon_stop
 *
 *	Ask the page cleaner thread to exit and wait until it has cleared
 *	page_cleaner_daemon_started.  Returns immediately (apart from the
 *	wakeup) if the daemon is not marked as started.
 */
void
_page_cleaner_daemon_stop(void)
{
	daemon_terminate = 1;
	wake_up_interruptible_sync(&pcd_waitq);
	while (page_cleaner_daemon_started)
		interruptible_sleep_on(&pcd_waitq);

	/*
	 * Re-arm for a later _page_cleaner_daemon_start(); with the
	 * request left set, a newly started daemon would exit again at
	 * the end of its first pass.
	 */
	daemon_terminate = 0;
}


/*
 *	Module management
 *
 *	Symbols exported for use outside this file: read-ahead, flush and
 *	invalidate, hole/zero helpers, and the file and full-page
 *	read/write entry points.
 */

EXPORT_SYMBOL(pagebuf_readahead);
EXPORT_SYMBOL(pagebuf_flush);
EXPORT_SYMBOL(pagebuf_inval);
EXPORT_SYMBOL(pagebuf_flushinval);
EXPORT_SYMBOL(pagebuf_sethole);
EXPORT_SYMBOL(pagebuf_iozero);
EXPORT_SYMBOL(pagebuf_file_read);
EXPORT_SYMBOL(pagebuf_generic_file_read);
EXPORT_SYMBOL(pagebuf_generic_file_write);
EXPORT_SYMBOL(pagebuf_read_full_page);
EXPORT_SYMBOL(pagebuf_write_full_page);

