[PATCH] writeback from address spaces

author Andrew Morton <akpm@zip.com.au>

Tue, 30 Apr 2002 06:52:10 +0000 (23:52 -0700)

committer Linus Torvalds <torvalds@home.transmeta.com>

Tue, 30 Apr 2002 06:52:10 +0000 (23:52 -0700)
author Andrew Morton <akpm@zip.com.au>
Tue, 30 Apr 2002 06:52:10 +0000 (23:52 -0700)
committer Linus Torvalds <torvalds@home.transmeta.com>
Tue, 30 Apr 2002 06:52:10 +0000 (23:52 -0700)
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c

index 3aab4502e1354e5abe9b2b59bef25026c89ae7af..0f785a9a2cb85808001f1bcb3762a532c2c9b82c 100644 (file)
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -1409,6 +1409,11 @@ int submit_bh(int rw, struct buffer_head * bh)
         BUG_ON(!buffer_mapped(bh));
         BUG_ON(!bh->b_end_io);
  
+       if ((rw == READ || rw == READA) && buffer_uptodate(bh))
+               printk("%s: read of uptodate buffer\n", __FUNCTION__);
+       if (rw == WRITE && !buffer_uptodate(bh))
+               printk("%s: write of non-uptodate buffer\n", __FUNCTION__);
+               
         set_bit(BH_Req, &bh->b_state);
  
         /*
@@ -1465,6 +1470,7 @@ int submit_bh(int rw, struct buffer_head * bh)
   *  a multiple of the current approved size for the device.
   *
   **/
+
  void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
  {
         unsigned int major;
@@ -1513,7 +1519,6 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
                         if (!atomic_set_buffer_clean(bh))
                                 /* Hmmph! Nothing to write */
                                 goto end_io;
-                       __mark_buffer_clean(bh);
                         break;
  
                 case READA:
diff --git a/drivers/block/loop.c b/drivers/block/loop.c

index b2b9d7411dfbff31df5723fc76f7d639f70dab14..d3127589d57de8c900e43f0d0c0c1c0f3f5a4012 100644 (file)
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -548,8 +548,6 @@ static int loop_thread(void *data)
         atomic_inc(&lo->lo_pending);
         spin_unlock_irq(&lo->lo_lock);
  
-       current->flags |= PF_NOIO;
-
         /*
          * up sem, we are running
          */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index 4ca0d90aaa24e566cd5ada9281b47f6c7fa9b515..1ea9f0762bae4e1b7623f6253280d57e7b006654 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -474,7 +474,6 @@ static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
  
         bh->b_state     = (1 << BH_Req) | (1 << BH_Mapped);
         bh->b_size      = sh->size;
-       bh->b_list      = BUF_LOCKED;
         return bh;
  }
  
diff --git a/fs/Makefile b/fs/Makefile

index f129943df16fe8221caf23e9669481afb698f5aa..2449b05e367ab5e5c7da8fe34bf140889b300262 100644 (file)
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,7 +14,8 @@ obj-y :=      open.o read_write.o devices.o file_table.o buffer.o \
                 bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \
                 namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
                 dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
-               filesystems.o namespace.o seq_file.o xattr.o libfs.o
+               filesystems.o namespace.o seq_file.o xattr.o libfs.o \
+               fs-writeback.o
  
  ifneq ($(CONFIG_NFSD),n)
  ifneq ($(CONFIG_NFSD),)
diff --git a/fs/block_dev.c b/fs/block_dev.c

index 8fb546dad90f424e8aa929a438a967580365a5c3..2a566c1385ca586232d4ff23b2551f152e692ba7 100644 (file)
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -180,7 +180,9 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
         return retval;
  }
         
-
+/*
+ * AKPM: fixme.  unneeded stuff here.
+ */
  static int __block_fsync(struct inode * inode)
  {
         int ret, err;
@@ -759,6 +761,8 @@ struct address_space_operations def_blk_aops = {
         sync_page: block_sync_page,
         prepare_write: blkdev_prepare_write,
         commit_write: blkdev_commit_write,
+       writeback_mapping: generic_writeback_mapping,
+       vm_writeback: generic_vm_writeback,
         direct_IO: blkdev_direct_IO,
  };
  
diff --git a/fs/buffer.c b/fs/buffer.c

index b0f99601d6845501db99c56cf6b2378a770af511..13529fa2be2e8592b256e36186897d2ea7b5b3ef 100644 (file)
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1,57 +1,36 @@
  /*
   *  linux/fs/buffer.c
   *
- *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   */
  
  /*
- *  'buffer.c' implements the buffer-cache functions. Race-conditions have
- * been avoided by NEVER letting an interrupt change a buffer (except for the
- * data, of course), but instead letting the caller do it.
- */
-
-/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
-
-/* Removed a lot of unnecessary code and simplified things now that
+ * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
+ *
+ * Removed a lot of unnecessary code and simplified things now that
   * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
+ *
+ * Speed up hash, lru, and free list operations.  Use gfp() for allocating
+ * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
+ *
+ * Added 32k buffer block sizes - these are required by older ARM systems. - RMK
+ *
+ * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
   */
  
-/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
- * hash table, use SLAB cache for buffer heads. -DaveM
- */
-
-/* Added 32k buffer block sizes - these are required older ARM systems.
- * - RMK
- */
-
-/* Thread it... -DaveM */
-
-/* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
-
  #include <linux/config.h>
-#include <linux/time.h>
  #include <linux/fs.h>
+#include <linux/mm.h>
  #include <linux/slab.h>
  #include <linux/locks.h>
-#include <linux/errno.h>
-#include <linux/swap.h>
-#include <linux/swapctl.h>
  #include <linux/smp_lock.h>
-#include <linux/vmalloc.h>
  #include <linux/blkdev.h>
-#include <linux/sysrq.h>
  #include <linux/file.h>
-#include <linux/init.h>
  #include <linux/quotaops.h>
  #include <linux/iobuf.h>
-#include <linux/highmem.h>
  #include <linux/module.h>
-#include <linux/compiler.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
+#include <linux/writeback.h>
  #include <asm/bitops.h>
-#include <asm/mmu_context.h>
  
  #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
  #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
@@ -59,84 +38,110 @@
                                              number of unused buffer heads */
  
  /* Anti-deadlock ordering:
- *     lru_list_lock > hash_table_lock > unused_list_lock
+ *     i_bufferlist_lock > unused_list_lock
   */
  
  #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
  
  /*
- * Hash table gook..
+ * A local cache of buffer_heads is maintained at unused_list.
+ * Free buffers are chained through their b_private field.
   */
-static unsigned int bh_hash_mask;
-static unsigned int bh_hash_shift;
-static struct buffer_head **hash_table;
-static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
-
-static struct buffer_head *lru_list[NR_LIST];
-static spinlock_t lru_list_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-static int nr_buffers_type[NR_LIST];
-static unsigned long size_buffers_type[NR_LIST];
-
-static struct buffer_head * unused_list;
+static struct buffer_head *unused_list;
  static int nr_unused_buffer_heads;
  static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
  static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
  
-static int grow_buffers(struct block_device *bdev, unsigned long block, int size);
-static void __refile_buffer(struct buffer_head *);
-
  /* This is used by some architectures to estimate available memory. */
  atomic_t buffermem_pages = ATOMIC_INIT(0);
  
-/* Here is the parameter block for the bdflush process. If you add or
- * remove any of the parameters, make sure to update kernel/sysctl.c
- * and the documentation at linux/Documentation/sysctl/vm.txt.
+/*
+ * Several of these buffer list functions are exported to filesystems,
+ * so we do funny things with the spinlocking to support those
+ * filesystems while still using inode->i_bufferlist_lock for
+ * most applications.
+ * FIXME: put a spinlock in the reiserfs journal and kill this lock.
   */
+static spinlock_t global_bufferlist_lock = SPIN_LOCK_UNLOCKED;
  
-#define N_PARAM 9
-
-/* The dummy values in this structure are left in there for compatibility
- * with old programs that play with the /proc entries.
+/*
+ * Debug/devel support stuff
   */
-union bdflush_param {
-       struct {
-               int nfract;     /* Percentage of buffer cache dirty to 
-                                  activate bdflush */
-               int dummy1;     /* old "ndirty" */
-               int dummy2;     /* old "nrefill" */
-               int dummy3;     /* unused */
-               int interval;   /* jiffies delay between kupdate flushes */
-               int age_buffer; /* Time for normal buffer to age before we flush it */
-               int nfract_sync;/* Percentage of buffer cache dirty to 
-                                  activate bdflush synchronously */
-               int dummy4;     /* unused */
-               int dummy5;     /* unused */
-       } b_un;
-       unsigned int data[N_PARAM];
-} bdf_prm = {{40, 0, 0, 0, 5*HZ, 30*HZ, 60, 0, 0}};
-
-/* These are the min and max parameter values that we will allow to be assigned */
-int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   0, 0, 0};
-int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0};
+
+void __buffer_error(char *file, int line)
+{
+       static int enough;
+
+       if (enough > 10)
+               return;
+       enough++;
+       printk("buffer layer error at %s:%d\n", file, line);
+#ifdef CONFIG_X86
+       printk("Pass this trace through ksymoops for reporting\n");
+       {
+               extern void show_stack(long *esp);
+               show_stack(0);
+       }
+#endif
+}
+EXPORT_SYMBOL(__buffer_error);
+
+inline void
+init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+{
+       bh->b_end_io = handler;
+       bh->b_private = private;
+}
  
  void unlock_buffer(struct buffer_head *bh)
  {
-       clear_bit(BH_Wait_IO, &bh->b_state);
-       clear_bit(BH_launder, &bh->b_state);
+       /*
+        * unlock_buffer against a zero-count bh is a bug, if the page
+        * is not locked.  Because then nothing protects the buffer's
+        * waitqueue, which is used here. (Well.  Other locked buffers
+        * against the page will pin it.  But complain anyway).
+        */
+       if (atomic_read(&bh->b_count) == 0 && !PageLocked(bh->b_page))
+               buffer_error();
+
         clear_bit(BH_Lock, &bh->b_state);
         smp_mb__after_clear_bit();
         if (waitqueue_active(&bh->b_wait))
                 wake_up(&bh->b_wait);
  }
  
+static inline void
+__set_page_buffers(struct page *page, struct buffer_head *head)
+{
+       struct inode *inode = page->mapping->host;
+
+       if (inode && S_ISBLK(inode->i_mode))
+               atomic_inc(&buffermem_pages);
+       if (page_has_buffers(page))
+               buffer_error();
+       set_page_buffers(page, head);
+       page_cache_get(page);
+}
+
+static inline void
+__clear_page_buffers(struct page *page)
+{
+       struct address_space *mapping = page->mapping;
+
+       if (mapping) {
+               struct inode *inode = mapping->host;
+
+               if (S_ISBLK(inode->i_mode))
+                       atomic_dec(&buffermem_pages);
+       }
+       clear_page_buffers(page);
+       page_cache_release(page);
+}
+
  /*
- * Rewrote the wait-routines to use the "new" wait-queue functionality,
- * and getting rid of the cli-sti pairs. The wait-queue routines still
- * need cli-sti, but now it's just a couple of 386 instructions or so.
- *
- * Note that the real wait_on_buffer() is an inline function that checks
- * if 'b_wait' is set before calling this, so that the queues aren't set
- * up unnecessarily.
+ * Block until a buffer comes unlocked.  This doesn't stop it
+ * from becoming locked again - you have to lock it yourself
+ * if you want to preserve its state.
   */
  void __wait_on_buffer(struct buffer_head * bh)
  {
@@ -163,203 +168,87 @@ void __wait_on_buffer(struct buffer_head * bh)
   */
  void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  {
+       if (!uptodate)
+               printk("%s: I/O error\n", __FUNCTION__);
         mark_buffer_uptodate(bh, uptodate);
         unlock_buffer(bh);
         put_bh(bh);
  }
  
  /*
- * The buffers have been marked clean and locked.  Just submit the dang
- * things.. 
+ * Write out all the dirty data associated with a block device
+ * via its mapping.  Does not take the superblock lock.
+ *
+ * If `wait' is true, wait on the writeout.
   */
-static void write_locked_buffers(struct buffer_head **array, unsigned int count)
+int sync_buffers(struct block_device *bdev, int wait)
  {
-       do {
-               struct buffer_head * bh = *array++;
-               bh->b_end_io = end_buffer_io_sync;
-               submit_bh(WRITE, bh);
-       } while (--count);
-}
+       int ret;
  
-/*
- * Write some buffers from the head of the dirty queue.
- *
- * This must be called with the LRU lock held, and will
- * return without it!
- */
-#define NRSYNC (32)
-static int write_some_buffers(struct block_device *bdev)
-{
-       struct buffer_head *next;
-       struct buffer_head *array[NRSYNC];
-       unsigned int count;
-       int nr;
-
-       next = lru_list[BUF_DIRTY];
-       nr = nr_buffers_type[BUF_DIRTY];
-       count = 0;
-       while (next && --nr >= 0) {
-               struct buffer_head * bh = next;
-               next = bh->b_next_free;
-
-               if (bdev && bh->b_bdev != bdev)
-                       continue;
-               if (test_and_set_bit(BH_Lock, &bh->b_state))
-                       continue;
-               if (atomic_set_buffer_clean(bh)) {
-                       __refile_buffer(bh);
-                       get_bh(bh);
-                       array[count++] = bh;
-                       if (count < NRSYNC)
-                               continue;
+       ret = filemap_fdatasync(bdev->bd_inode->i_mapping);
+       if (wait) {
+               int err;
  
-                       spin_unlock(&lru_list_lock);
-                       write_locked_buffers(array, count);
-                       return -EAGAIN;
-               }
-               unlock_buffer(bh);
-               __refile_buffer(bh);
+               err = filemap_fdatawait(bdev->bd_inode->i_mapping);
+               if (!ret)
+                       ret = err;
         }
-       spin_unlock(&lru_list_lock);
-
-       if (count)
-               write_locked_buffers(array, count);
-       return 0;
-}
-
-/*
- * Write out all buffers on the dirty list.
- */
-static void write_unlocked_buffers(struct block_device *bdev)
-{
-       do {
-               spin_lock(&lru_list_lock);
-       } while (write_some_buffers(bdev));
-       run_task_queue(&tq_disk);
+       return ret;
  }
  
  /*
- * Wait for a buffer on the proper list.
+ * Write out all the dirty data associated with a block device
+ * via its mapping.  Does not take the superblock lock.
   *
- * This must be called with the LRU lock held, and
- * will return with it released.
+ * Wait on the writeout.
   */
-static int wait_for_buffers(struct block_device *bdev, int index, int refile)
-{
-       struct buffer_head * next;
-       int nr;
-
-       next = lru_list[index];
-       nr = nr_buffers_type[index];
-       while (next && --nr >= 0) {
-               struct buffer_head *bh = next;
-               next = bh->b_next_free;
-
-               if (!buffer_locked(bh)) {
-                       if (refile)
-                               __refile_buffer(bh);
-                       continue;
-               }
-               if (bdev && bh->b_bdev !=  bdev)
-                       continue;
-
-               get_bh(bh);
-               spin_unlock(&lru_list_lock);
-               wait_on_buffer (bh);
-               put_bh(bh);
-               return -EAGAIN;
-       }
-       spin_unlock(&lru_list_lock);
-       return 0;
-}
-
-static inline void wait_for_some_buffers(struct block_device *bdev)
+int fsync_no_super(struct block_device *bdev)
  {
-       spin_lock(&lru_list_lock);
-       wait_for_buffers(bdev, BUF_LOCKED, 1);
-}
+       int ret = 0;
  
-static int wait_for_locked_buffers(struct block_device *bdev, int index, int refile)
-{
-       do {
-               spin_lock(&lru_list_lock);
-       } while (wait_for_buffers(bdev, index, refile));
-       return 0;
+       if (bdev)
+               ret = sync_buffers(bdev, 1);
+       return ret;
  }
  
-/* Call sync_buffers with wait!=0 to ensure that the call does not
- * return until all buffer writes have completed.  Sync() may return
- * before the writes have finished; fsync() may not.
- */
-
-/* Godamity-damn.  Some buffers (bitmaps for filesystems)
- * spontaneously dirty themselves without ever brelse being called.
- * We will ultimately want to put these in a separate list, but for
- * now we search all of the lists for dirty buffers.
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock.  Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.
   */
-int sync_buffers(struct block_device *bdev, int wait)
-{
-       int err = 0;
-
-       if (!bdev)
-               return 0;
-
-       /* One pass for no-wait, three for wait:
-        * 0) write out all dirty, unlocked buffers;
-        * 1) wait for all dirty locked buffers;
-        * 2) write out all dirty, unlocked buffers;
-        * 2) wait for completion by waiting for all buffers to unlock.
-        */
-       write_unlocked_buffers(bdev);
-       if (wait) {
-               err = wait_for_locked_buffers(bdev, BUF_DIRTY, 0);
-               write_unlocked_buffers(bdev);
-               err |= wait_for_locked_buffers(bdev, BUF_LOCKED, 1);
-       }
-       return err;
-}
-
-int sync_all_buffers(int wait)
-{
-       int err = 0;
-
-       /* One pass for no-wait, three for wait:
-        * 0) write out all dirty, unlocked buffers;
-        * 1) wait for all dirty locked buffers;
-        * 2) write out all dirty, unlocked buffers;
-        * 2) wait for completion by waiting for all buffers to unlock.
-        */
-       write_unlocked_buffers(NULL);
-       if (wait) {
-               err = wait_for_locked_buffers(NULL, BUF_DIRTY, 0);
-               write_unlocked_buffers(NULL);
-               err |= wait_for_locked_buffers(NULL, BUF_LOCKED, 1);
-       }
-       return err;
-}
-
  int fsync_super(struct super_block *sb)
  {
-       sync_buffers(sb->s_bdev, 0);
-
-       lock_kernel();
-       sync_inodes_sb(sb);
+       sync_inodes_sb(sb);     /* All the inodes */
         DQUOT_SYNC(sb);
         lock_super(sb);
         if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
                 sb->s_op->write_super(sb);
         unlock_super(sb);
-       unlock_kernel();
  
-       return sync_buffers(sb->s_bdev, 1);
+       return fsync_no_super(sb->s_bdev);
  }
  
-int fsync_no_super(struct block_device *bdev)
+/*
+ * Write out and wait upon all dirty data associated with this
+ * device.  Filesystem data as well as the underlying block device.
+ * Takes the superblock lock if a superblock exists for the device.
+ */
+int fsync_bdev(struct block_device *bdev)
  {
-       sync_buffers(bdev, 0);
-       return sync_buffers(bdev, 1);
+       struct super_block *sb = get_super(to_kdev_t(bdev->bd_dev));
+       if (sb) {
+               int res = fsync_super(sb);
+               drop_super(sb);
+               return res;
+       }
+       return fsync_no_super(bdev);
  }
  
+/*
+ * Write out and wait upon all dirty data associated with this
+ * kdev_t.   Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.
+ */
  int fsync_dev(kdev_t dev)
  {
         struct block_device *bdev = bdget(kdev_t_to_nr(dev));
@@ -371,33 +260,22 @@ int fsync_dev(kdev_t dev)
         return 0;
  }
  
-int fsync_bdev(struct block_device *bdev)
-{
-       struct super_block *sb = get_super(to_kdev_t(bdev->bd_dev));
-       if (sb) {
-               int res = fsync_super(sb);
-               drop_super(sb);
-               return res;
-       }
-       return fsync_no_super(bdev);
-}
-
+/*
+ * sync everything.
+ */
  asmlinkage long sys_sync(void)
  {
-       sync_all_buffers(0);
-
-       lock_kernel();
-       sync_inodes();
+       sync_inodes();  /* All mappings and inodes, including block devices */
         DQUOT_SYNC(NULL);
-       sync_supers();
-       unlock_kernel();
-
-       sync_all_buffers(1);
+       sync_supers();  /* Write the superblocks */
+       sync_inodes();  /* All the mappings and inodes, again. */
         return 0;
  }
  
  /*
- *     filp may be NULL if called via the msync of a vma.
+ * Generic function to fsync a file.
+ *
+ * filp may be NULL if called via the msync of a vma.
   */
   
  int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
@@ -406,7 +284,6 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
         struct super_block * sb;
         int ret;
  
-       lock_kernel();
         /* sync the inode to buffers */
         write_inode_now(inode, 0);
  
@@ -419,7 +296,6 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
  
         /* .. finally sync the buffers to disk */
         ret = sync_buffers(sb->s_bdev, 1);
-       unlock_kernel();
         return ret;
  }
  
@@ -496,151 +372,85 @@ out:
         return ret;
  }
  
-/* After several hours of tedious analysis, the following hash
- * function won.  Do not mess with it... -DaveM
+/*
+ * Various filesystems appear to want __get_hash_table to be non-blocking.
+ * But it's the page lock which protects the buffers.  To get around this,
+ * we get exclusion from try_to_free_buffers with the inode's
+ * i_bufferlist_lock.
+ *
+ * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
+ * may be quite high.  This code could TryLock the page, and if that
+ * succeeds, there is no need to take i_bufferlist_lock. (But if
+ * i_bufferlist_lock is contended then so is mapping->page_lock).
   */
-#define _hashfn(dev,block)     \
-       ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
-        (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
-         ((block) << (bh_hash_shift - 12))))
-#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
-
-static inline void __insert_into_hash_list(struct buffer_head *bh)
+struct buffer_head *
+__get_hash_table(struct block_device *bdev, sector_t block, int unused)
  {
-       struct buffer_head **head = &hash(to_kdev_t(bh->b_bdev->bd_dev), bh->b_blocknr);
-       struct buffer_head *next = *head;
-
-       *head = bh;
-       bh->b_pprev = head;
-       bh->b_next = next;
-       if (next != NULL)
-               next->b_pprev = &bh->b_next;
-}
-
-static __inline__ void __hash_unlink(struct buffer_head *bh)
-{
-       struct buffer_head **pprev = bh->b_pprev;
-       if (pprev) {
-               struct buffer_head *next = bh->b_next;
-               if (next)
-                       next->b_pprev = pprev;
-               *pprev = next;
-               bh->b_pprev = NULL;
-       }
-}
-
-static void __insert_into_lru_list(struct buffer_head * bh, int blist)
-{
-       struct buffer_head **bhp = &lru_list[blist];
+       struct inode * const inode = bdev->bd_inode;
+       struct buffer_head *ret = NULL;
+       unsigned long index;
+       struct buffer_head *bh;
+       struct buffer_head *head;
+       struct page *page;
  
-       if (bh->b_prev_free || bh->b_next_free) BUG();
+       index = block >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+       page = find_get_page(inode->i_mapping, index);
+       if (!page)
+               goto out;
  
-       if(!*bhp) {
-               *bhp = bh;
-               bh->b_prev_free = bh;
-       }
-       bh->b_next_free = *bhp;
-       bh->b_prev_free = (*bhp)->b_prev_free;
-       (*bhp)->b_prev_free->b_next_free = bh;
-       (*bhp)->b_prev_free = bh;
-       nr_buffers_type[blist]++;
-       size_buffers_type[blist] += bh->b_size;
-}
-
-static void __remove_from_lru_list(struct buffer_head * bh)
-{
-       struct buffer_head *next = bh->b_next_free;
-       if (next) {
-               struct buffer_head *prev = bh->b_prev_free;
-               int blist = bh->b_list;
-
-               prev->b_next_free = next;
-               next->b_prev_free = prev;
-               if (lru_list[blist] == bh) {
-                       if (next == bh)
-                               next = NULL;
-                       lru_list[blist] = next;
+       spin_lock(&inode->i_bufferlist_lock);
+       if (!page_has_buffers(page))
+               goto out_unlock;
+       head = page_buffers(page);
+       bh = head;
+       do {
+               if (bh->b_blocknr == block) {
+                       ret = bh;
+                       get_bh(bh);
+                       goto out_unlock;
                 }
-               bh->b_next_free = NULL;
-               bh->b_prev_free = NULL;
-               nr_buffers_type[blist]--;
-               size_buffers_type[blist] -= bh->b_size;
-       }
-}
-
-/* must be called with both the hash_table_lock and the lru_list_lock
-   held */
-static void __remove_from_queues(struct buffer_head *bh)
-{
-       __hash_unlink(bh);
-       __remove_from_lru_list(bh);
-}
-
-static void remove_from_queues(struct buffer_head *bh)
-{
-       spin_lock(&lru_list_lock);
-       write_lock(&hash_table_lock);
-       __remove_from_queues(bh);
-       write_unlock(&hash_table_lock); 
-       spin_unlock(&lru_list_lock);
-}
-
-struct buffer_head * __get_hash_table(struct block_device *bdev, sector_t block, int size)
-{
-       struct buffer_head *bh, **p = &hash(to_kdev_t(bdev->bd_dev), block);
-
-       read_lock(&hash_table_lock);
-
-       for (;;) {
-               bh = *p;
-               if (!bh)
-                       break;
-               p = &bh->b_next;
-               if (bh->b_blocknr != block)
-                       continue;
-               if (bh->b_size != size)
-                       continue;
-               if (bh->b_bdev != bdev)
-                       continue;
-               get_bh(bh);
-               break;
-       }
-
-       read_unlock(&hash_table_lock);
-       return bh;
+               bh = bh->b_this_page;
+       } while (bh != head);
+       buffer_error();
+out_unlock:
+       spin_unlock(&inode->i_bufferlist_lock);
+       page_cache_release(page);
+out:
+       return ret;
  }
  
-void buffer_insert_list(struct buffer_head *bh, struct list_head *list)
+void buffer_insert_list(spinlock_t *lock,
+               struct buffer_head *bh, struct list_head *list)
  {
-       spin_lock(&lru_list_lock);
+       if (lock == NULL)
+               lock = &global_bufferlist_lock;
+       spin_lock(lock);
         if (bh->b_inode)
                 list_del(&bh->b_inode_buffers);
         bh->b_inode = 1;
         list_add(&bh->b_inode_buffers, list);
-       spin_unlock(&lru_list_lock);
+       spin_unlock(lock);
  }
  
-/* The caller must have the lru_list lock before calling the 
-   remove_inode_queue functions.  */
-static void __remove_inode_queue(struct buffer_head *bh)
-{
-       bh->b_inode = 0;
-       list_del(&bh->b_inode_buffers);
-}
-
-static inline void remove_inode_queue(struct buffer_head *bh)
+/*
+ * i_bufferlist_lock must be held
+ */
+static inline void __remove_inode_queue(struct buffer_head *bh)
  {
-       if (bh->b_inode)
-               __remove_inode_queue(bh);
+       if (bh->b_inode) {
+               list_del(&bh->b_inode_buffers);
+               bh->b_inode = 0;
+       }
  }
  
  int inode_has_buffers(struct inode *inode)
  {
         int ret;
         
-       spin_lock(&lru_list_lock);
-       ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
-       spin_unlock(&lru_list_lock);
+       spin_lock(&inode->i_bufferlist_lock);
+       ret = !list_empty(&inode->i_dirty_buffers) ||
+                       !list_empty(&inode->i_dirty_data_buffers);
+       spin_unlock(&inode->i_bufferlist_lock);
         
         return ret;
  }
@@ -679,58 +489,11 @@ int inode_has_buffers(struct inode *inode)
     pass does the actual I/O. */
  void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
  {
-       int i, nlist, slept;
-       struct buffer_head * bh, * bh_next;
-
- retry:
-       slept = 0;
-       spin_lock(&lru_list_lock);
-       for(nlist = 0; nlist < NR_LIST; nlist++) {
-               bh = lru_list[nlist];
-               if (!bh)
-                       continue;
-               for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
-                       bh_next = bh->b_next_free;
-
-                       /* Another device? */
-                       if (bh->b_bdev != bdev)
-                               continue;
-                       /* Not hashed? */
-                       if (!bh->b_pprev)
-                               continue;
-                       if (buffer_locked(bh)) {
-                               get_bh(bh);
-                               spin_unlock(&lru_list_lock);
-                               wait_on_buffer(bh);
-                               slept = 1;
-                               spin_lock(&lru_list_lock);
-                               put_bh(bh);
-                       }
-
-                       write_lock(&hash_table_lock);
-                       /* All buffers in the lru lists are mapped */
-                       if (!buffer_mapped(bh))
-                               BUG();
-                       if (buffer_dirty(bh))
-                               printk("invalidate: dirty buffer\n");
-                       if (!atomic_read(&bh->b_count)) {
-                               if (destroy_dirty_buffers || !buffer_dirty(bh)) {
-                                       remove_inode_queue(bh);
-                               }
-                       } else
-                               printk("invalidate: busy buffer\n");
-
-                       write_unlock(&hash_table_lock);
-                       if (slept)
-                               goto out;
-               }
-       }
-out:
-       spin_unlock(&lru_list_lock);
-       if (slept)
-               goto retry;
-
-       /* Get rid of the page cache */
+       /*
+        * FIXME: what about destroy_dirty_buffers?
+        * We really want to use invalidate_inode_pages2() for
+        * that, but not until that's cleaned up.
+        */
         invalidate_inode_pages(bdev->bd_inode);
  }
  
@@ -743,11 +506,16 @@ void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
         }
  }
  
+/*
+ * FIXME: What is this function actually trying to do?  Why "zones[0]"?
+ * Is it still correct/needed if/when blockdev mappings use GFP_HIGHUSER?
+ */
  static void free_more_memory(void)
  {
-       zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0];
-       
-       balance_dirty();
+       zone_t *zone;
+
+       zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0];
+
         wakeup_bdflush();
         try_to_free_pages(zone, GFP_NOFS, 0);
         run_task_queue(&tq_disk);
@@ -755,25 +523,19 @@ static void free_more_memory(void)
         yield();
  }
  
-void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
-{
-       bh->b_list = BUF_CLEAN;
-       bh->b_end_io = handler;
-       bh->b_private = private;
-}
-
-static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
+static void end_buffer_io_async(struct buffer_head *bh, int uptodate)
  {
         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
         unsigned long flags;
         struct buffer_head *tmp;
         struct page *page;
+       int page_uptodate = 1;
  
-       mark_buffer_uptodate(bh, uptodate);
+       if (!uptodate)
+               printk("%s: I/O error\n", __FUNCTION__);
  
-       /* This is a temporary buffer used for page I/O. */
+       mark_buffer_uptodate(bh, uptodate);
         page = bh->b_page;
-
         if (!uptodate)
                 SetPageError(page);
  
@@ -781,39 +543,31 @@ static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
          * Be _very_ careful from here on. Bad things can happen if
          * two buffer heads end IO at almost the same time and both
          * decide that the page is now completely done.
-        *
-        * Async buffer_heads are here only as labels for IO, and get
-        * thrown away once the IO for this page is complete.  IO is
-        * deemed complete once all buffers have been visited
-        * (b_count==0) and are now unlocked. We must make sure that
-        * only the _last_ buffer that decrements its count is the one
-        * that unlock the page..
          */
         spin_lock_irqsave(&page_uptodate_lock, flags);
         mark_buffer_async(bh, 0);
         unlock_buffer(bh);
-       tmp = bh->b_this_page;
-       while (tmp != bh) {
-               if (buffer_locked(tmp)) {
-                       if (buffer_async(tmp))
+       tmp = bh;
+       do {
+               if (!buffer_uptodate(tmp))
+                       page_uptodate = 0;
+               if (buffer_async(tmp)) {
+                       if (buffer_locked(tmp))
                                 goto still_busy;
-               } else if (!buffer_uptodate(tmp))
-                       SetPageError(page);
+                       if (!buffer_mapped(bh))
+                               BUG();
+               }
                 tmp = tmp->b_this_page;
-       }
-
-       /* OK, the async IO on this page is complete. */
+       } while (tmp != bh);
         spin_unlock_irqrestore(&page_uptodate_lock, flags);
  
         /*
-        * if none of the buffers had errors then we can set the
-        * page uptodate:
+        * If none of the buffers had errors and they are all
+        * uptodate then we can set the page uptodate.
          */
-       if (!PageError(page))
+       if (page_uptodate && !PageError(page))
                 SetPageUptodate(page);
-
         UnlockPage(page);
-
         return;
  
  still_busy:
@@ -821,12 +575,66 @@ still_busy:
         return;
  }
  
+/*
+ * If a page's buffers are under async writeout (end_buffer_io_async
+ * completion) then there is a possibility that another thread of
+ * control could lock one of the buffers after it has completed
+ * but while some of the other buffers have not completed.  This
+ * locked buffer would confuse end_buffer_io_async() into not unlocking
+ * the page.  So the absence of BH_Async tells end_buffer_io_async()
+ * that this buffer is not under async I/O.
+ *
+ * The page comes unlocked when it has no locked buffer_async buffers
+ * left.
+ *
+ * The page lock prevents anyone starting new async I/O against any of
+ * the buffers.
+ */
  inline void set_buffer_async_io(struct buffer_head *bh)
  {
         bh->b_end_io = end_buffer_io_async;
         mark_buffer_async(bh, 1);
  }
  
+/*
+ * osync is designed to support O_SYNC io.  It waits synchronously for
+ * all already-submitted IO to complete, but does not queue any new
+ * writes to the disk.
+ *
+ * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
+ * you dirty the buffers, and then use osync_inode_buffers to wait for
+ * completion.  Any other dirty buffers which are not yet queued for
+ * write will not be flushed to disk by the osync.
+ */
+static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
+{
+       struct buffer_head *bh;
+       struct list_head *p;
+       int err = 0;
+
+       if (lock == NULL)
+               lock = &global_bufferlist_lock;
+
+       spin_lock(lock);
+repeat:
+       for (p = list->prev; 
+            bh = BH_ENTRY(p), p != list;
+            p = bh->b_inode_buffers.prev) {
+               if (buffer_locked(bh)) {
+                       get_bh(bh);
+                       spin_unlock(lock);
+                       wait_on_buffer(bh);
+                       if (!buffer_uptodate(bh))
+                               err = -EIO;
+                       brelse(bh);
+                       spin_lock(lock);
+                       goto repeat;
+               }
+       }
+       spin_unlock(lock);
+       return err;
+}
+
  /*
   * Synchronise all the inode's dirty buffers to the disk.
   *
@@ -846,17 +654,18 @@ inline void set_buffer_async_io(struct buffer_head *bh)
   * the osync code to catch these locked, dirty buffers without requeuing
   * any newly dirty buffers for write.
   */
-
-int fsync_buffers_list(struct list_head *list)
+int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
  {
         struct buffer_head *bh;
         struct list_head tmp;
         int err = 0, err2;
+
+       if (lock == NULL)
+               lock = &global_bufferlist_lock;
         
         INIT_LIST_HEAD(&tmp);
-       
-       spin_lock(&lru_list_lock);
  
+       spin_lock(lock);
         while (!list_empty(list)) {
                 bh = BH_ENTRY(list->next);
                 list_del(&bh->b_inode_buffers);
@@ -867,29 +676,28 @@ int fsync_buffers_list(struct list_head *list)
                         list_add(&bh->b_inode_buffers, &tmp);
                         if (buffer_dirty(bh)) {
                                 get_bh(bh);
-                               spin_unlock(&lru_list_lock);
+                               spin_unlock(lock);
                                 ll_rw_block(WRITE, 1, &bh);
                                 brelse(bh);
-                               spin_lock(&lru_list_lock);
+                               spin_lock(lock);
                         }
                 }
         }
  
         while (!list_empty(&tmp)) {
                 bh = BH_ENTRY(tmp.prev);
-               remove_inode_queue(bh);
+               __remove_inode_queue(bh);
                 get_bh(bh);
-               spin_unlock(&lru_list_lock);
+               spin_unlock(lock);
                 wait_on_buffer(bh);
                 if (!buffer_uptodate(bh))
                         err = -EIO;
                 brelse(bh);
-               spin_lock(&lru_list_lock);
+               spin_lock(lock);
         }
         
-       spin_unlock(&lru_list_lock);
-       err2 = osync_buffers_list(list);
-
+       spin_unlock(lock);
+       err2 = osync_buffers_list(lock, list);
         if (err)
                 return err;
         else
@@ -897,198 +705,335 @@ int fsync_buffers_list(struct list_head *list)
  }
  
  /*
- * osync is designed to support O_SYNC io.  It waits synchronously for
- * all already-submitted IO to complete, but does not queue any new
- * writes to the disk.
- *
- * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
- * you dirty the buffers, and then use osync_inode_buffers to wait for
- * completion.  Any other dirty buffers which are not yet queued for
- * write will not be flushed to disk by the osync.
+ * Invalidate any and all dirty buffers on a given inode.  We are
+ * probably unmounting the fs, but that doesn't mean we have already
+ * done a sync().  Just drop the buffers from the inode list.
   */
+void invalidate_inode_buffers(struct inode *inode)
+{
+       struct list_head * entry;
+       
+       spin_lock(&inode->i_bufferlist_lock);
+       while ((entry = inode->i_dirty_buffers.next) !=
+                               &inode->i_dirty_buffers)
+               __remove_inode_queue(BH_ENTRY(entry));
+       while ((entry = inode->i_dirty_data_buffers.next) !=
+                               &inode->i_dirty_data_buffers)
+               __remove_inode_queue(BH_ENTRY(entry));
+       spin_unlock(&inode->i_bufferlist_lock);
+}
  
-int osync_buffers_list(struct list_head *list)
+static void __put_unused_buffer_head(struct buffer_head * bh)
  {
-       struct buffer_head *bh;
-       struct list_head *p;
-       int err = 0;
+       if (bh->b_inode)
+               BUG();
+       if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
+               kmem_cache_free(bh_cachep, bh);
+       } else {
+               bh->b_bdev = NULL;
+               bh->b_blocknr = -1;
+               bh->b_this_page = NULL;
  
-       spin_lock(&lru_list_lock);
-       
- repeat:
-       
-       for (p = list->prev; 
-            bh = BH_ENTRY(p), p != list;
-            p = bh->b_inode_buffers.prev) {
-               if (buffer_locked(bh)) {
-                       get_bh(bh);
-                       spin_unlock(&lru_list_lock);
-                       wait_on_buffer(bh);
-                       if (!buffer_uptodate(bh))
-                               err = -EIO;
-                       brelse(bh);
-                       spin_lock(&lru_list_lock);
-                       goto repeat;
+               nr_unused_buffer_heads++;
+               bh->b_private = unused_list;
+               unused_list = bh;
+               if (waitqueue_active(&buffer_wait))
+                       wake_up(&buffer_wait);
+       }
+}
+
+void put_unused_buffer_head(struct buffer_head *bh)
+{
+       spin_lock(&unused_list_lock);
+       __put_unused_buffer_head(bh);
+       spin_unlock(&unused_list_lock);
+}
+EXPORT_SYMBOL(put_unused_buffer_head);
+
+/*
+ * Create the appropriate buffers when given a page for data area and
+ * the size of each buffer.. Use the bh->b_this_page linked list to
+ * follow the buffers created.  Return NULL if unable to create more
+ * buffers.
+ * The async flag is used to differentiate async IO (paging, swapping)
+ * from ordinary buffer allocations, and only async requests are allowed
+ * to sleep waiting for buffer heads. 
+ */
+static struct buffer_head *
+create_buffers(struct page * page, unsigned long size, int async)
+{
+       struct buffer_head *bh, *head;
+       long offset;
+
+try_again:
+       head = NULL;
+       offset = PAGE_SIZE;
+       while ((offset -= size) >= 0) {
+               bh = get_unused_buffer_head(async);
+               if (!bh)
+                       goto no_grow;
+
+               bh->b_bdev = NULL;
+               bh->b_this_page = head;
+               head = bh;
+
+               bh->b_state = 0;
+               atomic_set(&bh->b_count, 0);
+               bh->b_size = size;
+
+               /* Link the buffer to its page */
+               set_bh_page(bh, page, offset);
+
+               bh->b_end_io = NULL;
+       }
+       return head;
+/*
+ * In case anything failed, we just free everything we got.
+ */
+no_grow:
+       if (head) {
+               spin_lock(&unused_list_lock);
+               do {
+                       bh = head;
+                       head = head->b_this_page;
+                       __put_unused_buffer_head(bh);
+               } while (head);
+               spin_unlock(&unused_list_lock);
+       }
+
+       /*
+        * Return failure for non-async IO requests.  Async IO requests
+        * are not allowed to fail, so we have to wait until buffer heads
+        * become available.  But we don't want tasks sleeping with 
+        * partially complete buffers, so all were released above.
+        */
+       if (!async)
+               return NULL;
+
+       /* We're _really_ low on memory. Now we just
+        * wait for old buffer heads to become free due to
+        * finishing IO.  Since this is an async request and
+        * the reserve list is empty, we're sure there are 
+        * async buffer heads in use.
+        */
+       run_task_queue(&tq_disk);
+
+       free_more_memory();
+       goto try_again;
+}
+
+static inline void
+link_dev_buffers(struct page *page, struct buffer_head *head)
+{
+       struct buffer_head *bh, *tail;
+
+       bh = head;
+       do {
+               tail = bh;
+               bh = bh->b_this_page;
+       } while (bh);
+       tail->b_this_page = head;
+       __set_page_buffers(page, head);
+}
+
+/*
+ * Initialise the state of a blockdev page's buffers.
+ */ 
+static /*inline*/ void
+init_page_buffers(struct page *page, struct block_device *bdev,
+                       int block, int size)
+{
+       struct buffer_head *head = page_buffers(page);
+       struct buffer_head *bh = head;
+       unsigned int b_state;
+
+       b_state = 1 << BH_Mapped;
+       if (Page_Uptodate(page))
+               b_state |= 1 << BH_Uptodate;
+
+       do {
+               if (!(bh->b_state & (1 << BH_Mapped))) {
+                       init_buffer(bh, NULL, NULL);
+                       bh->b_bdev = bdev;
+                       bh->b_blocknr = block;
+                       bh->b_state = b_state;
                 }
+               block++;
+               bh = bh->b_this_page;
+       } while (bh != head);
+}
+
+/*
+ * Create the page-cache page that contains the requested block.
+ *
+ * This is used purely for blockdev mappings.
+ */
+static /*inline*/ struct page *
+grow_dev_page(struct block_device *bdev, unsigned long block,
+                       unsigned long index, int size)
+{
+       struct inode *inode = bdev->bd_inode;
+       struct page *page;
+       struct buffer_head *bh;
+
+       page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+       if (!page)
+               return NULL;
+
+       if (!PageLocked(page))
+               BUG();
+
+       if (page_has_buffers(page)) {
+               bh = page_buffers(page);
+               if (bh->b_size == size)
+                       return page;
+               if (!try_to_free_buffers(page))
+                       goto failed;
         }
  
-       spin_unlock(&lru_list_lock);
-       return err;
+       /*
+        * Allocate some buffers for this page
+        */
+       bh = create_buffers(page, size, 0);
+       if (!bh)
+               goto failed;
+
+       /*
+        * Link the page to the buffers and initialise them.  Take the
+        * lock to be atomic wrt __get_hash_table(), which does not
+        * run under the page lock.
+        */
+       spin_lock(&inode->i_bufferlist_lock);
+       link_dev_buffers(page, bh);
+       init_page_buffers(page, bdev, block, size);
+       spin_unlock(&inode->i_bufferlist_lock);
+       return page;
+
+failed:
+       buffer_error();
+       UnlockPage(page);
+       page_cache_release(page);
+       return NULL;
  }
  
-/*
- * Invalidate any and all dirty buffers on a given inode.  We are
- * probably unmounting the fs, but that doesn't mean we have already
- * done a sync().  Just drop the buffers from the inode list.
- */
-void invalidate_inode_buffers(struct inode *inode)
-{
-       struct list_head * entry;
-       
-       spin_lock(&lru_list_lock);
-       while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
-               remove_inode_queue(BH_ENTRY(entry));
-       while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
-               remove_inode_queue(BH_ENTRY(entry));
-       spin_unlock(&lru_list_lock);
+/*
+ * Create buffers for the specified block device block's page.  If
+ * that page was dirty, the buffers are set dirty also.
+ *
+ * Except that's a bug.  Attaching dirty buffers to a dirty
+ * blockdev's page can result in filesystem corruption, because
+ * some of those buffers may be aliases of filesystem data.
+ * grow_dev_page() will go BUG() if this happens.
+ */
+static inline int
+grow_buffers(struct block_device *bdev, unsigned long block, int size)
+{
+       struct page *page;
+       unsigned long index;
+       int sizebits;
+
+       /* Size must be multiple of hard sectorsize */
+       if (size & (bdev_hardsect_size(bdev)-1))
+               BUG();
+       if (size < 512 || size > PAGE_SIZE)
+               BUG();
+
+       sizebits = -1;
+       do {
+               sizebits++;
+       } while ((size << sizebits) < PAGE_SIZE);
+
+       index = block >> sizebits;
+       block = index << sizebits;
+
+       /* Create a page with the proper size buffers.. */
+       page = grow_dev_page(bdev, block, index, size);
+       if (!page)
+               return 0;
+       UnlockPage(page);
+       page_cache_release(page);
+       return 1;
  }
  
-
  /*
- * Ok, this is getblk, and it isn't very clear, again to hinder
- * race-conditions. Most of the code is seldom used, (ie repeating),
- * so it should be much more efficient than it looks.
+ * __getblk will locate (and, if necessary, create) the buffer_head
+ * which corresponds to the passed block_device, block and size. The
+ * returned buffer has its reference count incremented.
   *
- * The algorithm is changed: hopefully better, and an elusive bug removed.
+ * __getblk() cannot fail - it just keeps trying.  If you pass it an
+ * illegal block number, __getblk() will happily return a buffer_head
+ * which represents the non-existent block.  Very weird.
   *
- * 14.02.92: changed it to sync dirty buffers a bit: better performance
- * when the filesystem starts to get full of dirty blocks (I hope).
+ * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
+ * attempt is failing.  FIXME, perhaps?
   */
-struct buffer_head * __getblk(struct block_device *bdev, sector_t block, int size)
+struct buffer_head *
+__getblk(struct block_device *bdev, sector_t block, int size)
  {
         for (;;) {
                 struct buffer_head * bh;
  
                 bh = __get_hash_table(bdev, block, size);
-               if (bh)
+               if (bh) {
+                       touch_buffer(bh);
                         return bh;
+               }
  
                 if (!grow_buffers(bdev, block, size))
                         free_more_memory();
         }
  }
  
-/* -1 -> no need to flush
-    0 -> async flush
-    1 -> sync flush (wait for I/O completion) */
-static int balance_dirty_state(void)
-{
-       unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
-
-       dirty = (size_buffers_type[BUF_DIRTY] + size_buffers_type[BUF_LOCKED]) >> PAGE_SHIFT;
-       tot = nr_free_buffer_pages();
-
-       dirty *= 100;
-       soft_dirty_limit = tot * bdf_prm.b_un.nfract;
-       hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
-
-       /* First, check for the "real" dirty limit. */
-       if (dirty > soft_dirty_limit) {
-               if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
-                       return 1;
-               return 0;
-       }
-
-       return -1;
-}
-
  /*
- * if a new dirty buffer is created we need to balance bdflush.
+ * The relationship between dirty buffers and dirty pages:
+ *
+ * Whenever a page has any dirty buffers, the page's dirty bit is set, and
+ * the page appears on its address_space.dirty_pages list.
+ *
+ * At all times, the dirtiness of the buffers represents the dirtiness of
+ * subsections of the page.  If the page has buffers, the page dirty bit is
+ * merely a hint about the true dirty state.
   *
- * in the future we might want to make bdflush aware of different
- * pressures on different devices - thus the (currently unused)
- * 'dev' parameter.
+ * When a page is set dirty in its entirety, all its buffers are marked dirty
+ * (if the page has buffers).
+ *
+ * When a buffer is marked dirty, its page is dirtied, but the page's other
+ * buffers are not.
+ *
+ * Also.  When blockdev buffers are explicitly read with bread(), they
+ * individually become uptodate.  But their backing page remains not
+ * uptodate - even if all of its buffers are uptodate.  A subsequent
+ * block_read_full_page() against that page will discover all the uptodate
+ * buffers, will set the page uptodate and will perform no I/O.
   */
-void balance_dirty(void)
-{
-       int state = balance_dirty_state();
-
-       if (state < 0)
-               return;
-
-       /* If we're getting into imbalance, start write-out */
-       spin_lock(&lru_list_lock);
-       write_some_buffers(NULL);
-
-       /*
-        * And if we're _really_ out of balance, wait for
-        * some of the dirty/locked buffers ourselves and
-        * start bdflush.
-        * This will throttle heavy writers.
-        */
-       if (state > 0) {
-               wait_for_some_buffers(NULL);
-               wakeup_bdflush();
-       }
-}
-
-inline void __mark_dirty(struct buffer_head *bh)
+static inline void __mark_dirty(struct buffer_head *bh)
  {
-       bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
-       refile_buffer(bh);
-}
-
-/* atomic version, the user must call balance_dirty() by hand
-   as soon as it become possible to block */
-void __mark_buffer_dirty(struct buffer_head *bh)
-{
-       if (!atomic_set_buffer_dirty(bh))
-               __mark_dirty(bh);
+       __set_page_dirty_nobuffers(bh->b_page);
  }
  
+/**
+ * mark_buffer_dirty - mark a buffer_head as needing writeout
+ *
+ * mark_buffer_dirty() will set the dirty bit against the buffer,
+ * then set its backing page dirty, then attach the page to its
+ * address_space's dirty_pages list and then attach the address_space's
+ * inode to its superblock's dirty inode list.
+ *
+ * mark_buffer_dirty() is atomic.  It takes inode->i_bufferlist_lock,
+ * mapping->page_lock and the global inode_lock.
+ */
  void mark_buffer_dirty(struct buffer_head *bh)
  {
-       if (!atomic_set_buffer_dirty(bh)) {
+       if (!atomic_set_buffer_dirty(bh))
                 __mark_dirty(bh);
-               balance_dirty();
-       }
-}
-
-void set_buffer_flushtime(struct buffer_head *bh)
-{
-       bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
-}
-EXPORT_SYMBOL(set_buffer_flushtime);
-
-/*
- * A buffer may need to be moved from one buffer list to another
- * (e.g. in case it is not shared any more). Handle this.
- */
-static void __refile_buffer(struct buffer_head *bh)
-{
-       int dispose = BUF_CLEAN;
-       if (buffer_locked(bh))
-               dispose = BUF_LOCKED;
-       if (buffer_dirty(bh))
-               dispose = BUF_DIRTY;
-       if (dispose != bh->b_list) {
-               __remove_from_lru_list(bh);
-               bh->b_list = dispose;
-               if (dispose == BUF_CLEAN)
-                       remove_inode_queue(bh);
-               __insert_into_lru_list(bh, dispose);
-       }
-}
-
-void refile_buffer(struct buffer_head *bh)
-{
-       spin_lock(&lru_list_lock);
-       __refile_buffer(bh);
-       spin_unlock(&lru_list_lock);
  }
  
  /*
- * Release a buffer head
+ * Decrement a buffer_head's reference count.  If all buffers against a page
+ * have zero reference count, are clean and unlocked, and if the page is clean
+ * and unlocked then try_to_free_buffers() may strip the buffers from the page
+ * in preparation for freeing it (sometimes, rarely, buffers are removed from
+ * a page but it ends up not being freed, and buffers may later be reattached).
   */
  void __brelse(struct buffer_head * buf)
  {
@@ -1097,6 +1042,7 @@ void __brelse(struct buffer_head * buf)
                 return;
         }
         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
+       buffer_error();         /* For the stack backtrace */
  }
  
  /*
@@ -1110,56 +1056,37 @@ void __bforget(struct buffer_head * buf)
  }
  
  /**
- *     bread() - reads a specified block and returns the bh
- *     @block: number of block
- *     @size: size (in bytes) to read
+ *  __bread() - reads a specified block and returns the bh
+ *  @block: number of block
+ *  @size: size (in bytes) to read
   * 
- *     Reads a specified block, and returns buffer head that
- *     contains it. It returns NULL if the block was unreadable.
+ *  Reads a specified block, and returns buffer head that contains it.
+ *  It returns NULL if the block was unreadable.
   */
  struct buffer_head * __bread(struct block_device *bdev, int block, int size)
  {
-       struct buffer_head * bh = __getblk(bdev, block, size);
+       struct buffer_head *bh = __getblk(bdev, block, size);
  
-       touch_buffer(bh);
         if (buffer_uptodate(bh))
                 return bh;
-       ll_rw_block(READ, 1, &bh);
-       wait_on_buffer(bh);
-       if (buffer_uptodate(bh))
+       lock_buffer(bh);
+       if (buffer_uptodate(bh)) {
+               unlock_buffer(bh);
                 return bh;
-       brelse(bh);
-       return NULL;
-}
-
-/*
- * Note: the caller should wake up the buffer_wait list if needed.
- */
-static void __put_unused_buffer_head(struct buffer_head * bh)
-{
-       if (bh->b_inode)
-               BUG();
-       if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
-               kmem_cache_free(bh_cachep, bh);
         } else {
-               bh->b_bdev = NULL;
-               bh->b_blocknr = -1;
-               bh->b_this_page = NULL;
-
-               nr_unused_buffer_heads++;
-               bh->b_next_free = unused_list;
-               unused_list = bh;
+               if (buffer_dirty(bh))
+                       buffer_error();
+               get_bh(bh);
+               bh->b_end_io = end_buffer_io_sync;
+               submit_bh(READ, bh);
+               wait_on_buffer(bh);
+               if (buffer_uptodate(bh))
+                       return bh;
         }
+       brelse(bh);
+       return NULL;
  }
  
-void put_unused_buffer_head(struct buffer_head *bh)
-{
-       spin_lock(&unused_list_lock);
-       __put_unused_buffer_head(bh);
-       spin_unlock(&unused_list_lock);
-}
-EXPORT_SYMBOL(put_unused_buffer_head);
-
  /*
   * Reserve NR_RESERVED buffer heads for async IO requests to avoid
   * no-buffer-head deadlock.  Return NULL on failure; waiting for
@@ -1172,7 +1099,7 @@ struct buffer_head * get_unused_buffer_head(int async)
         spin_lock(&unused_list_lock);
         if (nr_unused_buffer_heads > NR_RESERVED) {
                 bh = unused_list;
-               unused_list = bh->b_next_free;
+               unused_list = bh->b_private;
                 nr_unused_buffer_heads--;
                 spin_unlock(&unused_list_lock);
                 return bh;
@@ -1196,7 +1123,7 @@ struct buffer_head * get_unused_buffer_head(int async)
                 spin_lock(&unused_list_lock);
                 if (unused_list) {
                         bh = unused_list;
-                       unused_list = bh->b_next_free;
+                       unused_list = bh->b_private;
                         nr_unused_buffer_heads--;
                         spin_unlock(&unused_list_lock);
                         return bh;
@@ -1208,7 +1135,8 @@ struct buffer_head * get_unused_buffer_head(int async)
  }
  EXPORT_SYMBOL(get_unused_buffer_head);
  
-void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
+void set_bh_page(struct buffer_head *bh,
+               struct page *page, unsigned long offset)
  {
         bh->b_page = page;
         if (offset >= PAGE_SIZE)
@@ -1223,82 +1151,6 @@ void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offse
  }
  EXPORT_SYMBOL(set_bh_page);
  
-/*
- * Create the appropriate buffers when given a page for data area and
- * the size of each buffer.. Use the bh->b_this_page linked list to
- * follow the buffers created.  Return NULL if unable to create more
- * buffers.
- * The async flag is used to differentiate async IO (paging, swapping)
- * from ordinary buffer allocations, and only async requests are allowed
- * to sleep waiting for buffer heads. 
- */
-static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
-{
-       struct buffer_head *bh, *head;
-       long offset;
-
-try_again:
-       head = NULL;
-       offset = PAGE_SIZE;
-       while ((offset -= size) >= 0) {
-               bh = get_unused_buffer_head(async);
-               if (!bh)
-                       goto no_grow;
-
-               bh->b_bdev = NULL;
-               bh->b_this_page = head;
-               head = bh;
-
-               bh->b_state = 0;
-               bh->b_next_free = NULL;
-               bh->b_pprev = NULL;
-               atomic_set(&bh->b_count, 0);
-               bh->b_size = size;
-
-               set_bh_page(bh, page, offset);
-
-               bh->b_list = BUF_CLEAN;
-               bh->b_end_io = NULL;
-       }
-       return head;
-/*
- * In case anything failed, we just free everything we got.
- */
-no_grow:
-       if (head) {
-               spin_lock(&unused_list_lock);
-               do {
-                       bh = head;
-                       head = head->b_this_page;
-                       __put_unused_buffer_head(bh);
-               } while (head);
-               spin_unlock(&unused_list_lock);
-
-               /* Wake up any waiters ... */
-               wake_up(&buffer_wait);
-       }
-
-       /*
-        * Return failure for non-async IO requests.  Async IO requests
-        * are not allowed to fail, so we have to wait until buffer heads
-        * become available.  But we don't want tasks sleeping with 
-        * partially complete buffers, so all were released above.
-        */
-       if (!async)
-               return NULL;
-
-       /* We're _really_ low on memory. Now we just
-        * wait for old buffer heads to become free due to
-        * finishing IO.  Since this is an async request and
-        * the reserve list is empty, we're sure there are 
-        * async buffer heads in use.
-        */
-       run_task_queue(&tq_disk);
-
-       free_more_memory();
-       goto try_again;
-}
-
  /*
   * Called when truncating a buffer on a page completely.
   */
@@ -1308,21 +1160,29 @@ static void discard_buffer(struct buffer_head * bh)
                 mark_buffer_clean(bh);
                 lock_buffer(bh);
                 bh->b_bdev = NULL;
-               clear_bit(BH_Uptodate, &bh->b_state);
                 clear_bit(BH_Mapped, &bh->b_state);
                 clear_bit(BH_Req, &bh->b_state);
                 clear_bit(BH_New, &bh->b_state);
-               remove_from_queues(bh);
                 unlock_buffer(bh);
         }
  }
  
  /**
- * try_to_release_page - release old fs-specific metadata on a page
+ * try_to_release_page() - release old fs-specific metadata on a page
+ *
+ * @page: the page which the kernel is trying to free
+ * @gfp_mask: memory allocation flags (and I/O mode)
   *
+ * The address_space is to try to release any data against the page
+ * (presumably at page->private).  If the release was successful, return `1'.
+ * Otherwise return zero.
+ *
+ * The @gfp_mask argument specifies whether I/O may be performed to release
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ *
+ * NOTE: @gfp_mask may go away, and this function may become non-blocking.
   */
-
-int try_to_release_page(struct page * page, int gfp_mask)
+int try_to_release_page(struct page *page, int gfp_mask)
  {
         struct address_space * const mapping = page->mapping;
  
@@ -1331,17 +1191,26 @@ int try_to_release_page(struct page * page, int gfp_mask)
         
         if (mapping && mapping->a_ops->releasepage)
                 return mapping->a_ops->releasepage(page, gfp_mask);
-       return try_to_free_buffers(page, gfp_mask);
+       return try_to_free_buffers(page);
  }
  
-/*
- * We don't have to release all buffers here, but
- * we have to be sure that no dirty buffer is left
- * and no IO is going on (no buffer is locked), because
- * we have truncated the file and are going to free the
- * blocks on-disk..
+/**
+ * block_flushpage - invalidate part or all of a buffer-backed page
+ *
+ * @page: the page which is affected
+ * @offset: the index of the truncation point
+ *
+ * block_flushpage() should be called block_invalidatepage().  It is
+ * called when all or part of the page has become invalidated by a truncate
+ * operation.
+ *
+ * block_flushpage() does not have to release all buffers, but it must
+ * ensure that no dirty buffer is left outside @offset and that no I/O
+ * is underway against any of the blocks which are outside the truncation
+ * point, because the caller is about to free (and possibly reuse) those
+ * blocks on-disk.
   */
-int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
+int block_flushpage(struct page *page, unsigned long offset)
  {
         struct buffer_head *head, *bh, *next;
         unsigned int curr_off = 0;
@@ -1358,7 +1227,7 @@ int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
                 next = bh->b_this_page;
  
                 /*
-                * is this block fully flushed?
+                * is this block fully invalidated?
                  */
                 if (offset <= curr_off)
                         discard_buffer(bh);
@@ -1367,16 +1236,11 @@ int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
         } while (bh != head);
  
         /*
-        * subtle. We release buffer-heads only if this is
-        * the 'final' flushpage. We have invalidated the get_block
-        * cached value unconditionally, so real IO is not
-        * possible anymore.
-        *
-        * If the free doesn't work out, the buffers can be
-        * left around - they just turn into anonymous buffers
-        * instead.
+        * We release buffers only if the entire page is being invalidated.
+        * The get_block cached value has been unconditionally invalidated,
+        * so real IO is not possible anymore.
          */
-       if (!offset) {
+       if (offset == 0) {
                 if (!try_to_release_page(page, 0))
                         return 0;
         }
@@ -1384,24 +1248,36 @@ int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
         return 1;
  }
  
-void create_empty_buffers(struct page *page, unsigned long blocksize)
+/*
+ * We attach and possibly dirty the buffers atomically wrt
+ * __set_page_dirty_buffers() via i_bufferlist_lock.  try_to_free_buffers
+ * is already excluded via the page lock.
+ */
+void create_empty_buffers(struct page *page,
+                       unsigned long blocksize, unsigned long b_state)
  {
         struct buffer_head *bh, *head, *tail;
  
-       /* FIXME: create_buffers should fail if there's no enough memory */
         head = create_buffers(page, blocksize, 1);
-       if (page_has_buffers(page))
-               BUG();
-
         bh = head;
         do {
                 bh->b_end_io = NULL;
+               bh->b_state |= b_state;
                 tail = bh;
                 bh = bh->b_this_page;
         } while (bh);
         tail->b_this_page = head;
-       set_page_buffers(page, head);
-       page_cache_get(page);
+
+       spin_lock(&page->mapping->host->i_bufferlist_lock);
+       if (PageDirty(page)) {
+               bh = head;
+               do {
+                       set_bit(BH_Dirty, &bh->b_state);
+                       bh = bh->b_this_page;
+               } while (bh != head);
+       }
+       __set_page_buffers(page, head);
+       spin_unlock(&page->mapping->host->i_bufferlist_lock);
  }
  EXPORT_SYMBOL(create_empty_buffers);
  
@@ -1416,13 +1292,16 @@ EXPORT_SYMBOL(create_empty_buffers);
   * don't want to mark the alias unmapped, for example - it would confuse
   * anyone who might pick it with bread() afterwards...
   */
-
-static void unmap_underlying_metadata(struct buffer_head * bh)
+static void unmap_underlying_metadata(struct buffer_head *bh)
  {
         struct buffer_head *old_bh;
  
-       old_bh = __get_hash_table(bh->b_bdev, bh->b_blocknr, bh->b_size);
+       old_bh = __get_hash_table(bh->b_bdev, bh->b_blocknr, 0);
         if (old_bh) {
+#if 0  /* This happens.  Later. */
+               if (buffer_dirty(old_bh))
+                       buffer_error();
+#endif
                 mark_buffer_clean(old_bh);
                 wait_on_buffer(old_bh);
                 clear_bit(BH_Req, &old_bh->b_state);
@@ -1444,69 +1323,125 @@ static void unmap_underlying_metadata(struct buffer_head * bh)
   */
  
  /*
- * block_write_full_page() is SMP threaded - the kernel lock is not held.
+ * While block_write_full_page is writing back the dirty buffers under
+ * the page lock, whoever dirtied the buffers may decide to clean them
+ * again at any time.  We handle that by only looking at the buffer
+ * state inside lock_buffer().
   */
-static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
+static int __block_write_full_page(struct inode *inode,
+                       struct page *page, get_block_t *get_block)
  {
-       int err, i;
+       int err;
         unsigned long block;
+       unsigned long last_block;
         struct buffer_head *bh, *head;
-       int need_unlock;
+       int nr_underway = 0;
  
         if (!PageLocked(page))
                 BUG();
  
-       if (!page_has_buffers(page))
-               create_empty_buffers(page, 1 << inode->i_blkbits);
-       head = page_buffers(page);
+       last_block = (inode->i_size - 1) >> inode->i_blkbits;
  
-       block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+       if (!page_has_buffers(page)) {
+               if (S_ISBLK(inode->i_mode))
+                       buffer_error();
+               if (!Page_Uptodate(page))
+                       buffer_error();
+               create_empty_buffers(page, 1 << inode->i_blkbits,
+                                       (1 << BH_Dirty)|(1 << BH_Uptodate));
+       }
  
+       block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+       head = page_buffers(page);
         bh = head;
-       i = 0;
  
-       /* Stage 1: make sure we have all the buffers mapped! */
+       /*
+        * Get all the dirty buffers mapped to disk addresses and
+        * handle any aliases from the underlying blockdev's mapping.
+        */
         do {
-               /*
-                * If the buffer isn't up-to-date, we can't be sure
-                * that the buffer has been initialized with the proper
-                * block number information etc..
-                *
-                * Leave it to the low-level FS to make all those
-                * decisions (block #0 may actually be a valid block)
-                */
-               if (!buffer_mapped(bh)) {
+               if (block > last_block) {
+                       if (buffer_dirty(bh))
+                               buffer_error();
+                       if (buffer_mapped(bh))
+                               buffer_error();
+                       /*
+                        * NOTE: this buffer can only be marked uptodate
+                        * because we know that block_write_full_page has
+                        * zeroed it out.  That seems unnecessary and may go
+                        * away.
+                        */
+                       mark_buffer_uptodate(bh, 1);
+               } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+                       if (buffer_new(bh))
+                               buffer_error();
                         err = get_block(inode, block, bh, 1);
                         if (err)
-                               goto out;
-                       if (buffer_new(bh))
+                               goto recover;
+                       if (buffer_new(bh)) {
+                               /* blockdev mappings never come here */
+                               clear_bit(BH_New, &bh->b_state);
                                 unmap_underlying_metadata(bh);
+                       }
                 }
                 bh = bh->b_this_page;
                 block++;
         } while (bh != head);
  
-       /* Stage 2: lock the buffers, mark them clean */
         do {
-               lock_buffer(bh);
-               set_buffer_async_io(bh);
-               set_bit(BH_Uptodate, &bh->b_state);
-               clear_bit(BH_Dirty, &bh->b_state);
+               get_bh(bh);
+               if (buffer_dirty(bh)) {
+                       lock_buffer(bh);
+                       if (buffer_dirty(bh)) {
+                               if (!buffer_mapped(bh))
+                                       buffer_error();
+                               if (!buffer_uptodate(bh))
+                                       buffer_error();
+                               set_buffer_async_io(bh);
+                       } else {
+                               unlock_buffer(bh);
+                       }
+               }
                 bh = bh->b_this_page;
         } while (bh != head);
  
-       /* Stage 3: submit the IO */
+       /*
+        * The page may come unlocked any time after the *first* submit_bh()
+        * call.  Be careful with its buffers.
+        */
         do {
                 struct buffer_head *next = bh->b_this_page;
-               submit_bh(WRITE, bh);
+               if (buffer_async(bh)) {
+                       mark_buffer_clean(bh);
+                       submit_bh(WRITE, bh);
+                       nr_underway++;
+               }
+               put_bh(bh);
                 bh = next;
         } while (bh != head);
  
-       /* Done - end_buffer_io_async will unlock */
-       SetPageUptodate(page);
-       return 0;
-
-out:
+       err = 0;
+done:
+       if (nr_underway == 0) {
+               /*
+                * The page was marked dirty, but the buffers were
+                * clean.  Someone wrote them back by hand with
+                * ll_rw_block/submit_bh.  A rare case.
+                */
+               int uptodate = 1;
+               do {
+                       if (!buffer_uptodate(bh)) {
+                               uptodate = 0;
+                               break;
+                       }
+                       bh = bh->b_this_page;
+               } while (bh != head);
+               if (uptodate)
+                       SetPageUptodate(page);
+               UnlockPage(page);
+       }
+       return err;
+recover:
         /*
          * ENOSPC, or some other error.  We may already have added some
          * blocks to the file, so we need to write these out to avoid
@@ -1514,28 +1449,31 @@ out:
          */
         ClearPageUptodate(page);
         bh = head;
-       need_unlock = 1;
         /* Recovery: lock and submit the mapped buffers */
         do {
                 if (buffer_mapped(bh)) {
                         lock_buffer(bh);
                         set_buffer_async_io(bh);
-                       need_unlock = 0;
+               } else {
+                       /*
+                        * The buffer may have been set dirty during
+                        * attachment to a dirty page.
+                        */
+                       mark_buffer_clean(bh);
                 }
                 bh = bh->b_this_page;
         } while (bh != head);
         do {
                 struct buffer_head *next = bh->b_this_page;
                 if (buffer_mapped(bh)) {
-                       set_bit(BH_Uptodate, &bh->b_state);
-                       clear_bit(BH_Dirty, &bh->b_state);
+                       mark_buffer_uptodate(bh, 1);
+                       mark_buffer_clean(bh);
                         submit_bh(WRITE, bh);
+                       nr_underway++;
                 }
                 bh = next;
         } while (bh != head);
-       if (need_unlock)
-               UnlockPage(page);
-       return err;
+       goto done;
  }
  
  static int __block_prepare_write(struct inode *inode, struct page *page,
@@ -1548,9 +1486,14 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
         char *kaddr = kmap(page);
  
+       BUG_ON(!PageLocked(page));
+       BUG_ON(from > PAGE_CACHE_SIZE);
+       BUG_ON(to > PAGE_CACHE_SIZE);
+       BUG_ON(from > to);
+
         blocksize = 1 << inode->i_blkbits;
         if (!page_has_buffers(page))
-               create_empty_buffers(page, blocksize);
+               create_empty_buffers(page, blocksize, 0);
         head = page_buffers(page);
  
         bbits = inode->i_blkbits;
@@ -1558,35 +1501,38 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
  
         for(bh = head, block_start = 0; bh != head || !block_start;
             block++, block_start=block_end, bh = bh->b_this_page) {
-               if (!bh)
-                       BUG();
-               block_end = block_start+blocksize;
-               if (block_end <= from)
+               block_end = block_start + blocksize;
+               if (block_end <= from || block_start >= to) {
+                       if (Page_Uptodate(page))
+                               mark_buffer_uptodate(bh, 1);
                         continue;
-               if (block_start >= to)
-                       break;
+               }
                 clear_bit(BH_New, &bh->b_state);
                 if (!buffer_mapped(bh)) {
                         err = get_block(inode, block, bh, 1);
                         if (err)
                                 goto out;
                         if (buffer_new(bh)) {
+                               clear_bit(BH_New, &bh->b_state);
                                 unmap_underlying_metadata(bh);
                                 if (Page_Uptodate(page)) {
-                                       set_bit(BH_Uptodate, &bh->b_state);
+                                       if (!buffer_mapped(bh))
+                                               buffer_error();
+                                       mark_buffer_uptodate(bh, 1);
                                         continue;
                                 }
                                 if (block_end > to)
                                         memset(kaddr+to, 0, block_end-to);
                                 if (block_start < from)
-                                       memset(kaddr+block_start, 0, from-block_start);
+                                       memset(kaddr+block_start,
+                                               0, from-block_start);
                                 if (block_end > to || block_start < from)
                                         flush_dcache_page(page);
                                 continue;
                         }
                 }
                 if (Page_Uptodate(page)) {
-                       set_bit(BH_Uptodate, &bh->b_state);
+                       mark_buffer_uptodate(bh, 1);
                         continue; 
                 }
                 if (!buffer_uptodate(bh) &&
@@ -1619,10 +1565,11 @@ out:
                 if (block_start >= to)
                         break;
                 if (buffer_new(bh)) {
+                       clear_bit(BH_New, &bh->b_state);
                         if (buffer_uptodate(bh))
-                               printk(KERN_ERR "%s: zeroing uptodate buffer!\n", __FUNCTION__);
+                               buffer_error();
                         memset(kaddr+block_start, 0, bh->b_size);
-                       set_bit(BH_Uptodate, &bh->b_state);
+                       mark_buffer_uptodate(bh, 1);
                         mark_buffer_dirty(bh);
                 }
  next_bh:
@@ -1636,7 +1583,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
                 unsigned from, unsigned to)
  {
         unsigned block_start, block_end;
-       int partial = 0, need_balance_dirty = 0;
+       int partial = 0;
         unsigned blocksize;
         struct buffer_head *bh, *head;
  
@@ -1650,21 +1597,18 @@ static int __block_commit_write(struct inode *inode, struct page *page,
                         if (!buffer_uptodate(bh))
                                 partial = 1;
                 } else {
-                       set_bit(BH_Uptodate, &bh->b_state);
+                       mark_buffer_uptodate(bh, 1);
                         if (!atomic_set_buffer_dirty(bh)) {
                                 __mark_dirty(bh);
                                 buffer_insert_inode_data_queue(bh, inode);
-                               need_balance_dirty = 1;
                         }
                 }
         }
  
-       if (need_balance_dirty)
-               balance_dirty();
         /*
-        * is this a partial write that happened to make all buffers
+        * If this is a partial write which happened to make all buffers
          * uptodate then we can optimize away a bogus readpage() for
-        * the next read(). Here we 'discover' wether the page went
+        * the next read(). Here we 'discover' whether the page went
          * uptodate as a result of this (potentially partial) write.
          */
         if (!partial)
@@ -1689,9 +1633,11 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
  
         if (!PageLocked(page))
                 PAGE_BUG(page);
+       if (Page_Uptodate(page))
+               buffer_error();
         blocksize = 1 << inode->i_blkbits;
         if (!page_has_buffers(page))
-               create_empty_buffers(page, blocksize);
+               create_empty_buffers(page, blocksize, 0);
         head = page_buffers(page);
  
         blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
@@ -1714,16 +1660,17 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
                                 memset(kmap(page) + i*blocksize, 0, blocksize);
                                 flush_dcache_page(page);
                                 kunmap(page);
-                               set_bit(BH_Uptodate, &bh->b_state);
+                               mark_buffer_uptodate(bh, 1);
                                 continue;
                         }
-                       /* get_block() might have updated the buffer synchronously */
+                       /*
+                        * get_block() might have updated the buffer
+                        * synchronously
+                        */
                         if (buffer_uptodate(bh))
                                 continue;
                 }
-
-               arr[nr] = bh;
-               nr++;
+               arr[nr++] = bh;
         } while (i++, iblock++, (bh = bh->b_this_page) != head);
  
         if (!nr) {
@@ -1741,13 +1688,25 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
         for (i = 0; i < nr; i++) {
                 struct buffer_head * bh = arr[i];
                 lock_buffer(bh);
+               if (buffer_uptodate(bh))
+                       buffer_error();
+               if (buffer_dirty(bh))
+                       buffer_error();
                 set_buffer_async_io(bh);
         }
-
-       /* Stage 3: start the IO */
-       for (i = 0; i < nr; i++)
-               submit_bh(READ, arr[i]);
-
+
+       /*
+        * Stage 3: start the IO.  Check for uptodateness
+        * inside the buffer lock in case another process reading
+        * the underlying blockdev brought it uptodate (the sct fix).
+        */
+       for (i = 0; i < nr; i++) {
+               struct buffer_head * bh = arr[i];
+               if (buffer_uptodate(bh))
+                       end_buffer_io_async(bh, 1);
+               else
+                       submit_bh(READ, bh);
+       }
         return 0;
  }
  
@@ -1802,7 +1761,8 @@ out:
   * We may have to extend the file.
   */
  
-int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
+int cont_prepare_write(struct page *page, unsigned offset,
+               unsigned to, get_block_t *get_block, unsigned long *bytes)
  {
         struct address_space *mapping = page->mapping;
         struct inode *inode = mapping->host;
@@ -1836,7 +1796,8 @@ int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_bloc
                 kaddr = page_address(new_page);
                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
                 flush_dcache_page(new_page);
-               __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
+               __block_commit_write(inode, new_page,
+                               zerofrom, PAGE_CACHE_SIZE);
                 kunmap(new_page);
                 UnlockPage(new_page);
                 page_cache_release(new_page);
@@ -1917,7 +1878,8 @@ int generic_commit_write(struct file *file, struct page *page,
         return 0;
  }
  
-int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
+int block_truncate_page(struct address_space *mapping,
+                       loff_t from, get_block_t *get_block)
  {
         unsigned long index = from >> PAGE_CACHE_SHIFT;
         unsigned offset = from & (PAGE_CACHE_SIZE-1);
@@ -1943,7 +1905,7 @@ int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t
                 goto out;
  
         if (!page_has_buffers(page))
-               create_empty_buffers(page, blocksize);
+               create_empty_buffers(page, blocksize, 0);
  
         /* Find the buffer that contains "offset" */
         bh = page_buffers(page);
@@ -1967,7 +1929,7 @@ int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t
  
         /* Ok, it's mapped. Make sure it's up-to-date */
         if (Page_Uptodate(page))
-               set_bit(BH_Uptodate, &bh->b_state);
+               mark_buffer_uptodate(bh, 1);
  
         if (!buffer_uptodate(bh)) {
                 err = -EIO;
@@ -1982,7 +1944,7 @@ int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t
         flush_dcache_page(page);
         kunmap(page);
  
-       __mark_buffer_dirty(bh);
+       mark_buffer_dirty(bh);
         err = 0;
  
  unlock:
@@ -1992,38 +1954,33 @@ out:
         return err;
  }
  
+/*
+ * The generic ->writepage function for buffer-backed address_spaces
+ */
  int block_write_full_page(struct page *page, get_block_t *get_block)
  {
-       struct inode *inode = page->mapping->host;
-       unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+       struct inode * const inode = page->mapping->host;
+       const unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
         unsigned offset;
-       int err;
+       char *kaddr;
  
-       /* easy case */
+       /* Is the page fully inside i_size? */
         if (page->index < end_index)
                 return __block_write_full_page(inode, page, get_block);
  
-       /* things got complicated... */
+       /* Is the page fully outside i_size? (truncate in progress) */
         offset = inode->i_size & (PAGE_CACHE_SIZE-1);
-       /* OK, are we completely out? */
         if (page->index >= end_index+1 || !offset) {
                 UnlockPage(page);
                 return -EIO;
         }
  
-       /* Sigh... will have to work, then... */
-       err = __block_prepare_write(inode, page, 0, offset, get_block);
-       if (!err) {
-               memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
-               flush_dcache_page(page);
-               __block_commit_write(inode,page,0,offset);
-done:
-               kunmap(page);
-               UnlockPage(page);
-               return err;
-       }
-       ClearPageUptodate(page);
-       goto done;
+       /* The page straddles i_size */
+       kaddr = kmap(page);
+       memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+       flush_dcache_page(page);
+       kunmap(page);
+       return __block_write_full_page(inode, page, get_block);
  }
  
  /*
@@ -2033,18 +1990,19 @@ done:
   */
  int writeout_one_page(struct page *page)
  {
-       struct buffer_head *bh, *head = page_buffers(page);
-
-       if (!PageLocked(page))
-               BUG();
+       struct buffer_head * const head = page_buffers(page);
+       struct buffer_head *arr[MAX_BUF_PER_PAGE];
+       struct buffer_head *bh;
+       int nr = 0;
+       BUG_ON(!PageLocked(page));
         bh = head;
         do {
-               if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
-                       continue;
-
-               bh->b_flushtime = jiffies;
-               ll_rw_block(WRITE, 1, &bh);     
+               if (!buffer_locked(bh) && buffer_dirty(bh) &&
+                               buffer_mapped(bh) && buffer_uptodate(bh))
+                       arr[nr++] = bh;
         } while ((bh = bh->b_this_page) != head);
+       if (nr)
+               ll_rw_block(WRITE, nr, arr);    
         return 0;
  }
  EXPORT_SYMBOL(writeout_one_page);
@@ -2079,7 +2037,9 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
         return tmp.b_blocknr;
  }
  
-int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
+int generic_direct_IO(int rw, struct inode *inode,
+                       struct kiobuf *iobuf, unsigned long blocknr,
+                       int blocksize, get_block_t *get_block)
  {
         int i, nr_blocks, retval;
         sector_t *blocks = iobuf->blocks;
@@ -2114,7 +2074,8 @@ int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsig
         }
  
         /* This does not understand multi-device filesystems currently */
-       retval = brw_kiovec(rw, 1, &iobuf, inode->i_sb->s_bdev, blocks, blocksize);
+       retval = brw_kiovec(rw, 1, &iobuf,
+                       inode->i_sb->s_bdev, blocks, blocksize);
  
   out:
         return retval;
@@ -2185,21 +2146,18 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
   * before I/O is complete. You then have to check page->locked
   * and page->uptodate.
   *
- * brw_page() is SMP-safe, although it's being called with the
- * kernel lock held - but the code is ready.
- *
   * FIXME: we need a swapper_inode->get_block function to remove
   *        some of the bmap kludges and interface ugliness here.
   */
-int brw_page(int rw, struct page *page, struct block_device *bdev, sector_t b[], int size)
+int brw_page(int rw, struct page *page,
+               struct block_device *bdev, sector_t b[], int size)
  {
         struct buffer_head *head, *bh;
  
-       if (!PageLocked(page))
-               panic("brw_page: page not locked for I/O");
+       BUG_ON(!PageLocked(page));
  
         if (!page_has_buffers(page))
-               create_empty_buffers(page, size);
+               create_empty_buffers(page, size, 0);
         head = bh = page_buffers(page);
  
         /* Stage 1: lock all the buffers */
@@ -2208,6 +2166,8 @@ int brw_page(int rw, struct page *page, struct block_device *bdev, sector_t b[],
                 bh->b_blocknr = *(b++);
                 bh->b_bdev = bdev;
                 set_bit(BH_Mapped, &bh->b_state);
+               if (rw == WRITE)        /* To support submit_bh debug tests */
+                       mark_buffer_uptodate(bh, 1);
                 set_buffer_async_io(bh);
                 bh = bh->b_this_page;
         } while (bh != head);
@@ -2257,238 +2217,106 @@ fail:
         return err;
  }
  
-static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
+/*
+ * Sanity checks for try_to_free_buffers.
+ */
+static void check_ttfb_buffer(struct page *page, struct buffer_head *bh)
  {
-       struct buffer_head *bh, *tail;
-
-       bh = head;
-       do {
-               tail = bh;
-               bh = bh->b_this_page;
-       } while (bh);
-       tail->b_this_page = head;
-       set_page_buffers(page, head);
-       page_cache_get(page);
+       if (!buffer_uptodate(bh)) {
+               if (Page_Uptodate(page) && page->mapping
+                       && buffer_mapped(bh)    /* discard_buffer */
+                       && S_ISBLK(page->mapping->host->i_mode))
+               {
+                       buffer_error();
+               }
+       }
  }
  
  /*
- * Create the page-cache page that contains the requested block
+ * try_to_free_buffers() checks if all the buffers on this particular page
+ * are unused, and releases them if so.
+ *
+ * Exclusion against try_to_free_buffers may be obtained by either
+ * locking the page or by holding its inode's i_bufferlist_lock.
+ *
+ * If the page is dirty but all the buffers are clean then we need to
+ * be sure to mark the page clean as well.  This is because the page
+ * may be against a block device, and a later reattachment of buffers
+ * to a dirty page will set *all* buffers dirty.  Which would corrupt
+ * filesystem data on the same device.
+ *
+ * The same applies to regular filesystem pages: if all the buffers are
+ * clean then we set the page clean and proceed.  To do that, we require
+ * total exclusion from __set_page_dirty_buffers().  That is obtained with
+ * i_bufferlist_lock.
+ *
+ * Nobody should be calling try_to_free_buffers against a page which is
+ * eligible for set_page_dirty() treatment anyway - the page is clearly
+ * not freeable.  So we could just test page_count(page) here and complain
+ * then scram if it's wrong.
+ *
+ * If any buffer is not uptodate then the entire page is set not uptodate,
+ * as the partial uptodateness information is about to be lost.
+ *
+ * try_to_free_buffers() is non-blocking.
   */
-static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
+static inline int buffer_busy(struct buffer_head *bh)
  {
-       struct page * page;
-       struct buffer_head *bh;
-
-       page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
-       if (!page)
-               return NULL;
-
-       if (!PageLocked(page))
-               BUG();
-
-       if (page_has_buffers(page)) {
-               bh = page_buffers(page);
-               if (bh->b_size == size)
-                       return page;
-               if (!try_to_free_buffers(page, GFP_NOFS))
-                       goto failed;
-       }
-
-       bh = create_buffers(page, size, 0);
-       if (!bh)
-               goto failed;
-       link_dev_buffers(page, bh);
-       return page;
-
-failed:
-       UnlockPage(page);
-       page_cache_release(page);
-       return NULL;
+       return atomic_read(&bh->b_count) |
+               (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
  }
  
-static void hash_page_buffers(struct page *page, struct block_device *bdev, int block, int size)
+static /*inline*/ int drop_buffers(struct page *page)
  {
         struct buffer_head *head = page_buffers(page);
-       struct buffer_head *bh = head;
-       unsigned int uptodate;
-
-       uptodate = 1 << BH_Mapped;
-       if (Page_Uptodate(page))
-               uptodate |= 1 << BH_Uptodate;
+       struct buffer_head *bh;
+       int was_uptodate = 1;
  
-       write_lock(&hash_table_lock);
+       bh = head;
         do {
-               if (!(bh->b_state & (1 << BH_Mapped))) {
-                       init_buffer(bh, NULL, NULL);
-                       bh->b_bdev = bdev;
-                       bh->b_blocknr = block;
-                       bh->b_state = uptodate;
-               }
-
-               /* Insert the buffer into the hash lists if necessary */
-               if (!bh->b_pprev)
-                       __insert_into_hash_list(bh);
-
-               block++;
+               check_ttfb_buffer(page, bh);
+               if (buffer_busy(bh))
+                       goto failed;
+               if (!buffer_uptodate(bh))
+                       was_uptodate = 0;
                 bh = bh->b_this_page;
         } while (bh != head);
-       write_unlock(&hash_table_lock);
-}
-
-/*
- * Try to increase the number of buffers available: the size argument
- * is used to determine what kind of buffers we want.
- */
-static int grow_buffers(struct block_device *bdev, unsigned long block, int size)
-{
-       struct page * page;
-       unsigned long index;
-       int sizebits;
  
-       /* Size must be multiple of hard sectorsize */
-       if (size & (bdev_hardsect_size(bdev)-1))
-               BUG();
-       /* Size must be within 512 bytes and PAGE_SIZE */
-       if (size < 512 || size > PAGE_SIZE)
-               BUG();
+       if (!was_uptodate && Page_Uptodate(page))
+               buffer_error();
  
-       sizebits = -1;
+       spin_lock(&unused_list_lock);
         do {
-               sizebits++;
-       } while ((size << sizebits) < PAGE_SIZE);
-
-       index = block >> sizebits;
-       block = index << sizebits;
-
-       /* Create a page with the proper size buffers.. */
-       page = grow_dev_page(bdev, index, size);
-
-       if (!page)
-               return 0;
-
-       /* Hash in the buffers on the hash list */
-       hash_page_buffers(page, bdev, block, size);
-       UnlockPage(page);
-       page_cache_release(page);
+               struct buffer_head *next = bh->b_this_page;
  
-       /* We hashed up this page, so increment buffermem */
-       atomic_inc(&buffermem_pages);
+               __remove_inode_queue(bh);
+               __put_unused_buffer_head(bh);
+               bh = next;
+       } while (bh != head);
+       spin_unlock(&unused_list_lock);
+       __clear_page_buffers(page);
         return 1;
+failed:
+       return 0;
  }
  
-static int sync_page_buffers(struct buffer_head *head, unsigned int gfp_mask)
-{
-       struct buffer_head * bh = head;
-       int tryagain = 0;
-
-       do {
-               if (!buffer_dirty(bh) && !buffer_locked(bh))
-                       continue;
-
-               /* Don't start IO first time around.. */
-               if (!test_and_set_bit(BH_Wait_IO, &bh->b_state))
-                       continue;
-
-               /* Second time through we start actively writing out.. */
-               if (test_and_set_bit(BH_Lock, &bh->b_state)) {
-                       if (!test_bit(BH_launder, &bh->b_state))
-                               continue;
-                       wait_on_buffer(bh);
-                       tryagain = 1;
-                       continue;
-               }
-
-               if (!atomic_set_buffer_clean(bh)) {
-                       unlock_buffer(bh);
-                       continue;
-               }
-
-               __mark_buffer_clean(bh);
-               get_bh(bh);
-               set_bit(BH_launder, &bh->b_state);
-               bh->b_end_io = end_buffer_io_sync;
-               submit_bh(WRITE, bh);
-               tryagain = 0;
-       } while ((bh = bh->b_this_page) != head);
-
-       return tryagain;
-}
-
-/*
- * Can the buffer be thrown out?
- */
-#define BUFFER_BUSY_BITS       ((1<<BH_Dirty) | (1<<BH_Lock))
-#define buffer_busy(bh)                (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
-
-/*
- * try_to_free_buffers() checks if all the buffers on this particular page
- * are unused, and free's the page if so.
- *
- * Wake up bdflush() if this fails - if we're running low on memory due
- * to dirty buffers, we need to flush them out as quickly as possible.
- *
- * NOTE: There are quite a number of ways that threads of control can
- *       obtain a reference to a buffer head within a page.  So we must
- *      lock out all of these paths to cleanly toss the page.
- */
-int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
+int try_to_free_buffers(struct page *page)
  {
-       struct buffer_head * tmp, * bh = page_buffers(page);
+       struct inode *inode;
+       int ret = 0;
  
         BUG_ON(!PageLocked(page));
-       BUG_ON(!bh);
-
-cleaned_buffers_try_again:
-       spin_lock(&lru_list_lock);
-       write_lock(&hash_table_lock);
-       tmp = bh;
-       do {
-               if (buffer_busy(tmp))
-                       goto busy_buffer_page;
-               tmp = tmp->b_this_page;
-       } while (tmp != bh);
-
-       spin_lock(&unused_list_lock);
-       tmp = bh;
-
-       /* if this buffer was hashed, this page counts as buffermem */
-       if (bh->b_pprev)
-               atomic_dec(&buffermem_pages);
-       do {
-               struct buffer_head * p = tmp;
-               tmp = tmp->b_this_page;
-               remove_inode_queue(p);
-               __remove_from_queues(p);
-               __put_unused_buffer_head(p);
-       } while (tmp != bh);
-       spin_unlock(&unused_list_lock);
-
-       /* Wake up anyone waiting for buffer heads */
-       wake_up(&buffer_wait);
  
-       /* And free the page */
-       clear_page_buffers(page);
-       page_cache_release(page);
-       write_unlock(&hash_table_lock);
-       spin_unlock(&lru_list_lock);
-       return 1;
+       if (page->mapping == NULL)      /* swapped-in anon page */
+               return drop_buffers(page);
  
-busy_buffer_page:
-       /* Uhhuh, start writeback so that we don't end up with all dirty pages */
-       write_unlock(&hash_table_lock);
-       spin_unlock(&lru_list_lock);
-       if (gfp_mask & __GFP_IO) {
-               if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
-                       if (sync_page_buffers(bh, gfp_mask)) {
-                               /* no IO or waiting next time */
-                               gfp_mask = 0;
-                               goto cleaned_buffers_try_again;
-                       }
-               }
-       }
-       if (balance_dirty_state() >= 0)
-               wakeup_bdflush();
-       return 0;
+       inode = page->mapping->host;
+       spin_lock(&inode->i_bufferlist_lock);
+       ret = drop_buffers(page);
+       if (ret)
+               ClearPageDirty(page);
+       spin_unlock(&inode->i_bufferlist_lock);
+       return ret;
  }
  EXPORT_SYMBOL(try_to_free_buffers);
  
@@ -2496,129 +2324,8 @@ EXPORT_SYMBOL(try_to_free_buffers);
  
  void show_buffers(void)
  {
-#ifdef CONFIG_SMP
-       struct buffer_head * bh;
-       int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
-       int nlist;
-       static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
-#endif
-
         printk("Buffer memory:   %6dkB\n",
                         atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
-
-#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
-       if (!spin_trylock(&lru_list_lock))
-               return;
-       for(nlist = 0; nlist < NR_LIST; nlist++) {
-               found = locked = dirty = used = lastused = 0;
-               bh = lru_list[nlist];
-               if(!bh) continue;
-
-               do {
-                       found++;
-                       if (buffer_locked(bh))
-                               locked++;
-                       if (buffer_dirty(bh))
-                               dirty++;
-                       if (atomic_read(&bh->b_count))
-                               used++, lastused = found;
-                       bh = bh->b_next_free;
-               } while (bh != lru_list[nlist]);
-               {
-                       int tmp = nr_buffers_type[nlist];
-                       if (found != tmp)
-                               printk("%9s: BUG -> found %d, reported %d\n",
-                                      buf_types[nlist], found, tmp);
-               }
-               printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
-                      "%d locked, %d dirty\n",
-                      buf_types[nlist], found, size_buffers_type[nlist]>>10,
-                      used, lastused, locked, dirty);
-       }
-       spin_unlock(&lru_list_lock);
-#endif
-}
-
-/* ===================== Init ======================= */
-
-/*
- * allocate the hash table and init the free list
- * Use gfp() for the hash table to decrease TLB misses, use
- * SLAB cache for buffer heads.
- */
-void __init buffer_init(unsigned long mempages)
-{
-       int order, i;
-       unsigned int nr_hash;
-
-       /* The buffer cache hash table is less important these days,
-        * trim it a bit.
-        */
-       mempages >>= 14;
-
-       mempages *= sizeof(struct buffer_head *);
-
-       for (order = 0; (1 << order) < mempages; order++)
-               ;
-
-       /* try to allocate something until we get it or we're asking
-          for something that is really too small */
-
-       do {
-               unsigned long tmp;
-
-               nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
-               bh_hash_mask = (nr_hash - 1);
-
-               tmp = nr_hash;
-               bh_hash_shift = 0;
-               while((tmp >>= 1UL) != 0UL)
-                       bh_hash_shift++;
-
-               hash_table = (struct buffer_head **)
-                   __get_free_pages(GFP_ATOMIC, order);
-       } while (hash_table == NULL && --order > 0);
-       printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
-              nr_hash, order, (PAGE_SIZE << order));
-
-       if (!hash_table)
-               panic("Failed to allocate buffer hash table\n");
-
-       /* Setup hash chains. */
-       for(i = 0; i < nr_hash; i++)
-               hash_table[i] = NULL;
-
-       /* Setup lru lists. */
-       for(i = 0; i < NR_LIST; i++)
-               lru_list[i] = NULL;
-
-}
-
-/* 
- * Here we attempt to write back old buffers.  We also try to flush inodes 
- * and supers as well, since this function is essentially "update", and 
- * otherwise there would be no way of ensuring that these quantities ever 
- * get written back.  Ideally, we would have a timestamp on the inodes
- * and superblocks so that we could write back only the old ones as well
- */
-
-static void sync_old_buffers(unsigned long dummy)
-{
-       sync_unlocked_inodes();
-       sync_supers();
-
-       for (;;) {
-               struct buffer_head *bh;
-
-               spin_lock(&lru_list_lock);
-               bh = lru_list[BUF_DIRTY];
-               if (!bh || time_before(jiffies, bh->b_flushtime))
-                       break;
-               if (write_some_buffers(NULL))
-                       continue;
-               return;
-       }
-       spin_unlock(&lru_list_lock);
  }
  
  int block_sync_page(struct page *page)
@@ -2627,98 +2334,20 @@ int block_sync_page(struct page *page)
         return 0;
  }
  
-/* This is the interface to bdflush.  As we get more sophisticated, we can
- * pass tuning parameters to this "process", to adjust how it behaves. 
- * We would want to verify each parameter, however, to make sure that it 
- * is reasonable. */
-
+/*
+ * There are no bdflush tunables left.  But distributions are
+ * still running obsolete flush daemons, so we terminate them here.
+ */
  asmlinkage long sys_bdflush(int func, long data)
  {
         if (!capable(CAP_SYS_ADMIN))
                 return -EPERM;
-
-       if (func == 1) {
-               /* do_exit directly and let kupdate to do its work alone. */
+       if (func == 1)
                 do_exit(0);
-#if 0 /* left here as it's the only example of lazy-mm-stuff used from
-        a syscall that doesn't care about the current mm context. */
-               int error;
-               struct mm_struct *user_mm;
-
-               /*
-                * bdflush will spend all of it's time in kernel-space,
-                * without touching user-space, so we can switch it into
-                * 'lazy TLB mode' to reduce the cost of context-switches
-                * to and from bdflush.
-                */
-               user_mm = start_lazy_tlb();
-               error = sync_old_buffers();
-               end_lazy_tlb(user_mm);
-               return error;
-#endif
-       }
-
-       /* Basically func 1 means read param 1, 2 means write param 1, etc */
-       if (func >= 2) {
-               int i = (func-2) >> 1;
-               if (i >= 0 && i < N_PARAM) {
-                       if ((func & 1) == 0)
-                               return put_user(bdf_prm.data[i], (int*)data);
-
-                       if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
-                               bdf_prm.data[i] = data;
-                               return 0;
-                       }
-               }
-               return -EINVAL;
-       }
-
-       /* Having func 0 used to launch the actual bdflush and then never
-        * return (unless explicitly killed). We return zero here to 
-        * remain semi-compatible with present update(8) programs.
-        */
         return 0;
  }
  
-static void bdflush(unsigned long pexclude)
-{
-       while (balance_dirty_state() >= 0) {
-               spin_lock(&lru_list_lock);
-               if (write_some_buffers(NULL) == 0)
-                       break;
-       }
-       clear_bit(0, (unsigned long *)pexclude);
-}
-
  void wakeup_bdflush(void)
  {
-       static unsigned long exclude;
-
-       if (!test_and_set_bit(0, &exclude)) {
-               if (pdflush_operation(bdflush, (unsigned long)&exclude))
-                       clear_bit(0, &exclude);
-       }
-}
-
-/*
- * kupdate
- */
-static struct timer_list kupdate_timer;
-static void kupdate_handler(unsigned long dummy)
-{
-       pdflush_operation(sync_old_buffers, 0);
-       mod_timer(&kupdate_timer, jiffies + bdf_prm.b_un.interval);
-}
-
-static int __init kupdate_init(void)
-{
-       init_timer(&kupdate_timer);
-       kupdate_timer.expires = jiffies + bdf_prm.b_un.interval;
-       kupdate_timer.data = 0;
-       kupdate_timer.function = kupdate_handler;
-       add_timer(&kupdate_timer);
-       return 0;
+       pdflush_flush(0);
  }
-
-module_init(kupdate_init)
-
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c

index 87220470cc5746fb3ae2b782098788920518b34d..3240e12e4011e68b88c9c0aac8e9bf254801ee41 100644 (file)
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1290,8 +1290,13 @@ static int ext3_writepage(struct page *page)
  
         /* bget() all the buffers */
         if (order_data) {
-               if (!page_has_buffers(page))
-                       create_empty_buffers(page, inode->i_sb->s_blocksize);
+               if (!page_has_buffers(page)) {
+                       if (!Page_Uptodate(page))
+                               buffer_error();
+                       create_empty_buffers(page,
+                               inode->i_sb->s_blocksize,
+                               (1 << BH_Dirty)|(1 << BH_Uptodate));
+               }
                 page_bufs = page_buffers(page);
                 walk_page_buffers(handle, page_bufs, 0,
                                 PAGE_CACHE_SIZE, NULL, bget_one);
@@ -1394,7 +1399,7 @@ static int ext3_block_truncate_page(handle_t *handle,
                 goto out;
  
         if (!page_has_buffers(page))
-               create_empty_buffers(page, blocksize);
+               create_empty_buffers(page, blocksize, 0);
  
         /* Find the buffer that contains "offset" */
         bh = page_buffers(page);
@@ -1448,7 +1453,7 @@ static int ext3_block_truncate_page(handle_t *handle,
         } else {
                 if (ext3_should_order_data(inode))
                         err = ext3_journal_dirty_data(handle, bh, 0);
-               __mark_buffer_dirty(bh);
+               mark_buffer_dirty(bh);
         }
  
  unlock:
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c

new file mode 100644 (file)

index 0000000..9e46777
--- /dev/null
+++ b/fs/fs-writeback.c
@@ -0,0 +1,519 @@
+/*
+ * fs/fs-writeback.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * Contains all the functions related to writing back and waiting
+ * upon dirty inodes against superblocks, and writing back dirty
+ * pages against inodes.  ie: data writeback.  Writeout of the
+ * inode itself is not handled here.
+ *
+ * 10Apr2002   akpm@zip.com.au
+ *             Split out of fs/inode.c
+ *             Additions for address_space-based writeback
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/writeback.h>
+
+/**
+ *     __mark_inode_dirty -    internal function
+ *     @inode: inode to mark
+ *     @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ *     Mark an inode as dirty. Callers should use mark_inode_dirty or
+ *     mark_inode_dirty_sync.
+ *
+ * Put the inode on the super block's dirty list.
+ *
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
+ *
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
+ *
+ * This function *must* be atomic for the I_DIRTY_PAGES case -
+ * set_page_dirty() is called under spinlock in several places.
+ */
+void __mark_inode_dirty(struct inode *inode, int flags)
+{
+       struct super_block *sb = inode->i_sb;
+
+       if (!sb)
+               return;         /* swapper_space */
+
+       /*
+        * Don't do this for I_DIRTY_PAGES - that doesn't actually
+        * dirty the inode itself
+        */
+       if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+               if (sb->s_op && sb->s_op->dirty_inode)
+                       sb->s_op->dirty_inode(inode);
+       }
+
+       /* avoid the locking if we can */
+       if ((inode->i_state & flags) == flags)
+               return;
+
+       spin_lock(&inode_lock);
+       if ((inode->i_state & flags) != flags) {
+               inode->i_state |= flags;
+
+               /*
+                * If the inode is locked, just update its dirty state. 
+                * The unlocker will place the inode on the appropriate
+                * superblock list, based upon its state.
+                */
+               if (inode->i_state & I_LOCK)
+                       goto same_list;
+
+               /*
+                * Only add valid (hashed) inode to the superblock's
+                * dirty list.  Add blockdev inodes as well.
+                */
+               if (list_empty(&inode->i_hash) && !S_ISBLK(inode->i_mode))
+                       goto same_list;
+               if (inode->i_mapping->dirtied_when == 0)
+                       inode->i_mapping->dirtied_when = jiffies;
+               list_del(&inode->i_list);
+               list_add(&inode->i_list, &sb->s_dirty);
+       }
+same_list:
+       spin_unlock(&inode_lock);
+}
+
+static inline void write_inode(struct inode *inode, int sync)
+{
+       if (inode->i_sb->s_op && inode->i_sb->s_op->write_inode &&
+                       !is_bad_inode(inode))
+               inode->i_sb->s_op->write_inode(inode, sync);
+}
+
+/*
+ * Write a single inode's dirty pages and inode data out to disk.
+ * If `wait' is set, wait on the writeout.
+ * If `nr_to_write' is not NULL, subtract the number of written pages
+ * from *nr_to_write.
+ *
+ * Normally it is not legal for a single process to lock more than one
+ * page at a time, due to ab/ba deadlock problems.  But writeback_mapping()
+ * does want to lock a large number of pages, without immediately submitting
+ * I/O against them (starting I/O is a "deferred unlock_page").
+ *
+ * However it *is* legal to lock multiple pages, if this is only ever performed
+ * by a single process.  We provide that exclusion via locking in the
+ * filesystem's ->writeback_mapping a_op. This ensures that only a single
+ * process is locking multiple pages against this inode.  And as I/O is
+ * submitted against all those locked pages, there is no deadlock.
+ *
+ * Called under inode_lock.
+ */
+static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
+{
+       unsigned dirty;
+       struct address_space *mapping = inode->i_mapping;
+
+       list_del(&inode->i_list);
+       list_add(&inode->i_list, &inode->i_sb->s_locked_inodes);
+
+       if (inode->i_state & I_LOCK)
+               BUG();
+
+       /* Set I_LOCK, reset I_DIRTY */
+       dirty = inode->i_state & I_DIRTY;
+       inode->i_state |= I_LOCK;
+       inode->i_state &= ~I_DIRTY;
+       spin_unlock(&inode_lock);
+
+       if (mapping->a_ops->writeback_mapping)
+               mapping->a_ops->writeback_mapping(mapping, nr_to_write);
+       else
+               filemap_fdatasync(mapping);
+
+       /* Don't write the inode if only I_DIRTY_PAGES was set */
+       if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
+               write_inode(inode, wait);
+
+       if (wait)
+               filemap_fdatawait(mapping);
+
+       /*
+        * For non-blocking writeout (wait == 0), we still
+        * count the inode as being clean.
+        */
+       spin_lock(&inode_lock);
+
+       /*
+        * Did we write back all the pages?
+        */
+       if (nr_to_write && *nr_to_write == 0) {
+               /*
+                * Maybe not
+                */
+               if (!list_empty(&mapping->dirty_pages)) /* No lock needed */
+                       inode->i_state |= I_DIRTY_PAGES;
+       }
+
+       inode->i_state &= ~I_LOCK;
+       if (!(inode->i_state & I_FREEING)) {
+               struct list_head *to;
+               if (inode->i_state & I_DIRTY)
+                       to = &inode->i_sb->s_dirty;
+               else if (atomic_read(&inode->i_count))
+                       to = &inode_in_use;
+               else
+                       to = &inode_unused;
+               list_del(&inode->i_list);
+               list_add(&inode->i_list, to);
+       }
+       wake_up(&inode->i_wait);
+}
+
+/*
+ * Write out an inode's dirty pages.  Called under inode_lock.
+ */
+static void
+__writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
+{
+       while (inode->i_state & I_LOCK) {
+               __iget(inode);
+               spin_unlock(&inode_lock);
+               __wait_on_inode(inode);
+               iput(inode);
+               spin_lock(&inode_lock);
+       }
+       __sync_single_inode(inode, sync, nr_to_write);
+}
+
+void writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
+{
+       spin_lock(&inode_lock);
+       __writeback_single_inode(inode, sync, nr_to_write);
+       spin_unlock(&inode_lock);
+}
+
+/*
+ * Write out a list of inodes' pages, and the inode itself.
+ *
+ * If `sync_mode' is WB_SYNC_ALL, wait on writeout of every mapping.  If it
+ * is WB_SYNC_LAST, the intent is to wait on only the last mapping we write.
+ *
+ * If older_than_this is non-NULL, then only write out mappings which
+ * had their first dirtying at a time earlier than *older_than_this.
+ *
+ * Called under inode_lock.
+ *
+ * FIXME: putting all the inodes on a local list could introduce a
+ * race with umount.  Bump the superblock refcount?
+ */
+static void __sync_list(struct list_head *head, int sync_mode,
+               int *nr_to_write, unsigned long *older_than_this)
+{
+       struct list_head * tmp;
+       LIST_HEAD(hold);        /* Unready inodes go here */
+
+       while ((tmp = head->next) != head) {
+               struct inode *inode = list_entry(tmp, struct inode, i_list);
+               struct address_space *mapping = inode->i_mapping;
+               int really_sync;
+
+               if (older_than_this && *older_than_this) {
+                       if (time_after(mapping->dirtied_when,
+                                               *older_than_this)) {
+                               list_del(&inode->i_list);
+                               list_add(&inode->i_list, &hold);
+                               continue;
+                       }
+               }
+               really_sync = (sync_mode == WB_SYNC_ALL);
+               if ((sync_mode == WB_SYNC_LAST) && (head->prev == head))
+                       really_sync = 1;
+               __writeback_single_inode(inode, really_sync, nr_to_write);
+               if (nr_to_write && *nr_to_write == 0)
+                       break;
+       }
+       /*
+        * Put the not-ready inodes back
+        */
+       if (!list_empty(&hold))
+               list_splice(&hold, head);
+}
+
+/*
+ * Start writeback of dirty pagecache data against all unlocked inodes.
+ *
+ * Note:
+ * We don't need to grab a reference to the superblock here. If it has a
+ * non-empty ->s_dirty then it hasn't been killed yet, and kill_super() won't
+ * proceed past sync_inodes_sb() until both ->s_dirty and ->s_locked_inodes
+ * are empty. Since __sync_single_inode() regains inode_lock before it finally
+ * moves the inode off the superblock lists, we are OK.
+ *
+ * If `older_than_this' is non-NULL then only flush inodes whose mappings were
+ * first dirtied no later than *older_than_this - unless *older_than_this is
+ * zero, in which case we flush everything, like the old (dumb) wakeup_bdflush.
+ */
+void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
+                               unsigned long *older_than_this)
+{
+       struct super_block * sb;
+       static unsigned short writeback_gen;
+
+       spin_lock(&inode_lock);
+       spin_lock(&sb_lock);
+
+       /*
+        * We could get into livelock here if someone is dirtying
+        * inodes fast enough.  writeback_gen is used to avoid that.
+        */
+       writeback_gen++;
+
+       sb = sb_entry(super_blocks.prev);
+       for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
+               if (sb->s_writeback_gen == writeback_gen)
+                       continue;
+               sb->s_writeback_gen = writeback_gen;
+
+               if (current->flags & PF_FLUSHER) {
+                       if (sb->s_flags & MS_FLUSHING) {
+                               /*
+                                * There's no point in two pdflush threads
+                                * flushing the same device.  But for other
+                                * callers, we want to perform the flush
+                                * because the fdatasync is how we implement
+                                * writer throttling.
+                                */
+                               continue;
+                       }
+                       sb->s_flags |= MS_FLUSHING;
+               }
+
+               if (!list_empty(&sb->s_dirty)) {
+                       spin_unlock(&sb_lock);
+                       __sync_list(&sb->s_dirty, sync_mode,
+                                       nr_to_write, older_than_this);
+                       spin_lock(&sb_lock);
+               }
+               if (current->flags & PF_FLUSHER)
+                       sb->s_flags &= ~MS_FLUSHING;
+               if (nr_to_write && *nr_to_write == 0)
+                       break;
+       }
+       spin_unlock(&sb_lock);
+       spin_unlock(&inode_lock);
+}
+
+/*
+ * Called under inode_lock
+ */
+static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes)
+{
+       struct list_head *tmp = head;
+       struct inode *inode;
+
+       while (nr_inodes && (tmp = tmp->prev) != head) {
+               inode = list_entry(tmp, struct inode, i_list);
+
+               if (!atomic_read(&inode->i_count)) {
+                       __sync_single_inode(inode, 0, NULL);
+                       nr_inodes--;
+
+                       /* 
+                        * __sync_single_inode moved the inode to another list,
+                        * so we have to start looking from the list head.
+                        */
+                       tmp = head;
+               }
+       }
+
+       return nr_inodes;
+}
+
+static void __wait_on_locked(struct list_head *head)
+{
+       struct list_head * tmp;
+       while ((tmp = head->prev) != head) {
+               struct inode *inode = list_entry(tmp, struct inode, i_list);
+               __iget(inode);
+               spin_unlock(&inode_lock);
+               __wait_on_inode(inode);
+               iput(inode);
+               spin_lock(&inode_lock);
+       }
+}
+
+/*
+ * writeback and wait upon the filesystem's dirty inodes.
+ * We do it in two passes - one to write, and one to wait.
+ */
+void sync_inodes_sb(struct super_block *sb)
+{
+       spin_lock(&inode_lock);
+       while (!list_empty(&sb->s_dirty)||!list_empty(&sb->s_locked_inodes)) {
+               __sync_list(&sb->s_dirty, WB_SYNC_NONE, NULL, NULL);
+               __sync_list(&sb->s_dirty, WB_SYNC_ALL, NULL, NULL);
+               __wait_on_locked(&sb->s_locked_inodes);
+       }
+       spin_unlock(&inode_lock);
+}
+
+/*
+ * writeback the dirty inodes for this filesystem
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+       spin_lock(&inode_lock);
+       while (!list_empty(&sb->s_dirty))
+               __sync_list(&sb->s_dirty, WB_SYNC_NONE, NULL, NULL);
+       spin_unlock(&inode_lock);
+}
+
+/*
+ * Find a superblock with inodes that need to be synced
+ */
+
+static struct super_block *get_super_to_sync(void)
+{
+       struct list_head *p;
+restart:
+       spin_lock(&inode_lock);
+       spin_lock(&sb_lock);
+       list_for_each(p, &super_blocks) {
+               struct super_block *s = list_entry(p,struct super_block,s_list);
+               if (list_empty(&s->s_dirty) && list_empty(&s->s_locked_inodes))
+                       continue;
+               s->s_count++;
+               spin_unlock(&sb_lock);
+               spin_unlock(&inode_lock);
+               down_read(&s->s_umount);
+               if (!s->s_root) {
+                       drop_super(s);
+                       goto restart;
+               }
+               return s;
+       }
+       spin_unlock(&sb_lock);
+       spin_unlock(&inode_lock);
+       return NULL;
+}
+
+/**
+ *     sync_inodes -   sync the dirty inodes of every superblock
+ *
+ *     Takes no arguments.  For each superblock which has dirty or
+ *     locked inodes, sync_inodes goes through the super block's dirty
+ *     list, writes them out, waits on the writeout and puts the inodes
+ *     back on the normal list.
+ */
+
+void sync_inodes(void)
+{
+       struct super_block * s;
+       /*
+        * Walk the super_blocks list for superblocks which need syncing.
+        */
+       while ((s = get_super_to_sync()) != NULL) {
+               sync_inodes_sb(s);
+               drop_super(s);
+       }
+}
+
+void try_to_writeback_unused_inodes(unsigned long pexclusive)
+{
+       struct super_block * sb;
+       int nr_inodes = inodes_stat.nr_unused;
+
+       spin_lock(&inode_lock);
+       spin_lock(&sb_lock);
+       sb = sb_entry(super_blocks.next);
+       for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
+               if (list_empty(&sb->s_dirty))
+                       continue;
+               spin_unlock(&sb_lock);
+               nr_inodes = __try_to_writeback_unused_list(&sb->s_dirty, nr_inodes);
+               spin_lock(&sb_lock);
+       }
+       spin_unlock(&sb_lock);
+       spin_unlock(&inode_lock);
+       clear_bit(0, (unsigned long *)pexclusive);
+}
+
+/**
+ *     write_inode_now -       write an inode to disk
+ *     @inode: inode to write to disk
+ *     @sync: whether the write should be synchronous or not
+ *
+ *     This function commits an inode to disk immediately if it is
+ *     dirty. This is primarily needed by knfsd.
+ */
+ 
+void write_inode_now(struct inode *inode, int sync)
+{
+       spin_lock(&inode_lock);
+       __writeback_single_inode(inode, sync, NULL);
+       spin_unlock(&inode_lock);
+       if (sync)
+               wait_on_inode(inode);
+}
+
+/**
+ * generic_osync_inode - flush all dirty data for a given inode to disk
+ * @inode: inode to write
+ * @what: mask of OSYNC_DATA, OSYNC_METADATA and OSYNC_INODE: what to flush
+ *
+ * This can be called by file_write functions for files which have the
+ * O_SYNC flag set, to flush dirty writes to disk.  
+ */
+
+int generic_osync_inode(struct inode *inode, int what)
+{
+       int err = 0, err2 = 0, need_write_inode_now = 0;
+       
+       /* 
+        * WARNING
+        *
+        * Currently, the filesystem write path does not pass the
+        * filp down to the low-level write functions.  Therefore it
+        * is impossible for (say) __block_commit_write to know if
+        * the operation is O_SYNC or not.
+        *
+        * Ideally, O_SYNC writes would have the filesystem call
+        * ll_rw_block as it went to kick-start the writes, and we
+        * could call osync_inode_buffers() here to wait only for
+        * those IOs which have already been submitted to the device
+        * driver layer.  As it stands, if we did this we'd not write
+        * anything to disk since our writes have not been queued by
+        * this point: they are still on the dirty LRU.
+        * 
+        * So, currently we will call fsync_inode_buffers() instead,
+        * to flush _all_ dirty buffers for this inode to disk on 
+        * every O_SYNC write, not just the synchronous I/Os.  --sct
+        */
+
+       if (what & OSYNC_DATA)
+               writeback_single_inode(inode, 0, NULL);
+       if (what & (OSYNC_METADATA|OSYNC_DATA))
+               err = fsync_inode_buffers(inode);
+       if (what & OSYNC_DATA) {
+               err2 = filemap_fdatasync(inode->i_mapping);
+               if (!err)
+                       err = err2;
+       }
+
+       spin_lock(&inode_lock);
+       if ((inode->i_state & I_DIRTY) &&
+           ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
+               need_write_inode_now = 1;
+       spin_unlock(&inode_lock);
+
+       if (need_write_inode_now)
+               write_inode_now(inode, 1);
+       else
+               wait_on_inode(inode);
+
+       return err;
+}
diff --git a/fs/inode.c b/fs/inode.c

index 91d7a9da223f601468cc011e580d6040fe2e1715..3cc5dd6abbc3a0144b0da11473cdc48b92e7be07 100644 (file)
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -6,17 +6,12 @@
  
  #include <linux/config.h>
  #include <linux/fs.h>
-#include <linux/string.h>
  #include <linux/mm.h>
  #include <linux/dcache.h>
  #include <linux/init.h>
  #include <linux/quotaops.h>
  #include <linux/slab.h>
-#include <linux/cache.h>
-#include <linux/swap.h>
-#include <linux/swapctl.h>
-#include <linux/prefetch.h>
-#include <linux/locks.h>
+#include <linux/writeback.h>
  
  /*
   * New inode.c implementation.
@@ -55,8 +50,8 @@ static unsigned int i_hash_shift;
   * allowing for low-overhead inode sync() operations.
   */
  
-static LIST_HEAD(inode_in_use);
-static LIST_HEAD(inode_unused);
+LIST_HEAD(inode_in_use);
+LIST_HEAD(inode_unused);
  static struct list_head *inode_hashtable;
  static LIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
  
@@ -66,7 +61,7 @@ static LIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
   * NOTE! You also have to own the lock if you change
   * the i_state of an inode while it is in use..
   */
-static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
  
  /*
   * Statistics gathering..
@@ -108,6 +103,7 @@ static struct inode *alloc_inode(struct super_block *sb)
                 inode->i_data.a_ops = &empty_aops;
                 inode->i_data.host = inode;
                 inode->i_data.gfp_mask = GFP_HIGHUSER;
+               inode->i_data.dirtied_when = 0;
                 inode->i_mapping = &inode->i_data;
                 inode->i_data.ra_pages = &default_ra_pages;
                 if (sb->s_bdev)
@@ -141,6 +137,7 @@ void inode_init_once(struct inode *inode)
         INIT_LIST_HEAD(&inode->i_data.clean_pages);
         INIT_LIST_HEAD(&inode->i_data.dirty_pages);
         INIT_LIST_HEAD(&inode->i_data.locked_pages);
+       INIT_LIST_HEAD(&inode->i_data.io_pages);
         INIT_LIST_HEAD(&inode->i_dentry);
         INIT_LIST_HEAD(&inode->i_dirty_buffers);
         INIT_LIST_HEAD(&inode->i_dirty_data_buffers);
@@ -149,6 +146,7 @@ void inode_init_once(struct inode *inode)
         INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
         rwlock_init(&inode->i_data.page_lock);
         spin_lock_init(&inode->i_data.i_shared_lock);
+       spin_lock_init(&inode->i_bufferlist_lock);
         INIT_LIST_HEAD(&inode->i_data.i_mmap);
         INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
  }
@@ -162,57 +160,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
                 inode_init_once(inode);
  }
  
-/*
- * Put the inode on the super block's dirty list.
- *
- * CAREFUL! We mark it dirty unconditionally, but
- * move it onto the dirty list only if it is hashed.
- * If it was not hashed, it will never be added to
- * the dirty list even if it is later hashed, as it
- * will have been marked dirty already.
- *
- * In short, make sure you hash any inodes _before_
- * you start marking them dirty..
- */
- 
-/**
- *     __mark_inode_dirty -    internal function
- *     @inode: inode to mark
- *     @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- *     Mark an inode as dirty. Callers should use mark_inode_dirty or
- *     mark_inode_dirty_sync.
- */
- 
-void __mark_inode_dirty(struct inode *inode, int flags)
-{
-       struct super_block * sb = inode->i_sb;
-
-       if (!sb)
-               return;
-
-       /* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */
-       if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
-               if (sb->s_op && sb->s_op->dirty_inode)
-                       sb->s_op->dirty_inode(inode);
-       }
-
-       /* avoid the locking if we can */
-       if ((inode->i_state & flags) == flags)
-               return;
-
-       spin_lock(&inode_lock);
-       if ((inode->i_state & flags) != flags) {
-               inode->i_state |= flags;
-               /* Only add valid (ie hashed) inodes to the dirty list */
-               if (!(inode->i_state & I_LOCK) && !list_empty(&inode->i_hash)) {
-                       list_del(&inode->i_list);
-                       list_add(&inode->i_list, &sb->s_dirty);
-               }
-       }
-       spin_unlock(&inode_lock);
-}
-
-static void __wait_on_inode(struct inode * inode)
+void __wait_on_inode(struct inode * inode)
  {
         DECLARE_WAITQUEUE(wait, current);
  
@@ -227,20 +175,10 @@ repeat:
         current->state = TASK_RUNNING;
  }
  
-static inline void wait_on_inode(struct inode *inode)
-{
-       if (inode->i_state & I_LOCK)
-               __wait_on_inode(inode);
-}
-
-
-static inline void write_inode(struct inode *inode, int sync)
-{
-       if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
-               inode->i_sb->s_op->write_inode(inode, sync);
-}
-
-static inline void __iget(struct inode * inode)
+/*
+ * inode_lock must be held
+ */
+void __iget(struct inode * inode)
  {
         if (atomic_read(&inode->i_count)) {
                 atomic_inc(&inode->i_count);
@@ -254,287 +192,6 @@ static inline void __iget(struct inode * inode)
         inodes_stat.nr_unused--;
  }
  
-static inline void __sync_one(struct inode *inode, int sync)
-{
-       unsigned dirty;
-
-       list_del(&inode->i_list);
-       list_add(&inode->i_list, &inode->i_sb->s_locked_inodes);
-
-       if (inode->i_state & I_LOCK)
-               BUG();
-
-       /* Set I_LOCK, reset I_DIRTY */
-       dirty = inode->i_state & I_DIRTY;
-       inode->i_state |= I_LOCK;
-       inode->i_state &= ~I_DIRTY;
-       spin_unlock(&inode_lock);
-
-       filemap_fdatasync(inode->i_mapping);
-
-       /* Don't write the inode if only I_DIRTY_PAGES was set */
-       if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
-               write_inode(inode, sync);
-
-       filemap_fdatawait(inode->i_mapping);
-
-       spin_lock(&inode_lock);
-       inode->i_state &= ~I_LOCK;
-       if (!(inode->i_state & I_FREEING)) {
-               struct list_head *to;
-               if (inode->i_state & I_DIRTY)
-                       to = &inode->i_sb->s_dirty;
-               else if (atomic_read(&inode->i_count))
-                       to = &inode_in_use;
-               else
-                       to = &inode_unused;
-               list_del(&inode->i_list);
-               list_add(&inode->i_list, to);
-       }
-       wake_up(&inode->i_wait);
-}
-
-static inline void sync_one(struct inode *inode, int sync)
-{
-       while (inode->i_state & I_LOCK) {
-               __iget(inode);
-               spin_unlock(&inode_lock);
-               __wait_on_inode(inode);
-               iput(inode);
-               spin_lock(&inode_lock);
-       }
-
-       __sync_one(inode, sync);
-}
-
-static inline void sync_list(struct list_head *head)
-{
-       struct list_head * tmp;
-
-       while ((tmp = head->prev) != head) 
-               __sync_one(list_entry(tmp, struct inode, i_list), 0);
-}
-
-static inline void wait_on_locked(struct list_head *head)
-{
-       struct list_head * tmp;
-       while ((tmp = head->prev) != head) {
-               struct inode *inode = list_entry(tmp, struct inode, i_list);
-               __iget(inode);
-               spin_unlock(&inode_lock);
-               __wait_on_inode(inode);
-               iput(inode);
-               spin_lock(&inode_lock);
-       }
-}
-
-static inline int try_to_sync_unused_list(struct list_head *head, int nr_inodes)
-{
-       struct list_head *tmp = head;
-       struct inode *inode;
-
-       while (nr_inodes && (tmp = tmp->prev) != head) {
-               inode = list_entry(tmp, struct inode, i_list);
-
-               if (!atomic_read(&inode->i_count)) {
-                       __sync_one(inode, 0);
-                       nr_inodes--;
-
-                       /* 
-                        * __sync_one moved the inode to another list,
-                        * so we have to start looking from the list head.
-                        */
-                       tmp = head;
-               }
-       }
-
-       return nr_inodes;
-}
-
-void sync_inodes_sb(struct super_block *sb)
-{
-       spin_lock(&inode_lock);
-       while (!list_empty(&sb->s_dirty)||!list_empty(&sb->s_locked_inodes)) {
-               sync_list(&sb->s_dirty);
-               wait_on_locked(&sb->s_locked_inodes);
-       }
-       spin_unlock(&inode_lock);
-}
-
-/*
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until both ->s_dirty and ->s_locked_inodes are
- * empty. Since __sync_one() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
- */
-
-void sync_unlocked_inodes(void)
-{
-       struct super_block * sb;
-       spin_lock(&inode_lock);
-       spin_lock(&sb_lock);
-       sb = sb_entry(super_blocks.next);
-       for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
-               if (!list_empty(&sb->s_dirty)) {
-                       spin_unlock(&sb_lock);
-                       sync_list(&sb->s_dirty);
-                       spin_lock(&sb_lock);
-               }
-       }
-       spin_unlock(&sb_lock);
-       spin_unlock(&inode_lock);
-}
-
-/*
- * Find a superblock with inodes that need to be synced
- */
-
-static struct super_block *get_super_to_sync(void)
-{
-       struct list_head *p;
-restart:
-       spin_lock(&inode_lock);
-       spin_lock(&sb_lock);
-       list_for_each(p, &super_blocks) {
-               struct super_block *s = list_entry(p,struct super_block,s_list);
-               if (list_empty(&s->s_dirty) && list_empty(&s->s_locked_inodes))
-                       continue;
-               s->s_count++;
-               spin_unlock(&sb_lock);
-               spin_unlock(&inode_lock);
-               down_read(&s->s_umount);
-               if (!s->s_root) {
-                       drop_super(s);
-                       goto restart;
-               }
-               return s;
-       }
-       spin_unlock(&sb_lock);
-       spin_unlock(&inode_lock);
-       return NULL;
-}
-
-/**
- *     sync_inodes
- *     @dev: device to sync the inodes from.
- *
- *     sync_inodes goes through the super block's dirty list, 
- *     writes them out, and puts them back on the normal list.
- */
-
-void sync_inodes(void)
-{
-       struct super_block * s;
-       /*
-        * Search the super_blocks array for the device(s) to sync.
-        */
-       while ((s = get_super_to_sync()) != NULL) {
-               sync_inodes_sb(s);
-               drop_super(s);
-       }
-}
-
-static void try_to_sync_unused_inodes(unsigned long pexclusive)
-{
-       struct super_block * sb;
-       int nr_inodes = inodes_stat.nr_unused;
-
-       spin_lock(&inode_lock);
-       spin_lock(&sb_lock);
-       sb = sb_entry(super_blocks.next);
-       for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
-               if (list_empty(&sb->s_dirty))
-                       continue;
-               spin_unlock(&sb_lock);
-               nr_inodes = try_to_sync_unused_list(&sb->s_dirty, nr_inodes);
-               spin_lock(&sb_lock);
-       }
-       spin_unlock(&sb_lock);
-       spin_unlock(&inode_lock);
-       clear_bit(0, (unsigned long *)pexclusive);
-}
-
-/**
- *     write_inode_now -       write an inode to disk
- *     @inode: inode to write to disk
- *     @sync: whether the write should be synchronous or not
- *
- *     This function commits an inode to disk immediately if it is
- *     dirty. This is primarily needed by knfsd.
- */
- 
-void write_inode_now(struct inode *inode, int sync)
-{
-       struct super_block * sb = inode->i_sb;
-
-       if (sb) {
-               spin_lock(&inode_lock);
-               sync_one(inode, sync);
-               spin_unlock(&inode_lock);
-               if (sync)
-                       wait_on_inode(inode);
-       }
-       else
-               printk(KERN_ERR "write_inode_now: no super block\n");
-}
-
-/**
- * generic_osync_inode - flush all dirty data for a given inode to disk
- * @inode: inode to write
- * @datasync: if set, don't bother flushing timestamps
- *
- * This can be called by file_write functions for files which have the
- * O_SYNC flag set, to flush dirty writes to disk.  
- */
-
-int generic_osync_inode(struct inode *inode, int what)
-{
-       int err = 0, err2 = 0, need_write_inode_now = 0;
-       
-       /* 
-        * WARNING
-        *
-        * Currently, the filesystem write path does not pass the
-        * filp down to the low-level write functions.  Therefore it
-        * is impossible for (say) __block_commit_write to know if
-        * the operation is O_SYNC or not.
-        *
-        * Ideally, O_SYNC writes would have the filesystem call
-        * ll_rw_block as it went to kick-start the writes, and we
-        * could call osync_inode_buffers() here to wait only for
-        * those IOs which have already been submitted to the device
-        * driver layer.  As it stands, if we did this we'd not write
-        * anything to disk since our writes have not been queued by
-        * this point: they are still on the dirty LRU.
-        * 
-        * So, currently we will call fsync_inode_buffers() instead,
-        * to flush _all_ dirty buffers for this inode to disk on 
-        * every O_SYNC write, not just the synchronous I/Os.  --sct
-        */
-
-       if (what & OSYNC_METADATA)
-               err = fsync_inode_buffers(inode);
-       if (what & OSYNC_DATA)
-               err2 = fsync_inode_data_buffers(inode);
-       if (!err)
-               err = err2;
-
-       spin_lock(&inode_lock);
-       if ((inode->i_state & I_DIRTY) &&
-           ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
-               need_write_inode_now = 1;
-       spin_unlock(&inode_lock);
-
-       if (need_write_inode_now)
-               write_inode_now(inode, 1);
-       else
-               wait_on_inode(inode);
-
-       return err;
-}
-
  /**
   * clear_inode - clear an inode
   * @inode: inode to clear
@@ -752,7 +409,7 @@ void prune_icache(int goal)
                 static unsigned long exclusive;
  
                 if (!test_and_set_bit(0, &exclusive)) {
-                       if (pdflush_operation(try_to_sync_unused_inodes,
+                       if (pdflush_operation(try_to_writeback_unused_inodes,
                                                 (unsigned long)&exclusive))
                                 clear_bit(0, &exclusive);
                 }
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c

index 895bd40537c0ca18e42c4e6df7486a5de3bd965e..d809febc5abc168f4e81b5f1395be26ba4af7895 100644 (file)
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -62,8 +62,6 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
                 __journal_remove_checkpoint(jh);
                 __journal_remove_journal_head(bh);
                 BUFFER_TRACE(bh, "release");
-               /* BUF_LOCKED -> BUF_CLEAN (fwiw) */
-               refile_buffer(bh);
                 __brelse(bh);
                 ret = 1;
         }
@@ -149,8 +147,7 @@ static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
                 /*
                  * We used to test for (jh->b_list != BUF_CLEAN) here.
                  * But unmap_underlying_metadata() can place buffer onto
-                * BUF_CLEAN. Since refile_buffer() no longer takes buffers
-                * off checkpoint lists, we cope with it here
+                * BUF_CLEAN.
                  */
                 /*
                  * AKPM: I think the buffer_jdirty test is redundant - it
@@ -161,7 +158,6 @@ static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
                         BUFFER_TRACE(bh, "remove from checkpoint");
                         __journal_remove_checkpoint(jh);
                         __journal_remove_journal_head(bh);
-                       refile_buffer(bh);
                         __brelse(bh);
                         ret = 1;
                 }
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c

index 94900b3406f630bd48a7d0a292e9d729065e4d6a..1cfe68a31bfb867564d238c23044998c2e030b1e 100644 (file)
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -210,7 +210,6 @@ write_out_data_locked:
                                 __journal_unfile_buffer(jh);
                                 jh->b_transaction = NULL;
                                 __journal_remove_journal_head(bh);
-                               refile_buffer(bh);
                                 __brelse(bh);
                         }
                 }
@@ -291,10 +290,6 @@ sync_datalist_empty:
                         jh->b_transaction = NULL;
                         __journal_remove_journal_head(bh);
                         BUFFER_TRACE(bh, "finished async writeout: refile");
-                       /* It can sometimes be on BUF_LOCKED due to migration
-                        * from syncdata to asyncdata */
-                       if (bh->b_list != BUF_CLEAN)
-                               refile_buffer(bh);
                         __brelse(bh);
                 }
         }
@@ -454,6 +449,7 @@ start_journal_io:
                                 struct buffer_head *bh = wbuf[i];
                                 set_bit(BH_Lock, &bh->b_state);
                                 clear_bit(BH_Dirty, &bh->b_state);
+                               mark_buffer_uptodate(bh, 1);
                                 bh->b_end_io = journal_end_buffer_io_sync;
                                 submit_bh(WRITE, bh);
                         }
@@ -592,6 +588,7 @@ start_journal_io:
         JBUFFER_TRACE(descriptor, "write commit block");
         {
                 struct buffer_head *bh = jh2bh(descriptor);
+               mark_buffer_uptodate(bh, 1);
                 ll_rw_block(WRITE, 1, &bh);
                 wait_on_buffer(bh);
                 __brelse(bh);           /* One for getblk() */
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c

index a71b6611d7eb12a6c7b7d501dfe73afdcc4b89cd..da77a6734a3a23b452c0976c5f9ab46c958522e0 100644 (file)
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -328,7 +328,6 @@ restart:
                         __journal_unfile_buffer(jh);
                         jh->b_transaction = NULL;
                         __journal_remove_journal_head(bh);
-                       refile_buffer(bh);
                         __brelse(bh);
                         goto restart;
                 }
@@ -464,8 +463,6 @@ int journal_write_metadata_buffer(transaction_t *transaction,
                 }
         } while (!new_bh);
         /* keep subsequent assertions sane */
-       new_bh->b_prev_free = 0;
-       new_bh->b_next_free = 0;
         new_bh->b_state = 0;
         init_buffer(new_bh, NULL, NULL);
         atomic_set(&new_bh->b_count, 1);
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c

index 77efb355287a3eeb19e04a32f51643b9da2ba2f8..f232729de76e137ee85860d4b20c530114d5f4d1 100644 (file)
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -406,11 +406,12 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
          * buffer_head?  If so, we'd better make sure we clear the
          * revoked status on any hashed alias too, otherwise the revoke
          * state machine will get very upset later on. */
-       if (need_cancel && !bh->b_pprev) {
+       if (need_cancel) {
                 struct buffer_head *bh2;
                 bh2 = __get_hash_table(bh->b_bdev, bh->b_blocknr, bh->b_size);
                 if (bh2) {
-                       clear_bit(BH_Revoked, &bh2->b_state);
+                       if (bh2 != bh)
+                               clear_bit(BH_Revoked, &bh2->b_state);
                         __brelse(bh2);
                 }
         }
@@ -540,6 +541,7 @@ static void flush_descriptor(journal_t *journal,
         {
                 struct buffer_head *bh = jh2bh(descriptor);
                 BUFFER_TRACE(bh, "write");
+               mark_buffer_uptodate(bh, 1);
                 ll_rw_block (WRITE, 1, &bh);
         }
  }
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c

index a0244b4f91024f5bba0324aea7b83b95c9d50f09..fe1042b9974f327d4243fae02687538e998ddd87 100644 (file)
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -592,9 +592,6 @@ repeat:
                         JBUFFER_TRACE(jh, "file as BJ_Reserved");
                         __journal_file_buffer(jh, transaction, BJ_Reserved);
  
-                       /* And pull it off BUF_DIRTY, onto BUF_CLEAN */
-                       refile_buffer(jh2bh(jh));
-
                         /*
                          * The buffer is now hidden from bdflush.   It is
                          * metadata against the current transaction.
@@ -812,8 +809,6 @@ int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
                 jh->b_transaction = transaction;
                 JBUFFER_TRACE(jh, "file as BJ_Reserved");
                 __journal_file_buffer(jh, transaction, BJ_Reserved);
-               JBUFFER_TRACE(jh, "refile");
-               refile_buffer(jh2bh(jh));
         } else if (jh->b_transaction == journal->j_committing_transaction) {
                 JBUFFER_TRACE(jh, "set next transaction");
                 jh->b_next_transaction = transaction;
@@ -1099,7 +1094,6 @@ int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
         
         spin_lock(&journal_datalist_lock);
         set_bit(BH_JBDDirty, &bh->b_state);
-       set_buffer_flushtime(bh);
  
         J_ASSERT_JH(jh, jh->b_transaction != NULL);
         
@@ -1691,7 +1685,7 @@ int journal_try_to_free_buffers(journal_t *journal,
  out:
         ret = 0;
         if (call_ttfb)
-               ret = try_to_free_buffers(page, gfp_mask);
+               ret = try_to_free_buffers(page);
         return ret;
  }
  
@@ -1864,7 +1858,7 @@ zap_buffer:
         if (buffer_dirty(bh))
                 mark_buffer_clean(bh);
         J_ASSERT_BH(bh, !buffer_jdirty(bh));
-       clear_bit(BH_Uptodate, &bh->b_state);
+//     clear_bit(BH_Uptodate, &bh->b_state);
         clear_bit(BH_Mapped, &bh->b_state);
         clear_bit(BH_Req, &bh->b_state);
         clear_bit(BH_New, &bh->b_state);
@@ -1913,7 +1907,7 @@ int journal_flushpage(journal_t *journal,
         unlock_journal(journal);
  
         if (!offset) {
-               if (!may_free || !try_to_free_buffers(page, 0))
+               if (!may_free || !try_to_free_buffers(page))
                         return 0;
                 J_ASSERT(!page_has_buffers(page));
         }
@@ -2021,9 +2015,6 @@ void __journal_refile_buffer(struct journal_head *jh)
         if (jh->b_transaction != NULL) {
                 __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
                 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
-       } else {
-               /* Onto BUF_DIRTY for writeback */
-               refile_buffer(jh2bh(jh));
         }
  }
  
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c

index 8a22ba2e5cbb28ad0eb321492c85c4d8bb312e1e..339fbc0969e4f66b4af54e2b7bd221aa2bb8a558 100644 (file)
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -120,7 +120,7 @@ static int ntfs_file_read_block(struct page *page)
         blocksize = 1 << blocksize_bits;
  
         if (!page_has_buffers(page))
-               create_empty_buffers(page, blocksize);
+               create_empty_buffers(page, blocksize, 0);
         bh = head = page_buffers(page);
         if (!bh)
                 return -ENOMEM;
@@ -417,7 +417,7 @@ static int ntfs_mftbmp_readpage(ntfs_volume *vol, struct page *page)
         blocksize_bits = vol->sb->s_blocksize_bits;
  
         if (!page_has_buffers(page))
-               create_empty_buffers(page, blocksize);
+               create_empty_buffers(page, blocksize, 0);
         bh = head = page_buffers(page);
         if (!bh)
                 return -ENOMEM;
@@ -656,7 +656,7 @@ int ntfs_mst_readpage(struct file *dir, struct page *page)
         blocksize = 1 << blocksize_bits;
  
         if (!page_has_buffers(page))
-               create_empty_buffers(page, blocksize);
+               create_empty_buffers(page, blocksize, 0);
         bh = head = page_buffers(page);
         if (!bh)
                 return -ENOMEM;
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c

index 6302f1cfa2009ff0e2ea3753a8d5372360eb7052..87b85f5a73eeaf6e9d364af2648176f8b203be16 100644 (file)
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -29,13 +29,27 @@ struct tree_balance * cur_tb = NULL; /* detects whether more than one
                                          is interrupting do_balance */
  #endif
  
+/*
+ * AKPM: The __mark_buffer_dirty() call here will not
+ * put the buffer on the dirty buffer LRU because we've just
+ * set BH_Dirty.  That's a thinko in reiserfs.
+ *
+ * I'm reluctant to "fix" this bug because that would change
+ * behaviour.  Using mark_buffer_dirty() here would make the
+ * buffer eligible for VM and periodic writeback, which may
+ * violate ordering constraints.  I'll just leave the code
+ * as-is by removing the __mark_buffer_dirty call altogether.
+ *
+ * Chris says this code has "probably never been run" anyway.
+ * It is due to go away.
+ */
  
  inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, 
                                         struct buffer_head * bh, int flag)
  {
      if (reiserfs_dont_log(tb->tb_sb)) {
         if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
-           __mark_buffer_dirty(bh) ;
+//         __mark_buffer_dirty(bh) ;
             tb->need_balance_dirty = 1;
         }
      } else {
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c

index 485ec26ffb6b915358b6ae834fa9f2173e0aaa78..27d4c44648a54f30ecf8da2309341fca0f1a1e03 100644 (file)
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -107,7 +107,7 @@ inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key
  static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) {
      struct list_head *list = &(SB_JOURNAL(inode->i_sb)->j_dirty_buffers) ;
  
-    buffer_insert_list(bh, list) ;
+    buffer_insert_list(NULL, bh, list) ;
  }
  
  //
@@ -779,7 +779,13 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
             /* mark it dirty now to prevent commit_write from adding
             ** this buffer to the inode's dirty buffer list
             */
-           __mark_buffer_dirty(unbh) ;
+               /*
+                * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
+                * It's still atomic, but it sets the page dirty too,
+                * which makes it eligible for writeback at any time by the
+                * VM (which was also the case with __mark_buffer_dirty())
+                */
+           mark_buffer_dirty(unbh) ;
                   
             //inode->i_blocks += inode->i_sb->s_blocksize / 512;
             //mark_tail_converted (inode);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c

index d32d0df19725d20aadcd90e1590774b9f44b81cd..85658fed235602a2b80dc23334e5f7b5be5decf3 100644 (file)
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -123,10 +123,8 @@ static void init_journal_hash(struct super_block *p_s_sb) {
  ** more details.
  */
  static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
-  if (bh) {
-    clear_bit(BH_Dirty, &bh->b_state) ;
-    refile_buffer(bh) ;
-  }
+  if (bh)
+    mark_buffer_clean(bh);
    return 0 ;
  }
  
@@ -1079,7 +1077,6 @@ free_cnode:
         if (!buffer_uptodate(cn->bh)) {
           reiserfs_panic(s, "journal-949: buffer write failed\n") ;
         }
-       refile_buffer(cn->bh) ;
          brelse(cn->bh) ;
        }
        cn = cn->next ;
@@ -3125,7 +3122,7 @@ printk("journal-2020: do_journal_end: BAD desc->j_len is ZERO\n") ;
    SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ;
  
    /* write any buffers that must hit disk before this commit is done */
-  fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
+  fsync_buffers_list(NULL, &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
  
    /* honor the flush and async wishes from the caller */
    if (flush) {
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c

index 8afd8ebeae47950d63613681b9c0e2fd098d2daa..cb092a14d118b7d3d384d28e9a00cbee7476c88e 100644 (file)
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -138,8 +138,9 @@ static void sprintf_block_head (char * buf, struct buffer_head * bh)
  
  static void sprintf_buffer_head (char * buf, struct buffer_head * bh) 
  {
-  sprintf (buf, "dev %s, size %d, blocknr %ld, count %d, list %d, state 0x%lx, page %p, (%s, %s, %s)",
-          bdevname (bh->b_bdev), bh->b_size, bh->b_blocknr, atomic_read (&(bh->b_count)), bh->b_list,
+  sprintf (buf, "dev %s, size %d, blocknr %ld, count %d, state 0x%lx, page %p, (%s, %s, %s)",
+          bdevname (bh->b_bdev), bh->b_size, bh->b_blocknr,
+          atomic_read (&(bh->b_count)),
            bh->b_state, bh->b_page,
            buffer_uptodate (bh) ? "UPTODATE" : "!UPTODATE",
            buffer_dirty (bh) ? "DIRTY" : "CLEAN",
diff --git a/include/linux/fs.h b/include/linux/fs.h

index f0d997aeecb4c800af31abdf399ac2942ff7a242..4b38c11f97232336ee77938246a2ba708a3f2c73 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -112,6 +112,7 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
  #define MS_MOVE                8192
  #define MS_REC         16384
  #define MS_VERBOSE     32768
+#define MS_FLUSHING    (1<<16) /* inodes are currently under writeout */
  #define MS_ACTIVE      (1<<30)
  #define MS_NOUSER      (1<<31)
  
@@ -155,6 +156,7 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
  #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
  #define IS_SYNC(inode)         (__IS_FLG(inode, MS_SYNCHRONOUS) || ((inode)->i_flags & S_SYNC))
  #define IS_MANDLOCK(inode)     __IS_FLG(inode, MS_MANDLOCK)
+#define IS_FLUSHING(inode)     __IS_FLG(inode, MS_FLUSHING)
  
  #define IS_QUOTAINIT(inode)    ((inode)->i_flags & S_QUOTA)
  #define IS_NOQUOTA(inode)      ((inode)->i_flags & S_NOQUOTA)
@@ -215,11 +217,10 @@ enum bh_state_bits {
         BH_Dirty,       /* 1 if the buffer is dirty */
         BH_Lock,        /* 1 if the buffer is locked */
         BH_Req,         /* 0 if the buffer has been invalidated */
+
         BH_Mapped,      /* 1 if the buffer has a disk mapping */
         BH_New,         /* 1 if the buffer is new and not yet written out */
         BH_Async,       /* 1 if the buffer is under end_buffer_io_async I/O */
-       BH_Wait_IO,     /* 1 if we should write out this buffer */
-       BH_launder,     /* 1 if we should throttle on this buffer */
         BH_JBD,         /* 1 if it has an attached journal_head */
  
         BH_PrivateStart,/* not a state bit, but the first bit available
@@ -240,22 +241,16 @@ enum bh_state_bits {
   */
  struct buffer_head {
         /* First cache line: */
-       struct buffer_head *b_next;     /* Hash queue list */
         sector_t b_blocknr;             /* block number */
         unsigned short b_size;          /* block size */
-       unsigned short b_list;          /* List that this buffer appears */
         struct block_device *b_bdev;
  
         atomic_t b_count;               /* users using this block */
         unsigned long b_state;          /* buffer state bitmap (see above) */
-       unsigned long b_flushtime;      /* Time when (dirty) buffer should be written */
-
-       struct buffer_head *b_next_free;/* lru/free list linkage */
-       struct buffer_head *b_prev_free;/* doubly linked list of buffers */
         struct buffer_head *b_this_page;/* circular list of buffers in one page */
-       struct buffer_head **b_pprev;   /* doubly linked list of hash-queue */
-       char * b_data;                  /* pointer to data block */
         struct page *b_page;            /* the page this bh is mapped to */
+
+       char * b_data;                  /* pointer to data block */
         void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
         void *b_private;                /* reserved for b_end_io */
  
@@ -371,6 +366,16 @@ struct address_space_operations {
         int (*writepage)(struct page *);
         int (*readpage)(struct file *, struct page *);
         int (*sync_page)(struct page *);
+
+       /* Write back some dirty pages from this mapping. */
+       int (*writeback_mapping)(struct address_space *, int *nr_to_write);
+
+       /* Perform a writeback as a memory-freeing operation. */
+       int (*vm_writeback)(struct page *, int *nr_to_write);
+
+       /* Set a page dirty */
+       int (*set_page_dirty)(struct page *page);
+
         /*
          * ext3 requires that a successful prepare_write() call be followed
          * by a commit_write() call - they must be balanced
@@ -391,12 +396,14 @@ struct address_space {
         struct list_head        clean_pages;    /* list of clean pages */
         struct list_head        dirty_pages;    /* list of dirty pages */
         struct list_head        locked_pages;   /* list of locked pages */
+       struct list_head        io_pages;       /* being prepared for I/O */
         unsigned long           nrpages;        /* number of total pages */
         struct address_space_operations *a_ops; /* methods */
         struct inode            *host;          /* owner: inode, block_device */
         list_t                  i_mmap;         /* list of private mappings */
         list_t                  i_mmap_shared;  /* list of private mappings */
         spinlock_t              i_shared_lock;  /* and spinlock protecting it */
+       unsigned long           dirtied_when;   /* jiffies of first page dirtying */
         int                     gfp_mask;       /* how to allocate the pages */
         unsigned long           *ra_pages;      /* device readahead */
  };
@@ -427,9 +434,10 @@ struct inode {
         struct list_head        i_hash;
         struct list_head        i_list;
         struct list_head        i_dentry;
-       
-       struct list_head        i_dirty_buffers;
+
+       struct list_head        i_dirty_buffers;   /* uses i_bufferlist_lock */
         struct list_head        i_dirty_data_buffers;
+       spinlock_t              i_bufferlist_lock;
  
         unsigned long           i_ino;
         atomic_t                i_count;
@@ -697,8 +705,9 @@ struct super_block {
         struct list_head        s_list;         /* Keep this first */
         kdev_t                  s_dev;
         unsigned long           s_blocksize;
-       unsigned char           s_blocksize_bits;
         unsigned long           s_old_blocksize;
+       unsigned short          s_writeback_gen;/* To avoid writeback livelock */
+       unsigned char           s_blocksize_bits;
         unsigned char           s_dirt;
         unsigned long long      s_maxbytes;     /* Max file size */
         struct file_system_type *s_type;
@@ -903,7 +912,7 @@ struct super_operations {
         int (*show_options)(struct seq_file *, struct vfsmount *);
  };
  
-/* Inode state bits.. */
+/* Inode state bits.  Protected by inode_lock. */
  #define I_DIRTY_SYNC           1 /* Not dirty enough for O_DATASYNC */
  #define I_DIRTY_DATASYNC       2 /* Data-related inode changes pending */
  #define I_DIRTY_PAGES          4 /* Data-related inode changes pending */
@@ -924,11 +933,6 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
         __mark_inode_dirty(inode, I_DIRTY_SYNC);
  }
  
-static inline void mark_inode_dirty_pages(struct inode *inode)
-{
-       __mark_inode_dirty(inode, I_DIRTY_PAGES);
-}
-
  struct dquot_operations {
         void (*initialize) (struct inode *, short);
         void (*drop) (struct inode *);
@@ -1215,19 +1219,14 @@ extern struct file_operations rdwr_pipe_fops;
  
  extern int fs_may_remount_ro(struct super_block *);
  
-extern int try_to_free_buffers(struct page *, unsigned int);
-extern void refile_buffer(struct buffer_head * buf);
-extern void create_empty_buffers(struct page *, unsigned long);
+extern int try_to_free_buffers(struct page *);
+extern void create_empty_buffers(struct page *, unsigned long,
+                       unsigned long b_state);
  extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
  
  /* reiserfs_writepage needs this */
  extern void set_buffer_async_io(struct buffer_head *bh) ;
  
-#define BUF_CLEAN      0
-#define BUF_LOCKED     1       /* Buffers scheduled for write */
-#define BUF_DIRTY      2       /* Dirty buffers, not yet scheduled for write */
-#define NR_LIST                3
-
  static inline void get_bh(struct buffer_head * bh)
  {
          atomic_inc(&(bh)->b_count);
@@ -1252,29 +1251,27 @@ static inline void mark_buffer_uptodate(struct buffer_head * bh, int on)
  
  #define atomic_set_buffer_clean(bh) test_and_clear_bit(BH_Dirty, &(bh)->b_state)
  
-static inline void __mark_buffer_clean(struct buffer_head *bh)
-{
-       refile_buffer(bh);
-}
-
  static inline void mark_buffer_clean(struct buffer_head * bh)
  {
-       if (atomic_set_buffer_clean(bh))
-               __mark_buffer_clean(bh);
+       clear_bit(BH_Dirty, &(bh)->b_state);
  }
  
-extern void FASTCALL(__mark_dirty(struct buffer_head *bh));
-extern void FASTCALL(__mark_buffer_dirty(struct buffer_head *bh));
  extern void FASTCALL(mark_buffer_dirty(struct buffer_head *bh));
-extern void FASTCALL(buffer_insert_list(struct buffer_head *, struct list_head *));
+extern void buffer_insert_list(spinlock_t *lock,
+               struct buffer_head *, struct list_head *);
  
-static inline void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
+static inline void
+buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
  {
-       buffer_insert_list(bh, &inode->i_dirty_buffers);
+       buffer_insert_list(&inode->i_bufferlist_lock,
+                       bh, &inode->i_dirty_buffers);
  }
-static inline void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
+
+static inline void
+buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
  {
-       buffer_insert_list(bh, &inode->i_dirty_data_buffers);
+       buffer_insert_list(&inode->i_bufferlist_lock,
+                       bh, &inode->i_dirty_data_buffers);
  }
  
  #define atomic_set_buffer_dirty(bh) test_and_set_bit(BH_Dirty, &(bh)->b_state)
@@ -1322,8 +1319,6 @@ static inline void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode
         buffer_insert_inode_queue(bh, inode);
  }
  
-extern void set_buffer_flushtime(struct buffer_head *);
-extern void balance_dirty(void);
  extern int check_disk_change(kdev_t);
  extern int invalidate_inodes(struct super_block *);
  extern int invalidate_device(kdev_t, int);
@@ -1334,8 +1329,6 @@ extern void invalidate_inode_buffers(struct inode *);
  #define destroy_buffers(dev)   __invalidate_buffers((dev), 1)
  extern void invalidate_bdev(struct block_device *, int);
  extern void __invalidate_buffers(kdev_t dev, int);
-extern void sync_inodes(void);
-extern void sync_unlocked_inodes(void);
  extern void write_inode_now(struct inode *, int);
  extern int sync_buffers(struct block_device *, int);
  extern int fsync_dev(kdev_t);
@@ -1343,15 +1336,16 @@ extern int fsync_bdev(struct block_device *);
  extern int fsync_super(struct super_block *);
  extern int fsync_no_super(struct block_device *);
  extern void sync_inodes_sb(struct super_block *);
-extern int osync_buffers_list(struct list_head *);
-extern int fsync_buffers_list(struct list_head *);
+extern int fsync_buffers_list(spinlock_t *lock, struct list_head *);
  static inline int fsync_inode_buffers(struct inode *inode)
  {
-       return fsync_buffers_list(&inode->i_dirty_buffers);
+       return fsync_buffers_list(&inode->i_bufferlist_lock,
+                               &inode->i_dirty_buffers);
  }
  static inline int fsync_inode_data_buffers(struct inode *inode)
  {
-       return fsync_buffers_list(&inode->i_dirty_data_buffers);
+       return fsync_buffers_list(&inode->i_bufferlist_lock,
+                               &inode->i_dirty_data_buffers);
  }
  extern int inode_has_buffers(struct inode *);
  extern int filemap_fdatasync(struct address_space *);
@@ -1452,6 +1446,7 @@ static inline struct inode *iget(struct super_block *sb, unsigned long ino)
         return iget4(sb, ino, NULL, NULL);
  }
  
+extern void __iget(struct inode * inode);
  extern void clear_inode(struct inode *);
  extern struct inode *new_inode(struct super_block *);
  extern void remove_suid(struct dentry *);
@@ -1539,6 +1534,7 @@ static inline void map_bh(struct buffer_head *bh, struct super_block *sb, int bl
         bh->b_bdev = sb->s_bdev;
         bh->b_blocknr = block;
  }
+
  extern void wakeup_bdflush(void);
  extern void put_unused_buffer_head(struct buffer_head * bh);
  extern struct buffer_head * get_unused_buffer_head(int async);
@@ -1549,9 +1545,7 @@ typedef int (get_block_t)(struct inode*,sector_t,struct buffer_head*,int);
  
  /* Generic buffer handling for block filesystems.. */
  extern int try_to_release_page(struct page * page, int gfp_mask);
-extern int discard_bh_page(struct page *, unsigned long, int);
-#define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
-#define block_invalidate_page(page) discard_bh_page(page, 0, 0)
+extern int block_flushpage(struct page *page, unsigned long offset);
  extern int block_symlink(struct inode *, const char *, int);
  extern int block_write_full_page(struct page*, get_block_t*);
  extern int block_read_full_page(struct page*, get_block_t*);
@@ -1579,6 +1573,8 @@ extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
  extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
  extern int generic_file_open(struct inode * inode, struct file * filp);
  
+extern int generic_vm_writeback(struct page *page, int *nr_to_write);
+
  extern struct file_operations generic_ro_fops;
  
  extern int vfs_readlink(struct dentry *, char *, int, const char *);
@@ -1636,6 +1632,9 @@ static inline ino_t parent_ino(struct dentry *dentry)
         return res;
  }
  
+void __buffer_error(char *file, int line);
+#define buffer_error() __buffer_error(__FILE__, __LINE__)
+
  #endif /* __KERNEL__ */
  
  #endif /* _LINUX_FS_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 5f1c731ddde19219ebdd47f88a745ce59ddca25f..b548d2cd8504b551bd4f25621b29309036f66399 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -361,8 +361,6 @@ static inline void set_page_zone(struct page *page, unsigned long zone_num)
  
  #endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
  
-extern void FASTCALL(set_page_dirty(struct page *));
-
  /*
   * Error return values for the *_nopage functions
   */
@@ -405,6 +403,26 @@ extern int ptrace_check_attach(struct task_struct *task, int kill);
  int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
                 int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
  
+int __set_page_dirty_buffers(struct page *page);
+int __set_page_dirty_nobuffers(struct page *page);
+
+/*
+ * If the mapping doesn't provide a set_page_dirty a_op, then
+ * just fall through and assume that it wants buffer_heads.
+ * FIXME: make the method unconditional.
+ */
+static inline int set_page_dirty(struct page *page)
+{
+       if (page->mapping) {
+               int (*spd)(struct page *);
+
+               spd = page->mapping->a_ops->set_page_dirty;
+               if (spd)
+                       return (*spd)(page);
+       }
+       return __set_page_dirty_buffers(page);
+}
+
  /*
   * On a two-level page table, this ends up being trivial. Thus the
   * inlining and the symmetry break with pte_alloc_map() that does all
@@ -496,6 +514,9 @@ extern void truncate_inode_pages(struct address_space *, loff_t);
  extern int filemap_sync(struct vm_area_struct *, unsigned long,        size_t, unsigned int);
  extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int);
  
+/* mm/page-writeback.c */
+int generic_writeback_mapping(struct address_space *mapping, int *nr_to_write);
+
  /* readahead.c */
  #define VM_MAX_READAHEAD       128     /* kbytes */
  #define VM_MIN_READAHEAD       16      /* kbytes (includes current page) */
@@ -550,9 +571,6 @@ static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * m
  
  extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
  
-extern int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
-extern int pdflush_flush(unsigned long nr_pages);
-
  extern struct page * vmalloc_to_page(void *addr);
  extern unsigned long get_page_cache_size(void);
  
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 8a4826427f7f04f39d086b90a5517046c986e40d..09056c01bc8c5630b6c193f2fde16f40122ddddd 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -368,8 +368,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
  #define PF_MEMALLOC    0x00000800      /* Allocating memory */
  #define PF_MEMDIE      0x00001000      /* Killed for out-of-memory */
  #define PF_FREE_PAGES  0x00002000      /* per process page freeing */
-#define PF_NOIO                0x00004000      /* avoid generating further I/O */
-#define PF_FLUSHER     0x00008000      /* responsible for disk writeback */
+#define PF_FLUSHER     0x00004000      /* responsible for disk writeback */
  
  /*
   * Ptrace flags
diff --git a/include/linux/swap.h b/include/linux/swap.h

index 287faa9dc6208c2ab82dc743701642ca79e05040..86eb09dfca0d3cfa72aa646d5ede6872144390ee 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -97,6 +97,7 @@ extern int nr_swap_pages;
  
  extern unsigned int nr_free_pages(void);
  extern unsigned int nr_free_buffer_pages(void);
+extern unsigned int nr_free_pagecache_pages(void);
  extern int nr_active_pages;
  extern int nr_inactive_pages;
  extern atomic_t nr_async_pages;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h

index 30caa40c26be9eb7716e4c43479605a425243115..2f25df04d925467016b460441f28eb073914f844 100644 (file)
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -133,7 +133,7 @@ enum
         VM_SWAPCTL=1,           /* struct: Set vm swapping control */
         VM_SWAPOUT=2,           /* int: Linear or sqrt() swapout for hogs */
         VM_FREEPG=3,            /* struct: Set free page thresholds */
-       VM_BDFLUSH=4,           /* struct: Control buffer cache flushing */
+       VM_BDFLUSH_UNUSED=4,    /* Spare */
         VM_OVERCOMMIT_MEMORY=5, /* Turn off the virtual memory safety limit */
         VM_BUFFERMEM=6,         /* struct: Set buffer memory thresholds */
         VM_PAGECACHE=7,         /* struct: Set cache memory thresholds */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h

new file mode 100644 (file)

index 0000000..1978e06
--- /dev/null
+++ b/include/linux/writeback.h
@@ -0,0 +1,53 @@
+/*
+ * include/linux/writeback.h.
+ *
+ * These declarations are private to fs/ and mm/.
+ * Declarations which are exported to filesystems do not
+ * get placed here.
+ */
+#ifndef WRITEBACK_H
+#define WRITEBACK_H
+
+extern spinlock_t inode_lock;
+extern struct list_head inode_in_use;
+extern struct list_head inode_unused;
+
+/*
+ * fs/fs-writeback.c
+ */
+#define WB_SYNC_NONE   0       /* Don't wait on anything */
+#define WB_SYNC_LAST   1       /* Wait on the last-written mapping */
+#define WB_SYNC_ALL    2       /* Wait on every mapping */
+
+void try_to_writeback_unused_inodes(unsigned long pexclusive);
+void writeback_single_inode(struct inode *inode,
+                               int sync, int *nr_to_write);
+void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
+                               unsigned long *older_than_this);
+void writeback_inodes_sb(struct super_block *);
+void __wait_on_inode(struct inode * inode);
+void sync_inodes(void);
+
+static inline void wait_on_inode(struct inode *inode)
+{
+       if (inode->i_state & I_LOCK)
+               __wait_on_inode(inode);
+}
+
+/*
+ * mm/page-writeback.c
+ */
+/*
+ * How much data to write out at a time in various places.  This isn't
+ * really very important - it's just here to prevent any thread from
+ * locking an inode for too long and blocking other threads which wish
+ * to write the same file for allocation throttling purposes.
+ */
+#define WRITEOUT_PAGES ((4096 * 1024) / PAGE_CACHE_SIZE)
+
+void balance_dirty_pages(struct address_space *mapping);
+void balance_dirty_pages_ratelimited(struct address_space *mapping);
+int pdflush_flush(unsigned long nr_pages);
+int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
+
+#endif         /* WRITEBACK_H */
diff --git a/init/main.c b/init/main.c

index 80c3086b4bdc7fce7387d1574e8e9331749eaefe..33c69bdaa27e2eafc0d0c3da190d7f93729d9085 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -390,7 +390,6 @@ asmlinkage void __init start_kernel(void)
         fork_init(mempages);
         proc_caches_init();
         vfs_caches_init(mempages);
-       buffer_init(mempages);
         radix_tree_init();
  #if defined(CONFIG_ARCH_S390)
         ccwcache_init();
diff --git a/kernel/ksyms.c b/kernel/ksyms.c

index 71fb9ec853654835a9dd16f1ab7b92afd924ceb6..46cd25250912c94d18ba8d25ef69352581b05bcf 100644 (file)
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -169,7 +169,6 @@ EXPORT_SYMBOL(__d_path);
  EXPORT_SYMBOL(mark_buffer_dirty);
  EXPORT_SYMBOL(end_buffer_io_sync);
  EXPORT_SYMBOL(set_buffer_async_io);
-EXPORT_SYMBOL(__mark_buffer_dirty);
  EXPORT_SYMBOL(__mark_inode_dirty);
  EXPORT_SYMBOL(get_empty_filp);
  EXPORT_SYMBOL(init_private_file);
@@ -212,7 +211,6 @@ EXPORT_SYMBOL(unlock_buffer);
  EXPORT_SYMBOL(__wait_on_buffer);
  EXPORT_SYMBOL(___wait_on_page);
  EXPORT_SYMBOL(generic_direct_IO);
-EXPORT_SYMBOL(discard_bh_page);
  EXPORT_SYMBOL(block_write_full_page);
  EXPORT_SYMBOL(block_read_full_page);
  EXPORT_SYMBOL(block_prepare_write);
@@ -339,7 +337,6 @@ EXPORT_SYMBOL(register_disk);
  EXPORT_SYMBOL(read_dev_sector);
  EXPORT_SYMBOL(tq_disk);
  EXPORT_SYMBOL(init_buffer);
-EXPORT_SYMBOL(refile_buffer);
  EXPORT_SYMBOL(wipe_partitions);
  
  /* tty routines */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 66ccb010e1e5a17af683cff2ddef115b0d06c318..7869159de04ab0deca86415f90d51094d451a41c 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -43,7 +43,6 @@
  /* External variables not in a header file. */
  extern int panic_timeout;
  extern int C_A_D;
-extern int bdf_prm[], bdflush_min[], bdflush_max[];
  extern int sysctl_overcommit_memory;
  extern int max_threads;
  extern atomic_t nr_queued_signals;
@@ -259,9 +258,6 @@ static ctl_table kern_table[] = {
  };
  
  static ctl_table vm_table[] = {
-       {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL,
-        &proc_dointvec_minmax, &sysctl_intvec, NULL,
-        &bdflush_min, &bdflush_max},
         {VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
          sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
         {VM_PAGERDAEMON, "kswapd",
diff --git a/mm/Makefile b/mm/Makefile

index 464eb1810ea644db30844732a24e83cefb3de0fc..bcc0c36c23a105996ef19ae7004095873d517202 100644 (file)
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,12 +9,13 @@
  
  O_TARGET := mm.o
  
-export-objs := shmem.o filemap.o mempool.o page_alloc.o
+export-objs := shmem.o filemap.o mempool.o page_alloc.o \
+               page-writeback.o
  
  obj-y   := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
             vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
             page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
             shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
-           pdflush.o
+           pdflush.o page-writeback.o
  
  include $(TOPDIR)/Rules.make
diff --git a/mm/filemap.c b/mm/filemap.c

index eb85c6fab91fdc6243ae8fadcdb0c7ea7a00f63b..d95ba0691800f126d37292d8bebeac20ba16e7e7 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -11,29 +11,19 @@
   */
  #include <linux/module.h>
  #include <linux/slab.h>
-#include <linux/shm.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
  #include <linux/mman.h>
-#include <linux/locks.h>
  #include <linux/pagemap.h>
-#include <linux/swap.h>
-#include <linux/smp_lock.h>
-#include <linux/blkdev.h>
  #include <linux/file.h>
-#include <linux/swapctl.h>
-#include <linux/init.h>
-#include <linux/mm.h>
  #include <linux/iobuf.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
  #include <linux/hash.h>
-#include <linux/blkdev.h>
+#include <linux/writeback.h>
  
-#include <asm/pgalloc.h>
  #include <asm/uaccess.h>
  #include <asm/mman.h>
  
-#include <linux/highmem.h>
-
  /*
   * Shared mappings implemented 30.11.1994. It's not fully working yet,
   * though.
@@ -49,13 +39,17 @@
  
  /*
   * Lock ordering:
- *     pagemap_lru_lock ==> page_lock ==> i_shared_lock
+ *
+ *  pagemap_lru_lock
+ *  ->i_shared_lock            (vmtruncate)
+ *    ->i_bufferlist_lock      (__free_pte->__set_page_dirty_buffers)
+ *      ->unused_list_lock     (try_to_free_buffers)
+ *        ->mapping->page_lock
+ *      ->inode_lock           (__mark_inode_dirty)
+ *        ->sb_lock            (fs/fs-writeback.c)
   */
  spinlock_t pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
  
-#define CLUSTER_PAGES          (1 << page_cluster)
-#define CLUSTER_OFFSET(x)      (((x) >> page_cluster) << page_cluster)
-
  /*
   * Remove a page from the page cache and free it. Caller has to make
   * sure the page is locked and that nobody else uses it - or that usage
@@ -97,26 +91,6 @@ static inline int sync_page(struct page *page)
         return 0;
  }
  
-/*
- * Add a page to the dirty page list.
- */
-void set_page_dirty(struct page *page)
-{
-       if (!TestSetPageDirty(page)) {
-               struct address_space *mapping = page->mapping;
-
-               if (mapping) {
-                       write_lock(&mapping->page_lock);
-                       list_del(&page->list);
-                       list_add(&page->list, &mapping->dirty_pages);
-                       write_unlock(&mapping->page_lock);
-
-                       if (mapping->host)
-                               mark_inode_dirty_pages(mapping->host);
-               }
-       }
-}
-
  /**
   * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
   * @inode: the inode which pages we want to invalidate
@@ -194,20 +168,19 @@ static void truncate_complete_page(struct page *page)
         /* Leave it on the LRU if it gets converted into anonymous buffers */
         if (!PagePrivate(page) || do_flushpage(page, 0))
                 lru_cache_del(page);
-
-       /*
-        * We remove the page from the page cache _after_ we have
-        * destroyed all buffer-cache references to it. Otherwise some
-        * other process might think this inode page is not in the
-        * page cache and creates a buffer-cache alias to it causing
-        * all sorts of fun problems ...  
-        */
         ClearPageDirty(page);
         ClearPageUptodate(page);
         remove_inode_page(page);
         page_cache_release(page);
  }
  
+/*
+ * Writeback walks the page list in ->prev order, which is low-to-high file
+ * offsets in the common case where the file was written linearly. So truncate
+ * walks the page list in the opposite (->next) direction, to avoid getting
+ * into lockstep with writeback's cursor and to prune as many pages as possible
+ * before the truncate cursor collides with the writeback cursor.
+ */
  static int truncate_list_pages(struct address_space *mapping,
         struct list_head *head, unsigned long start, unsigned *partial)
  {
@@ -216,7 +189,7 @@ static int truncate_list_pages(struct address_space *mapping,
         int unlocked = 0;
  
   restart:
-       curr = head->prev;
+       curr = head->next;
         while (curr != head) {
                 unsigned long offset;
  
@@ -233,10 +206,10 @@ static int truncate_list_pages(struct address_space *mapping,
                         list_del(head);
                         if (!failed)
                                 /* Restart after this page */
-                               list_add_tail(head, curr);
+                               list_add(head, curr);
                         else
                                 /* Restart on this page */
-                               list_add(head, curr);
+                               list_add_tail(head, curr);
  
                         write_unlock(&mapping->page_lock);
                         unlocked = 1;
@@ -262,7 +235,7 @@ static int truncate_list_pages(struct address_space *mapping,
                         write_lock(&mapping->page_lock);
                         goto restart;
                 }
-               curr = curr->prev;
+               curr = curr->next;
         }
         return unlocked;
  }
@@ -284,10 +257,12 @@ void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
  
         write_lock(&mapping->page_lock);
         do {
-               unlocked = truncate_list_pages(mapping,
-                               &mapping->clean_pages, start, &partial);
+               unlocked |= truncate_list_pages(mapping,
+                               &mapping->io_pages, start, &partial);
                 unlocked |= truncate_list_pages(mapping,
                                 &mapping->dirty_pages, start, &partial);
+               unlocked = truncate_list_pages(mapping,
+                               &mapping->clean_pages, start, &partial);
                 unlocked |= truncate_list_pages(mapping,
                                 &mapping->locked_pages, start, &partial);
         } while (unlocked);
@@ -305,6 +280,7 @@ static inline int invalidate_this_page2(struct address_space * mapping,
         /*
          * The page is locked and we hold the mapping lock as well
          * so both page_count(page) and page_buffers stays constant here.
+        * AKPM: fixme: No global lock any more.  Is this still OK?
          */
         if (page_count(page) == 1 + !!page_has_buffers(page)) {
                 /* Restart after this page */
@@ -322,7 +298,7 @@ static inline int invalidate_this_page2(struct address_space * mapping,
  
                         page_cache_get(page);
                         write_unlock(&mapping->page_lock);
-                       block_invalidate_page(page);
+                       block_flushpage(page, 0);
                 } else
                         unlocked = 0;
  
@@ -393,6 +369,8 @@ void invalidate_inode_pages2(struct address_space * mapping)
                                 &mapping->clean_pages);
                 unlocked |= invalidate_list_pages2(mapping,
                                 &mapping->dirty_pages);
+               unlocked |= invalidate_list_pages2(mapping,
+                               &mapping->io_pages);
                 unlocked |= invalidate_list_pages2(mapping,
                                 &mapping->locked_pages);
         } while (unlocked);
@@ -449,6 +427,8 @@ int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsig
         /* writeout dirty buffers on pages from both clean and dirty lists */
         retval = do_buffer_fdatasync(mapping, &mapping->dirty_pages,
                         start_idx, end_idx, writeout_one_page);
+       retval |= do_buffer_fdatasync(mapping, &mapping->io_pages,
+                       start_idx, end_idx, writeout_one_page);
         retval |= do_buffer_fdatasync(mapping, &mapping->clean_pages,
                         start_idx, end_idx, writeout_one_page);
         retval |= do_buffer_fdatasync(mapping, &mapping->locked_pages,
@@ -457,6 +437,8 @@ int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsig
         /* now wait for locked buffers on pages from both clean and dirty lists */
         retval |= do_buffer_fdatasync(mapping, &mapping->dirty_pages,
                         start_idx, end_idx, waitfor_one_page);
+       retval |= do_buffer_fdatasync(mapping, &mapping->io_pages,
+                       start_idx, end_idx, waitfor_one_page);
         retval |= do_buffer_fdatasync(mapping, &mapping->clean_pages,
                         start_idx, end_idx, waitfor_one_page);
         retval |= do_buffer_fdatasync(mapping, &mapping->locked_pages,
@@ -495,47 +477,17 @@ int fail_writepage(struct page *page)
  EXPORT_SYMBOL(fail_writepage);
  
  /**
- *      filemap_fdatasync - walk the list of dirty pages of the given address space
- *             and writepage() all of them.
- * 
- *      @mapping: address space structure to write
+ *  filemap_fdatasync - walk the list of dirty pages of the given address space
+ *                      and writepage() all of them.
+ *
+ *  @mapping: address space structure to write
   *
   */
-int filemap_fdatasync(struct address_space * mapping)
+int filemap_fdatasync(struct address_space *mapping)
  {
-       int ret = 0;
-       int (*writepage)(struct page *) = mapping->a_ops->writepage;
-
-       write_lock(&mapping->page_lock);
-
-        while (!list_empty(&mapping->dirty_pages)) {
-               struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
-
-               list_del(&page->list);
-               list_add(&page->list, &mapping->locked_pages);
-
-               if (!PageDirty(page))
-                       continue;
-
-               page_cache_get(page);
-               write_unlock(&mapping->page_lock);
-
-               lock_page(page);
-
-               if (PageDirty(page)) {
-                       int err;
-                       ClearPageDirty(page);
-                       err = writepage(page);
-                       if (err && !ret)
-                               ret = err;
-               } else
-                       UnlockPage(page);
-
-               page_cache_release(page);
-               write_lock(&mapping->page_lock);
-       }
-       write_unlock(&mapping->page_lock);
-       return ret;
+       if (mapping->a_ops->writeback_mapping)
+               return mapping->a_ops->writeback_mapping(mapping, NULL);
+       return generic_writeback_mapping(mapping, NULL);
  }
  
  /**
@@ -2324,6 +2276,7 @@ unlock:
  
                 if (status < 0)
                         break;
+               balance_dirty_pages_ratelimited(mapping);
         } while (count);
  done:
         *ppos = pos;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

new file mode 100644 (file)

index 0000000..5219717
--- /dev/null
+++ b/mm/page-writeback.c
@@ -0,0 +1,476 @@
+/*
+ * mm/page-writeback.c.
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * Contains functions related to writing back dirty pages at the
+ * address_space level.
+ *
+ * 10Apr2002   akpm@zip.com.au
+ *             Initial version
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/init.h>
+#include <linux/sysrq.h>
+
+/*
+ * Memory thresholds, in percentages
+ * FIXME: expose these via /proc or whatever.
+ */
+
+/*
+ * Start background writeback (via pdflush) at this level
+ */
+static int dirty_background_ratio = 30;
+
+/*
+ * The generator of dirty data starts async writeback at this level
+ */
+static int dirty_async_ratio = 45;
+
+/*
+ * The generator of dirty data performs sync writeout at this level
+ */
+static int dirty_sync_ratio = 60;
+
+/*
+ * balance_dirty_pages() must be called by processes which are
+ * generating dirty data.  It looks at the number of dirty pages
+ * in the machine and either:
+ *
+ * - Starts background writeback or
+ * - Causes the caller to perform async writeback or
+ * - Causes the caller to perform synchronous writeback, then
+ *   tells a pdflush thread to perform more writeback or
+ * - Does nothing at all.
+ *
+ * balance_dirty_pages() can sleep.
+ */
+void balance_dirty_pages(struct address_space *mapping)
+{
+       const int tot = nr_free_pagecache_pages();
+       struct page_state ps;
+       int background_thresh;
+       int async_thresh;
+       int sync_thresh;
+       int wake_pdflush = 0;
+       unsigned long dirty_and_locked;
+
+       get_page_state(&ps);
+       dirty_and_locked = ps.nr_dirty + ps.nr_locked;
+
+       background_thresh = (dirty_background_ratio * tot) / 100;
+       async_thresh = (dirty_async_ratio * tot) / 100;
+       sync_thresh = (dirty_sync_ratio * tot) / 100;
+
+       if (dirty_and_locked > sync_thresh) {
+               int nr_to_write = dirty_and_locked - async_thresh;
+
+               writeback_unlocked_inodes(&nr_to_write, WB_SYNC_LAST, NULL);
+               wake_pdflush = 1;
+       } else if (dirty_and_locked > async_thresh) {
+               int nr_to_write = dirty_and_locked - async_thresh;
+
+               writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
+       } else if (dirty_and_locked > background_thresh) {
+               wake_pdflush = 1;
+       }
+
+       if (wake_pdflush && !IS_FLUSHING(mapping->host)) {
+               /*
+                * There is no flush thread against this device. Start one now.
+                */
+               get_page_state(&ps);
+               if (ps.nr_dirty > 0) {
+                       pdflush_flush(ps.nr_dirty);
+                       yield();
+               }
+       }
+}
+
+/*
+ * Front-end to balance_dirty_pages - just to make sure it's not called
+ * too often.
+ */
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+       static struct rate_limit_struct {
+               int count;
+       } ____cacheline_aligned ratelimits[NR_CPUS];
+       int cpu;
+
+       preempt_disable();
+       cpu = smp_processor_id();
+       if (ratelimits[cpu].count++ >= 32) {
+               ratelimits[cpu].count = 0;
+               preempt_enable();
+               balance_dirty_pages(mapping);
+               return;
+       }
+       preempt_enable();
+}
+
+/*
+ * Here are some applications of the pdflush thread pool
+ */
+
+/*
+ * Start heavy writeback of everything.  This is the analogue of the old
+ * wakeup_bdflush().  Returns zero if a thread was successfully launched.
+ *
+ * Is passed in the number of pages to write.
+ *
+ * We yield, to allow page allocators to perform their I/O against large files.
+ */
+
+static void pdflush_bdflush(unsigned long arg)
+{
+       int nr_pages = arg;
+
+       CHECK_EMERGENCY_SYNC
+
+       while (nr_pages) {
+               int nr_to_write = WRITEOUT_PAGES;
+
+               if (nr_to_write > nr_pages)
+                       nr_to_write = nr_pages;
+               nr_pages -= nr_to_write;
+               writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
+               yield();
+       }
+       run_task_queue(&tq_disk);
+}
+
+int pdflush_flush(unsigned long nr_pages)
+{
+       return pdflush_operation(pdflush_bdflush, nr_pages);
+}
+
+/*
+ * The interval between `kupdate'-style writebacks.
+ *
+ * Traditional kupdate writes back data which is 30-35 seconds old.
+ * This one does that, but it also writes back just 1/6th of the dirty
+ * data.  This is to avoid great I/O storms.
+ *
+ * We chunk the writes up and yield, to permit any throttled page-allocators
+ * to perform their I/O against a large file.
+ */
+static int wb_writeback_jifs = 5 * HZ;
+
+/*
+ * Periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space.  So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Spot the bug: at jiffies wraparound, the attempt to set the inode's dirtying
+ * time won't work, because zero means not-dirty.  That's OK. The data will get
+ * written out later by the VM (at least).
+ *
+ * We also limit the number of pages which are written out, to avoid writing
+ * huge amounts of data against a single file, which would cause memory
+ * allocators to block for too long.
+ */
+static void wb_kupdate(unsigned long arg)
+{
+       unsigned long oldest_jif = jiffies - 30*HZ;
+       struct page_state ps;
+       int total_to_write;
+       int nr_to_write;
+
+       sync_supers();
+
+       get_page_state(&ps);
+
+       total_to_write = ps.nr_dirty / 6;
+       if (total_to_write < 16384) {
+               total_to_write = 16384;
+               if (total_to_write > ps.nr_dirty)
+                       total_to_write = ps.nr_dirty;
+       }
+       while (total_to_write > 0) {
+               nr_to_write = total_to_write;
+               if (nr_to_write > WRITEOUT_PAGES)
+                       nr_to_write = WRITEOUT_PAGES;
+               total_to_write -= nr_to_write;
+               writeback_unlocked_inodes(&nr_to_write,
+                               WB_SYNC_NONE, &oldest_jif);
+               yield();
+       }
+       run_task_queue(&tq_disk);
+}
+
+/*
+ * The writeback timer, for kupdate-style functionality
+ */
+static struct timer_list wb_timer;
+
+static void wb_timer_fn(unsigned long unused)
+{
+       mod_timer(&wb_timer, jiffies + wb_writeback_jifs);
+       pdflush_operation(wb_kupdate, 0);
+}
+
+static int __init wb_timer_init(void)
+{
+       init_timer(&wb_timer);
+       wb_timer.expires = jiffies + wb_writeback_jifs;
+       wb_timer.data = 0;
+       wb_timer.function = wb_timer_fn;
+       add_timer(&wb_timer);
+       return 0;
+}
+module_init(wb_timer_init);
+
+/*
+ * FIXME: PG_launder gets cleared by accident.
+ */
+static int writeback_mapping(struct page *page, int *nr_to_write)
+{
+       struct inode *inode = page->mapping->host;
+
+       SetPageDirty(page);
+
+       /*
+        * We don't own this inode, so we don't want the address_space
+        * vanishing while writeback is walking the list
+        */
+       inode = igrab(inode);
+       unlock_page(page);
+
+       if (inode) {
+               writeback_single_inode(inode, 0, nr_to_write);
+
+               /*
+                * This iput() will internally call ext2_discard_prealloc(),
+                * which is rather bogus.  But there is no other way of
+                * dropping our ref to the inode.  However, there's no harm
+                * in dropping the prealloc, because there probably isn't any.
+                * Just a waste of cycles.
+                */
+               iput(inode);
+       }
+       return 0;
+}
+
+/*
+ * A library function, which implements the vm_writeback a_op.  It's fairly
+ * lame at this time.  The idea is: the VM wants to liberate this page,
+ * so we pass the page to the address_space and give the fs the opportunity
+ * to write out lots of pages around this one.  It allows extent-based
+ * filesystems to do intelligent things.  It lets delayed-allocate filesystems
+ * perform better file layout.  It lets the address_space opportunistically
+ * write back disk-contiguous pages which are in other zones.
+ */
+int generic_vm_writeback(struct page *page, int *nr_to_write)
+{
+       return writeback_mapping(page, nr_to_write);
+}
+EXPORT_SYMBOL(generic_vm_writeback);
+
+/**
+ * generic_writeback_mapping - walk the list of dirty pages of the given
+ * address space and writepage() all of them.
+ * 
+ * @mapping: address space structure to write
+ * @nr_to_write: subtract the number of written pages from *@nr_to_write
+ *
+ * This is a library function, which implements the writeback_mapping()
+ * address_space_operation for filesystems which are using multipage BIO
+ * writeback.
+ *
+ * We need to be careful to avoid deadlocks here.  mpage_bio_writepage() does
+ * not immediately start I/O against each page.  It waits until the bio is
+ * full, or until mpage_bio_flush() is called.  So generic_writeback_mapping()
+ * is locking multiple pages without necessarily starting I/O against them.
+ *
+ * AB/BA deadlocks are avoided via locking implemented in the filesystem.
+ * Only one process ever has multiple locked pages against any mapping.
+ *
+ * FIXME: doing the locking in the fs is a bit grotty, but it allows us to
+ * not have to put a new semaphore in struct inode.  The fs could
+ * pass its bio_write_state up here, I guess.
+ *
+ * Pages can be moved from clean_pages or locked_pages onto dirty_pages
+ * at any time - it's not possible to lock against that.  So pages which
+ * have already been added to a BIO may magically reappear on the dirty_pages
+ * list.  And generic_writeback_mapping() will again try to lock those pages.
+ * But I/O has not yet been started against the page.  Thus deadlock.
+ *
+ * To avoid this, the entire contents of the dirty_pages list are moved
+ * onto io_pages up-front.  We then walk io_pages, locking the
+ * pages and submitting them for I/O, moving them to locked_pages.
+ *
+ * This has the added benefit of preventing a livelock which would otherwise
+ * occur if pages are being dirtied faster than we can write them out.
+ *
+ * Thus generic_writeback_mapping() only makes the guarantee that all pages
+ * which were dirty at the time it was called will have I/O started against
+ * them.  And it's not possible to make a stronger guarantee than that.
+ */
+int generic_writeback_mapping(struct address_space *mapping, int *nr_to_write)
+{
+       int ret = 0;
+       int done = 0;
+       int err;
+       int (*writepage)(struct page *) = mapping->a_ops->writepage;
+
+       write_lock(&mapping->page_lock);
+
+       list_splice(&mapping->dirty_pages, &mapping->io_pages);
+       INIT_LIST_HEAD(&mapping->dirty_pages);
+       mapping->dirtied_when = 0;
+
+        while (!list_empty(&mapping->io_pages) && !done) {
+               struct page *page = list_entry(mapping->io_pages.prev,
+                                       struct page, list);
+               list_del(&page->list);
+               list_add(&page->list, &mapping->locked_pages);
+               if (!PageDirty(page))
+                       continue;
+
+               page_cache_get(page);
+               write_unlock(&mapping->page_lock);
+
+               lock_page(page);
+
+               if (TestClearPageDirty(page)) {
+                       err = writepage(page);
+                       if (!ret)
+                               ret = err;
+                       if (nr_to_write) {
+                               --(*nr_to_write);
+                               if (*nr_to_write <= 0)
+                                       done = 1;
+                       }
+               } else
+                       UnlockPage(page);
+
+               page_cache_release(page);
+               write_lock(&mapping->page_lock);
+       }
+       if (!list_empty(&mapping->io_pages)) {
+               /*
+                * Put the rest back, in the correct order.
+                */
+               list_splice(&mapping->io_pages, mapping->dirty_pages.prev);
+               INIT_LIST_HEAD(&mapping->io_pages);
+       }
+       write_unlock(&mapping->page_lock);
+       return ret;
+}
+EXPORT_SYMBOL(generic_writeback_mapping);
+
+/*
+ * Add a page to the dirty page list.
+ *
+ * It is a sad fact of life that this function is called from several places
+ * deeply under spinlocking.  It may not sleep.
+ *
+ * If the page has buffers, the uptodate buffers are set dirty, to preserve
+ * dirty-state coherency between the page and the buffers.  If the page does
+ * not have buffers then when they are later attached they will all be set
+ * dirty.
+ *
+ * The buffers are dirtied before the page is dirtied.  There's a small race
+ * window in which a writepage caller may see the page cleanness but not the
+ * buffer dirtiness.  That's fine.  If this code were to set the page dirty
+ * before the buffers, a concurrent writepage caller could clear the page dirty
+ * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
+ * page on the dirty page list.
+ *
+ * There is also a small window where the page is dirty, and not on dirty_pages.
+ * Also a possibility that by the time the page is added to dirty_pages, it has
+ * been set clean.  The page lists are somewhat approximate in this regard.
+ * It's better to have clean pages accidentally attached to dirty_pages than to
+ * leave dirty pages attached to clean_pages.
+ *
+ * We use i_bufferlist_lock to lock against try_to_free_buffers while using the
+ * page's buffer list.  Also use this to protect against clean buffers being
+ * added to the page after it was set dirty.
+ *
+ * FIXME: may need to call ->reservepage here as well.  That's rather up to the
+ * address_space though.
+ */
+int __set_page_dirty_buffers(struct page *page)
+{
+       int ret = 0;
+       struct address_space *mapping = page->mapping;
+       struct inode *inode;
+
+       if (mapping == NULL) {
+               SetPageDirty(page);
+               goto out;
+       }
+
+       inode = mapping->host;
+
+       spin_lock(&inode->i_bufferlist_lock);
+
+       if (page_has_buffers(page)) {
+               struct buffer_head *head = page_buffers(page);
+               struct buffer_head *bh = head;
+
+               do {
+                       if (buffer_uptodate(bh))
+                               set_bit(BH_Dirty, &bh->b_state);
+                       bh = bh->b_this_page;
+               } while (bh != head);
+       }
+
+       if (!TestSetPageDirty(page)) {
+               write_lock(&mapping->page_lock);
+               list_del(&page->list);
+               list_add(&page->list, &mapping->dirty_pages);
+               write_unlock(&mapping->page_lock);
+               __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+       }
+       
+       spin_unlock(&inode->i_bufferlist_lock);
+out:
+       return ret;
+}
+EXPORT_SYMBOL(__set_page_dirty_buffers);
+
+/*
+ * For address_spaces which do not use buffers.  Just set the page's dirty bit
+ * and move it to the dirty_pages list.  Space reservation, if required, is
+ * meant to happen here too, but this version does not perform any.
+ *
+ * __set_page_dirty_nobuffers() is intended to be able to return -ENOSPC, the
+ * page still being safe as long as blocks are actually found at writeback
+ * time.  As written, `ret' is never modified, so it always returns zero.
+ *
+ * This is also used when a single buffer is being dirtied: we want to set the
+ * page dirty in that case, but not all the buffers.  This is a "bottom-up"
+ * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ */
+int __set_page_dirty_nobuffers(struct page *page)
+{
+       int ret = 0;
+
+       if (!TestSetPageDirty(page)) {
+               struct address_space *mapping = page->mapping;
+
+               if (mapping) {
+                       write_lock(&mapping->page_lock);
+                       list_del(&page->list);
+                       list_add(&page->list, &mapping->dirty_pages);
+                       write_unlock(&mapping->page_lock);
+                       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+               }
+       }
+       return ret;
+}
+EXPORT_SYMBOL(__set_page_dirty_nobuffers);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 54ac6bff6934625ba6e425ecd7e0b4943d73f6ee..041aa8b944a12cbfcc173c6b83090008ac05c06f 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -525,6 +525,33 @@ unsigned int nr_free_buffer_pages (void)
         return sum;
  }
  
+/*
+ * Amount of free RAM allocatable as pagecache memory:
+ */
+unsigned int nr_free_pagecache_pages(void)
+{
+       pg_data_t *pgdat = pgdat_list;
+       unsigned int sum = 0;
+
+       do {
+               zonelist_t *zonelist = pgdat->node_zonelists +
+                               (GFP_HIGHUSER & GFP_ZONEMASK);
+               zone_t **zonep = zonelist->zones;
+               zone_t *zone;
+
+               for (zone = *zonep++; zone; zone = *zonep++) {
+                       unsigned long size = zone->size;
+                       unsigned long high = zone->pages_high;
+                       if (size > high)
+                               sum += size - high;
+               }
+
+               pgdat = pgdat->node_next;
+       } while (pgdat);
+
+       return sum;
+}
+
  #if CONFIG_HIGHMEM
  unsigned int nr_free_highpages (void)
  {
diff --git a/mm/pdflush.c b/mm/pdflush.c

index 8017d920d0a25718a69f954c40e9f047defcfb48..07ceb439e9aef43fe7ab482ace5452424c79ff42 100644 (file)
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -56,7 +56,7 @@ static unsigned long last_empty_jifs;
   *
   * Thread pool management algorithm:
   * 
- * - The minumum and maximum number of pdflush instances are bound
+ * - The minimum and maximum number of pdflush instances are bound
   *   by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
   * 
   * - If there have been no idle pdflush instances for 1 second, create
@@ -155,8 +155,8 @@ static int __pdflush(struct pdflush_work *my_work)
  /*
   * Of course, my_work wants to be just a local in __pdflush().  It is
   * separated out in this manner to hopefully prevent the compiler from
- * performing unfortunate optimisations agains the auto variables.  Because
- * there are visible to other tasks and CPUs.  (No problem has actually
+ * performing unfortunate optimisations against the auto variables.  Because
+ * these are visible to other tasks and CPUs.  (No problem has actually
   * been observed.  This is just paranoia).
   */
  static int pdflush(void *dummy)
diff --git a/mm/swap_state.c b/mm/swap_state.c

index 0206301c4740ee8650832eacb5aa75ea9a1c4721..6e918948a4de43b47938003f4a7ed0f133884f1e 100644 (file)
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -36,12 +36,23 @@ static struct address_space_operations swap_aops = {
         sync_page: block_sync_page,
  };
  
+/*
+ * swapper_inode is needed only for i_bufferlist_lock. This
+ * avoids special-casing in other parts of the kernel.
+ */
+static struct inode swapper_inode = {
+       i_bufferlist_lock:      SPIN_LOCK_UNLOCKED,
+       i_mapping:              &swapper_space,
+};
+
  struct address_space swapper_space = {
         page_tree:      RADIX_TREE_INIT(GFP_ATOMIC),
         page_lock:      RW_LOCK_UNLOCKED,
         clean_pages:    LIST_HEAD_INIT(swapper_space.clean_pages),
         dirty_pages:    LIST_HEAD_INIT(swapper_space.dirty_pages),
+       io_pages:       LIST_HEAD_INIT(swapper_space.io_pages),
         locked_pages:   LIST_HEAD_INIT(swapper_space.locked_pages),
+       host:           &swapper_inode,
         a_ops:          &swap_aops,
         i_shared_lock:  SPIN_LOCK_UNLOCKED,
  };
diff --git a/mm/vmscan.c b/mm/vmscan.c

index ea0e7b2bac3a803e0d392fa56506009517056396..91effe15b29bba1c485094f2b52e1e02a921e2c6 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -21,6 +21,7 @@
  #include <linux/init.h>
  #include <linux/highmem.h>
  #include <linux/file.h>
+#include <linux/writeback.h>
  #include <linux/compiler.h>
  
  #include <asm/pgalloc.h>
@@ -428,7 +429,8 @@ static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask,
  
                 mapping = page->mapping;
  
-               if (PageDirty(page) && is_page_cache_freeable(page) && mapping) {
+               if (PageDirty(page) && is_page_cache_freeable(page) &&
+                               page->mapping && (gfp_mask & __GFP_FS)) {
                         /*
                          * It is not critical here to write it only if
                          * the page is unmapped beause any direct writer
@@ -437,16 +439,30 @@ static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask,
                          * pinned it and after the I/O to the page is finished,
                          * so the direct writes to the page cannot get lost.
                          */
+                       struct address_space_operations *a_ops;
+                       int (*writeback)(struct page *, int *);
                         int (*writepage)(struct page *);
  
-                       writepage = mapping->a_ops->writepage;
-                       if ((gfp_mask & __GFP_FS) && writepage) {
+                       /*
+                        * There's no guarantee that writeback() will actually
+                        * start I/O against *this* page.  Which is broken if we're
+                        * trying to free memory in a particular zone.  FIXME.
+                        */
+                       a_ops = mapping->a_ops;
+                       writeback = a_ops->vm_writeback;
+                       writepage = a_ops->writepage;
+                       if (writeback || writepage) {
                                 ClearPageDirty(page);
                                 SetPageLaunder(page);
                                 page_cache_get(page);
                                 spin_unlock(&pagemap_lru_lock);
  
-                               writepage(page);
+                               if (writeback) {
+                                       int nr_to_write = WRITEOUT_PAGES;
+                                       writeback(page, &nr_to_write);
+                               } else {
+                                       writepage(page);
+                               }
                                 page_cache_release(page);
  
                                 spin_lock(&pagemap_lru_lock);
author	Andrew Morton <akpm@zip.com.au>
	Tue, 30 Apr 2002 06:52:10 +0000 (23:52 -0700)
committer	Linus Torvalds <torvalds@home.transmeta.com>
	Tue, 30 Apr 2002 06:52:10 +0000 (23:52 -0700)
drivers/block/ll_rw_blk.c		patch \| blob \| history
drivers/block/loop.c		patch \| blob \| history
drivers/md/raid5.c		patch \| blob \| history
fs/Makefile		patch \| blob \| history
fs/block_dev.c		patch \| blob \| history
fs/buffer.c		patch \| blob \| history
fs/ext3/inode.c		patch \| blob \| history
fs/fs-writeback.c	[new file with mode: 0644]	patch \| blob
fs/inode.c		patch \| blob \| history
fs/jbd/checkpoint.c		patch \| blob \| history
fs/jbd/commit.c		patch \| blob \| history
fs/jbd/journal.c		patch \| blob \| history
fs/jbd/revoke.c		patch \| blob \| history
fs/jbd/transaction.c		patch \| blob \| history
fs/ntfs/aops.c		patch \| blob \| history
fs/reiserfs/do_balan.c		patch \| blob \| history
fs/reiserfs/inode.c		patch \| blob \| history
fs/reiserfs/journal.c		patch \| blob \| history
fs/reiserfs/prints.c		patch \| blob \| history
include/linux/fs.h		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
include/linux/swap.h		patch \| blob \| history
include/linux/sysctl.h		patch \| blob \| history
include/linux/writeback.h	[new file with mode: 0644]	patch \| blob
init/main.c		patch \| blob \| history
kernel/ksyms.c		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history
mm/Makefile		patch \| blob \| history
mm/filemap.c		patch \| blob \| history
mm/page-writeback.c	[new file with mode: 0644]	patch \| blob
mm/page_alloc.c		patch \| blob \| history
mm/pdflush.c		patch \| blob \| history
mm/swap_state.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history