BUG_ON(!buffer_mapped(bh));
BUG_ON(!bh->b_end_io);
+ if ((rw == READ || rw == READA) && buffer_uptodate(bh))
+ printk("%s: read of uptodate buffer\n", __FUNCTION__);
+ if (rw == WRITE && !buffer_uptodate(bh))
+ printk("%s: write of non-uptodate buffer\n", __FUNCTION__);
+
set_bit(BH_Req, &bh->b_state);
/*
* a multiple of the current approved size for the device.
*
**/
+
void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
{
unsigned int major;
if (!atomic_set_buffer_clean(bh))
/* Hmmph! Nothing to write */
goto end_io;
- __mark_buffer_clean(bh);
break;
case READA:
atomic_inc(&lo->lo_pending);
spin_unlock_irq(&lo->lo_lock);
- current->flags |= PF_NOIO;
-
/*
* up sem, we are running
*/
bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
bh->b_size = sh->size;
- bh->b_list = BUF_LOCKED;
return bh;
}
bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \
namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
- filesystems.o namespace.o seq_file.o xattr.o libfs.o
+ filesystems.o namespace.o seq_file.o xattr.o libfs.o \
+ fs-writeback.o
ifneq ($(CONFIG_NFSD),n)
ifneq ($(CONFIG_NFSD),)
return retval;
}
-
+/*
+ * AKPM: fixme. unneeded stuff here.
+ */
static int __block_fsync(struct inode * inode)
{
int ret, err;
sync_page: block_sync_page,
prepare_write: blkdev_prepare_write,
commit_write: blkdev_commit_write,
+ writeback_mapping: generic_writeback_mapping,
+ vm_writeback: generic_vm_writeback,
direct_IO: blkdev_direct_IO,
};
/*
* linux/fs/buffer.c
*
- * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 1991, 1992, 2002 Linus Torvalds
*/
/*
- * 'buffer.c' implements the buffer-cache functions. Race-conditions have
- * been avoided by NEVER letting an interrupt change a buffer (except for the
- * data, of course), but instead letting the caller do it.
- */
-
-/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
-
-/* Removed a lot of unnecessary code and simplified things now that
+ * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
+ *
+ * Removed a lot of unnecessary code and simplified things now that
* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
+ *
+ * Speed up hash, lru, and free list operations. Use gfp() for allocating
+ * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
+ *
+ * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
+ *
+ * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
*/
-/* Speed up hash, lru, and free list operations. Use gfp() for allocating
- * hash table, use SLAB cache for buffer heads. -DaveM
- */
-
-/* Added 32k buffer block sizes - these are required older ARM systems.
- * - RMK
- */
-
-/* Thread it... -DaveM */
-
-/* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
-
#include <linux/config.h>
-#include <linux/time.h>
#include <linux/fs.h>
+#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/locks.h>
-#include <linux/errno.h>
-#include <linux/swap.h>
-#include <linux/swapctl.h>
#include <linux/smp_lock.h>
-#include <linux/vmalloc.h>
#include <linux/blkdev.h>
-#include <linux/sysrq.h>
#include <linux/file.h>
-#include <linux/init.h>
#include <linux/quotaops.h>
#include <linux/iobuf.h>
-#include <linux/highmem.h>
#include <linux/module.h>
-#include <linux/compiler.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
+#include <linux/writeback.h>
#include <asm/bitops.h>
-#include <asm/mmu_context.h>
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
#define NR_RESERVED (10*MAX_BUF_PER_PAGE)
number of unused buffer heads */
/* Anti-deadlock ordering:
- * lru_list_lock > hash_table_lock > unused_list_lock
+ * i_bufferlist_lock > unused_list_lock
*/
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
/*
- * Hash table gook..
+ * A local cache of buffer_heads is maintained at unused_list.
+ * Free buffers are chained through their b_private field.
*/
-static unsigned int bh_hash_mask;
-static unsigned int bh_hash_shift;
-static struct buffer_head **hash_table;
-static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
-
-static struct buffer_head *lru_list[NR_LIST];
-static spinlock_t lru_list_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-static int nr_buffers_type[NR_LIST];
-static unsigned long size_buffers_type[NR_LIST];
-
-static struct buffer_head * unused_list;
+static struct buffer_head *unused_list;
static int nr_unused_buffer_heads;
static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
-static int grow_buffers(struct block_device *bdev, unsigned long block, int size);
-static void __refile_buffer(struct buffer_head *);
-
/* This is used by some architectures to estimate available memory. */
atomic_t buffermem_pages = ATOMIC_INIT(0);
-/* Here is the parameter block for the bdflush process. If you add or
- * remove any of the parameters, make sure to update kernel/sysctl.c
- * and the documentation at linux/Documentation/sysctl/vm.txt.
+/*
+ * Several of these buffer list functions are exported to filesystems,
+ * so we do funny things with the spinlocking to support those
+ * filesystems while still using inode->i_bufferlist_lock for
+ * most applications.
+ * FIXME: put a spinlock in the reiserfs journal and kill this lock.
*/
+static spinlock_t global_bufferlist_lock = SPIN_LOCK_UNLOCKED;
-#define N_PARAM 9
-
-/* The dummy values in this structure are left in there for compatibility
- * with old programs that play with the /proc entries.
+/*
+ * Debug/devel support stuff
*/
-union bdflush_param {
- struct {
- int nfract; /* Percentage of buffer cache dirty to
- activate bdflush */
- int dummy1; /* old "ndirty" */
- int dummy2; /* old "nrefill" */
- int dummy3; /* unused */
- int interval; /* jiffies delay between kupdate flushes */
- int age_buffer; /* Time for normal buffer to age before we flush it */
- int nfract_sync;/* Percentage of buffer cache dirty to
- activate bdflush synchronously */
- int dummy4; /* unused */
- int dummy5; /* unused */
- } b_un;
- unsigned int data[N_PARAM];
-} bdf_prm = {{40, 0, 0, 0, 5*HZ, 30*HZ, 60, 0, 0}};
-
-/* These are the min and max parameter values that we will allow to be assigned */
-int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0};
-int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0};
+
+void __buffer_error(char *file, int line)
+{
+ static int enough;
+
+ if (enough > 10)
+ return;
+ enough++;
+ printk("buffer layer error at %s:%d\n", file, line);
+#ifdef CONFIG_X86
+ printk("Pass this trace through ksymoops for reporting\n");
+ {
+ extern void show_stack(long *esp);
+ show_stack(0);
+ }
+#endif
+}
+EXPORT_SYMBOL(__buffer_error);
+
+inline void
+init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+{
+ bh->b_end_io = handler;
+ bh->b_private = private;
+}
void unlock_buffer(struct buffer_head *bh)
{
- clear_bit(BH_Wait_IO, &bh->b_state);
- clear_bit(BH_launder, &bh->b_state);
+ /*
+ * unlock_buffer against a zero-count bh is a bug, if the page
+ * is not locked. Because then nothing protects the buffer's
+ * waitqueue, which is used here. (Well. Other locked buffers
+ * against the page will pin it. But complain anyway).
+ */
+ if (atomic_read(&bh->b_count) == 0 && !PageLocked(bh->b_page))
+ buffer_error();
+
clear_bit(BH_Lock, &bh->b_state);
smp_mb__after_clear_bit();
if (waitqueue_active(&bh->b_wait))
wake_up(&bh->b_wait);
}
+static inline void
+__set_page_buffers(struct page *page, struct buffer_head *head)
+{
+ struct inode *inode = page->mapping->host;
+
+ if (inode && S_ISBLK(inode->i_mode))
+ atomic_inc(&buffermem_pages);
+ if (page_has_buffers(page))
+ buffer_error();
+ set_page_buffers(page, head);
+ page_cache_get(page);
+}
+
+static inline void
+__clear_page_buffers(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+
+ if (mapping) {
+ struct inode *inode = mapping->host;
+
+ if (S_ISBLK(inode->i_mode))
+ atomic_dec(&buffermem_pages);
+ }
+ clear_page_buffers(page);
+ page_cache_release(page);
+}
+
/*
- * Rewrote the wait-routines to use the "new" wait-queue functionality,
- * and getting rid of the cli-sti pairs. The wait-queue routines still
- * need cli-sti, but now it's just a couple of 386 instructions or so.
- *
- * Note that the real wait_on_buffer() is an inline function that checks
- * if 'b_wait' is set before calling this, so that the queues aren't set
- * up unnecessarily.
+ * Block until a buffer comes unlocked. This doesn't stop it
+ * from becoming locked again - you have to lock it yourself
+ * if you want to preserve its state.
*/
void __wait_on_buffer(struct buffer_head * bh)
{
*/
void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
+ if (!uptodate)
+ printk("%s: I/O error\n", __FUNCTION__);
mark_buffer_uptodate(bh, uptodate);
unlock_buffer(bh);
put_bh(bh);
}
/*
- * The buffers have been marked clean and locked. Just submit the dang
- * things..
+ * Write out all the dirty data associated with a block device
+ * via its mapping. Does not take the superblock lock.
+ *
+ * If `wait' is true, wait on the writeout.
*/
-static void write_locked_buffers(struct buffer_head **array, unsigned int count)
+int sync_buffers(struct block_device *bdev, int wait)
{
- do {
- struct buffer_head * bh = *array++;
- bh->b_end_io = end_buffer_io_sync;
- submit_bh(WRITE, bh);
- } while (--count);
-}
+ int ret;
-/*
- * Write some buffers from the head of the dirty queue.
- *
- * This must be called with the LRU lock held, and will
- * return without it!
- */
-#define NRSYNC (32)
-static int write_some_buffers(struct block_device *bdev)
-{
- struct buffer_head *next;
- struct buffer_head *array[NRSYNC];
- unsigned int count;
- int nr;
-
- next = lru_list[BUF_DIRTY];
- nr = nr_buffers_type[BUF_DIRTY];
- count = 0;
- while (next && --nr >= 0) {
- struct buffer_head * bh = next;
- next = bh->b_next_free;
-
- if (bdev && bh->b_bdev != bdev)
- continue;
- if (test_and_set_bit(BH_Lock, &bh->b_state))
- continue;
- if (atomic_set_buffer_clean(bh)) {
- __refile_buffer(bh);
- get_bh(bh);
- array[count++] = bh;
- if (count < NRSYNC)
- continue;
+ ret = filemap_fdatasync(bdev->bd_inode->i_mapping);
+ if (wait) {
+ int err;
- spin_unlock(&lru_list_lock);
- write_locked_buffers(array, count);
- return -EAGAIN;
- }
- unlock_buffer(bh);
- __refile_buffer(bh);
+ err = filemap_fdatawait(bdev->bd_inode->i_mapping);
+ if (!ret)
+ ret = err;
}
- spin_unlock(&lru_list_lock);
-
- if (count)
- write_locked_buffers(array, count);
- return 0;
-}
-
-/*
- * Write out all buffers on the dirty list.
- */
-static void write_unlocked_buffers(struct block_device *bdev)
-{
- do {
- spin_lock(&lru_list_lock);
- } while (write_some_buffers(bdev));
- run_task_queue(&tq_disk);
+ return ret;
}
/*
- * Wait for a buffer on the proper list.
+ * Write out all the dirty data associated with a block device
+ * via its mapping. Does not take the superblock lock.
*
- * This must be called with the LRU lock held, and
- * will return with it released.
+ * Wait on the writeout.
*/
-static int wait_for_buffers(struct block_device *bdev, int index, int refile)
-{
- struct buffer_head * next;
- int nr;
-
- next = lru_list[index];
- nr = nr_buffers_type[index];
- while (next && --nr >= 0) {
- struct buffer_head *bh = next;
- next = bh->b_next_free;
-
- if (!buffer_locked(bh)) {
- if (refile)
- __refile_buffer(bh);
- continue;
- }
- if (bdev && bh->b_bdev != bdev)
- continue;
-
- get_bh(bh);
- spin_unlock(&lru_list_lock);
- wait_on_buffer (bh);
- put_bh(bh);
- return -EAGAIN;
- }
- spin_unlock(&lru_list_lock);
- return 0;
-}
-
-static inline void wait_for_some_buffers(struct block_device *bdev)
+int fsync_no_super(struct block_device *bdev)
{
- spin_lock(&lru_list_lock);
- wait_for_buffers(bdev, BUF_LOCKED, 1);
-}
+ int ret = 0;
-static int wait_for_locked_buffers(struct block_device *bdev, int index, int refile)
-{
- do {
- spin_lock(&lru_list_lock);
- } while (wait_for_buffers(bdev, index, refile));
- return 0;
+ if (bdev)
+ ret = sync_buffers(bdev, 1);
+ return ret;
}
-/* Call sync_buffers with wait!=0 to ensure that the call does not
- * return until all buffer writes have completed. Sync() may return
- * before the writes have finished; fsync() may not.
- */
-
-/* Godamity-damn. Some buffers (bitmaps for filesystems)
- * spontaneously dirty themselves without ever brelse being called.
- * We will ultimately want to put these in a separate list, but for
- * now we search all of the lists for dirty buffers.
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock. Filesystem data as well as the underlying block
+ * device. Takes the superblock lock.
*/
-int sync_buffers(struct block_device *bdev, int wait)
-{
- int err = 0;
-
- if (!bdev)
- return 0;
-
- /* One pass for no-wait, three for wait:
- * 0) write out all dirty, unlocked buffers;
- * 1) wait for all dirty locked buffers;
- * 2) write out all dirty, unlocked buffers;
- * 2) wait for completion by waiting for all buffers to unlock.
- */
- write_unlocked_buffers(bdev);
- if (wait) {
- err = wait_for_locked_buffers(bdev, BUF_DIRTY, 0);
- write_unlocked_buffers(bdev);
- err |= wait_for_locked_buffers(bdev, BUF_LOCKED, 1);
- }
- return err;
-}
-
-int sync_all_buffers(int wait)
-{
- int err = 0;
-
- /* One pass for no-wait, three for wait:
- * 0) write out all dirty, unlocked buffers;
- * 1) wait for all dirty locked buffers;
- * 2) write out all dirty, unlocked buffers;
- * 2) wait for completion by waiting for all buffers to unlock.
- */
- write_unlocked_buffers(NULL);
- if (wait) {
- err = wait_for_locked_buffers(NULL, BUF_DIRTY, 0);
- write_unlocked_buffers(NULL);
- err |= wait_for_locked_buffers(NULL, BUF_LOCKED, 1);
- }
- return err;
-}
-
int fsync_super(struct super_block *sb)
{
- sync_buffers(sb->s_bdev, 0);
-
- lock_kernel();
- sync_inodes_sb(sb);
+ sync_inodes_sb(sb); /* All the inodes */
DQUOT_SYNC(sb);
lock_super(sb);
if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
sb->s_op->write_super(sb);
unlock_super(sb);
- unlock_kernel();
- return sync_buffers(sb->s_bdev, 1);
+ return fsync_no_super(sb->s_bdev);
}
-int fsync_no_super(struct block_device *bdev)
+/*
+ * Write out and wait upon all dirty data associated with this
+ * device. Filesystem data as well as the underlying block
+ * device. Takes the superblock lock.
+ */
+int fsync_bdev(struct block_device *bdev)
{
- sync_buffers(bdev, 0);
- return sync_buffers(bdev, 1);
+ struct super_block *sb = get_super(to_kdev_t(bdev->bd_dev));
+ if (sb) {
+ int res = fsync_super(sb);
+ drop_super(sb);
+ return res;
+ }
+ return fsync_no_super(bdev);
}
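/*
 * A minimal usage sketch of the helpers above, assuming a hypothetical
 * teardown path: write out and wait upon everything backing a block device,
 * then drop its now-clean pagecache. The function name is illustrative;
 * fsync_bdev() and invalidate_bdev() are the interfaces defined in this file.
 */
static void example_flush_and_invalidate(struct block_device *bdev)
{
	fsync_bdev(bdev);		/* write out and wait on dirty data */
	invalidate_bdev(bdev, 0);	/* then toss the clean pagecache */
}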
+/*
+ * Write out and wait upon all dirty data associated with this
+ * kdev_t. Filesystem data as well as the underlying block
+ * device. Takes the superblock lock.
+ */
int fsync_dev(kdev_t dev)
{
struct block_device *bdev = bdget(kdev_t_to_nr(dev));
return 0;
}
-int fsync_bdev(struct block_device *bdev)
-{
- struct super_block *sb = get_super(to_kdev_t(bdev->bd_dev));
- if (sb) {
- int res = fsync_super(sb);
- drop_super(sb);
- return res;
- }
- return fsync_no_super(bdev);
-}
-
+/*
+ * sync everything.
+ */
asmlinkage long sys_sync(void)
{
- sync_all_buffers(0);
-
- lock_kernel();
- sync_inodes();
+ sync_inodes(); /* All mappings and inodes, including block devices */
DQUOT_SYNC(NULL);
- sync_supers();
- unlock_kernel();
-
- sync_all_buffers(1);
+ sync_supers(); /* Write the superblocks */
+ sync_inodes(); /* All the mappings and inodes, again. */
return 0;
}
/*
- * filp may be NULL if called via the msync of a vma.
+ * Generic function to fsync a file.
+ *
+ * filp may be NULL if called via the msync of a vma.
*/
int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
struct super_block * sb;
int ret;
- lock_kernel();
/* sync the inode to buffers */
write_inode_now(inode, 0);
/* .. finally sync the buffers to disk */
ret = sync_buffers(sb->s_bdev, 1);
- unlock_kernel();
return ret;
}
return ret;
}
-/* After several hours of tedious analysis, the following hash
- * function won. Do not mess with it... -DaveM
+/*
+ * Various filesystems appear to want __get_hash_table to be non-blocking.
+ * But it's the page lock which protects the buffers. To get around this,
+ * we get exclusion from try_to_free_buffers with the inode's
+ * i_bufferlist_lock.
+ *
+ * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
+ * may be quite high. This code could TryLock the page, and if that
+ * succeeds, there is no need to take i_bufferlist_lock. (But if
+ * i_bufferlist_lock is contended then so is mapping->page_lock).
*/
-#define _hashfn(dev,block) \
- ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
- (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
- ((block) << (bh_hash_shift - 12))))
-#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
-
-static inline void __insert_into_hash_list(struct buffer_head *bh)
+struct buffer_head *
+__get_hash_table(struct block_device *bdev, sector_t block, int unused)
{
- struct buffer_head **head = &hash(to_kdev_t(bh->b_bdev->bd_dev), bh->b_blocknr);
- struct buffer_head *next = *head;
-
- *head = bh;
- bh->b_pprev = head;
- bh->b_next = next;
- if (next != NULL)
- next->b_pprev = &bh->b_next;
-}
-
-static __inline__ void __hash_unlink(struct buffer_head *bh)
-{
- struct buffer_head **pprev = bh->b_pprev;
- if (pprev) {
- struct buffer_head *next = bh->b_next;
- if (next)
- next->b_pprev = pprev;
- *pprev = next;
- bh->b_pprev = NULL;
- }
-}
-
-static void __insert_into_lru_list(struct buffer_head * bh, int blist)
-{
- struct buffer_head **bhp = &lru_list[blist];
+ struct inode * const inode = bdev->bd_inode;
+ struct buffer_head *ret = NULL;
+ unsigned long index;
+ struct buffer_head *bh;
+ struct buffer_head *head;
+ struct page *page;
- if (bh->b_prev_free || bh->b_next_free) BUG();
+ index = block >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ page = find_get_page(inode->i_mapping, index);
+ if (!page)
+ goto out;
- if(!*bhp) {
- *bhp = bh;
- bh->b_prev_free = bh;
- }
- bh->b_next_free = *bhp;
- bh->b_prev_free = (*bhp)->b_prev_free;
- (*bhp)->b_prev_free->b_next_free = bh;
- (*bhp)->b_prev_free = bh;
- nr_buffers_type[blist]++;
- size_buffers_type[blist] += bh->b_size;
-}
-
-static void __remove_from_lru_list(struct buffer_head * bh)
-{
- struct buffer_head *next = bh->b_next_free;
- if (next) {
- struct buffer_head *prev = bh->b_prev_free;
- int blist = bh->b_list;
-
- prev->b_next_free = next;
- next->b_prev_free = prev;
- if (lru_list[blist] == bh) {
- if (next == bh)
- next = NULL;
- lru_list[blist] = next;
+ spin_lock(&inode->i_bufferlist_lock);
+ if (!page_has_buffers(page))
+ goto out_unlock;
+ head = page_buffers(page);
+ bh = head;
+ do {
+ if (bh->b_blocknr == block) {
+ ret = bh;
+ get_bh(bh);
+ goto out_unlock;
}
- bh->b_next_free = NULL;
- bh->b_prev_free = NULL;
- nr_buffers_type[blist]--;
- size_buffers_type[blist] -= bh->b_size;
- }
-}
-
-/* must be called with both the hash_table_lock and the lru_list_lock
- held */
-static void __remove_from_queues(struct buffer_head *bh)
-{
- __hash_unlink(bh);
- __remove_from_lru_list(bh);
-}
-
-static void remove_from_queues(struct buffer_head *bh)
-{
- spin_lock(&lru_list_lock);
- write_lock(&hash_table_lock);
- __remove_from_queues(bh);
- write_unlock(&hash_table_lock);
- spin_unlock(&lru_list_lock);
-}
-
-struct buffer_head * __get_hash_table(struct block_device *bdev, sector_t block, int size)
-{
- struct buffer_head *bh, **p = &hash(to_kdev_t(bdev->bd_dev), block);
-
- read_lock(&hash_table_lock);
-
- for (;;) {
- bh = *p;
- if (!bh)
- break;
- p = &bh->b_next;
- if (bh->b_blocknr != block)
- continue;
- if (bh->b_size != size)
- continue;
- if (bh->b_bdev != bdev)
- continue;
- get_bh(bh);
- break;
- }
-
- read_unlock(&hash_table_lock);
- return bh;
+ bh = bh->b_this_page;
+ } while (bh != head);
+ buffer_error();
+out_unlock:
+ spin_unlock(&inode->i_bufferlist_lock);
+ page_cache_release(page);
+out:
+ return ret;
}
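/*
 * A minimal sketch of how a caller might use __get_hash_table(): look up a
 * cached buffer for a block and drop the reference which the lookup took.
 * The function name is hypothetical; the third (unused) argument follows the
 * prototype above.
 */
static int example_block_is_cached(struct block_device *bdev, sector_t block)
{
	struct buffer_head *bh = __get_hash_table(bdev, block, 0);

	if (!bh)
		return 0;
	brelse(bh);	/* __get_hash_table() took a reference via get_bh() */
	return 1;
}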
-void buffer_insert_list(struct buffer_head *bh, struct list_head *list)
+void buffer_insert_list(spinlock_t *lock,
+ struct buffer_head *bh, struct list_head *list)
{
- spin_lock(&lru_list_lock);
+ if (lock == NULL)
+ lock = &global_bufferlist_lock;
+ spin_lock(lock);
if (bh->b_inode)
list_del(&bh->b_inode_buffers);
bh->b_inode = 1;
list_add(&bh->b_inode_buffers, list);
- spin_unlock(&lru_list_lock);
+ spin_unlock(lock);
}
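/*
 * A minimal sketch of the lock convention described near the top of the file:
 * a filesystem which owns its own buffer-list lock passes it in, while passing
 * NULL makes buffer_insert_list() (and fsync_buffers_list(), further down)
 * fall back to global_bufferlist_lock. The lock, list and function names here
 * are hypothetical caller-owned objects.
 */
static void example_queue_buffer(spinlock_t *my_lock,
		struct buffer_head *bh, struct list_head *my_list)
{
	buffer_insert_list(my_lock, bh, my_list);
}

static int example_sync_queued(spinlock_t *my_lock, struct list_head *my_list)
{
	/* write out and wait upon everything queued above */
	return fsync_buffers_list(my_lock, my_list);
}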
-/* The caller must have the lru_list lock before calling the
- remove_inode_queue functions. */
-static void __remove_inode_queue(struct buffer_head *bh)
-{
- bh->b_inode = 0;
- list_del(&bh->b_inode_buffers);
-}
-
-static inline void remove_inode_queue(struct buffer_head *bh)
+/*
+ * i_bufferlist_lock must be held
+ */
+static inline void __remove_inode_queue(struct buffer_head *bh)
{
- if (bh->b_inode)
- __remove_inode_queue(bh);
+ if (bh->b_inode) {
+ list_del(&bh->b_inode_buffers);
+ bh->b_inode = 0;
+ }
}
int inode_has_buffers(struct inode *inode)
{
int ret;
- spin_lock(&lru_list_lock);
- ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
- spin_unlock(&lru_list_lock);
+ spin_lock(&inode->i_bufferlist_lock);
+ ret = !list_empty(&inode->i_dirty_buffers) ||
+ !list_empty(&inode->i_dirty_data_buffers);
+ spin_unlock(&inode->i_bufferlist_lock);
return ret;
}
pass does the actual I/O. */
void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
{
- int i, nlist, slept;
- struct buffer_head * bh, * bh_next;
-
- retry:
- slept = 0;
- spin_lock(&lru_list_lock);
- for(nlist = 0; nlist < NR_LIST; nlist++) {
- bh = lru_list[nlist];
- if (!bh)
- continue;
- for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
- bh_next = bh->b_next_free;
-
- /* Another device? */
- if (bh->b_bdev != bdev)
- continue;
- /* Not hashed? */
- if (!bh->b_pprev)
- continue;
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&lru_list_lock);
- wait_on_buffer(bh);
- slept = 1;
- spin_lock(&lru_list_lock);
- put_bh(bh);
- }
-
- write_lock(&hash_table_lock);
- /* All buffers in the lru lists are mapped */
- if (!buffer_mapped(bh))
- BUG();
- if (buffer_dirty(bh))
- printk("invalidate: dirty buffer\n");
- if (!atomic_read(&bh->b_count)) {
- if (destroy_dirty_buffers || !buffer_dirty(bh)) {
- remove_inode_queue(bh);
- }
- } else
- printk("invalidate: busy buffer\n");
-
- write_unlock(&hash_table_lock);
- if (slept)
- goto out;
- }
- }
-out:
- spin_unlock(&lru_list_lock);
- if (slept)
- goto retry;
-
- /* Get rid of the page cache */
+ /*
+ * FIXME: what about destroy_dirty_buffers?
+ * We really want to use invalidate_inode_pages2() for
+ * that, but not until that's cleaned up.
+ */
invalidate_inode_pages(bdev->bd_inode);
}
}
}
+/*
+ * FIXME: What is this function actually trying to do? Why "zones[0]"?
+ * Is it still correct/needed if/when blockdev mappings use GFP_HIGHUSER?
+ */
static void free_more_memory(void)
{
- zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0];
-
- balance_dirty();
+ zone_t *zone;
+
+ zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0];
+
wakeup_bdflush();
try_to_free_pages(zone, GFP_NOFS, 0);
run_task_queue(&tq_disk);
yield();
}
-void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
-{
- bh->b_list = BUF_CLEAN;
- bh->b_end_io = handler;
- bh->b_private = private;
-}
-
-static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
+static void end_buffer_io_async(struct buffer_head *bh, int uptodate)
{
static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
unsigned long flags;
struct buffer_head *tmp;
struct page *page;
+ int page_uptodate = 1;
- mark_buffer_uptodate(bh, uptodate);
+ if (!uptodate)
+ printk("%s: I/O error\n", __FUNCTION__);
- /* This is a temporary buffer used for page I/O. */
+ mark_buffer_uptodate(bh, uptodate);
page = bh->b_page;
-
if (!uptodate)
SetPageError(page);
* Be _very_ careful from here on. Bad things can happen if
* two buffer heads end IO at almost the same time and both
* decide that the page is now completely done.
- *
- * Async buffer_heads are here only as labels for IO, and get
- * thrown away once the IO for this page is complete. IO is
- * deemed complete once all buffers have been visited
- * (b_count==0) and are now unlocked. We must make sure that
- * only the _last_ buffer that decrements its count is the one
- * that unlock the page..
*/
spin_lock_irqsave(&page_uptodate_lock, flags);
mark_buffer_async(bh, 0);
unlock_buffer(bh);
- tmp = bh->b_this_page;
- while (tmp != bh) {
- if (buffer_locked(tmp)) {
- if (buffer_async(tmp))
+ tmp = bh;
+ do {
+ if (!buffer_uptodate(tmp))
+ page_uptodate = 0;
+ if (buffer_async(tmp)) {
+ if (buffer_locked(tmp))
goto still_busy;
- } else if (!buffer_uptodate(tmp))
- SetPageError(page);
+ if (!buffer_mapped(bh))
+ BUG();
+ }
tmp = tmp->b_this_page;
- }
-
- /* OK, the async IO on this page is complete. */
+ } while (tmp != bh);
spin_unlock_irqrestore(&page_uptodate_lock, flags);
/*
- * if none of the buffers had errors then we can set the
- * page uptodate:
+ * If none of the buffers had errors and they are all
+ * uptodate then we can set the page uptodate.
*/
- if (!PageError(page))
+ if (page_uptodate && !PageError(page))
SetPageUptodate(page);
-
UnlockPage(page);
-
return;
still_busy:
return;
}
+/*
+ * If a page's buffers are under async writeout (end_buffer_io_async
+ * completion) then there is a possibility that another thread of
+ * control could lock one of the buffers after it has completed
+ * but while some of the other buffers have not completed. This
+ * locked buffer would confuse end_buffer_io_async() into not unlocking
+ * the page. So the absence of BH_Async tells end_buffer_io_async()
+ * that this buffer is not under async I/O.
+ *
+ * The page comes unlocked when it has no locked buffer_async buffers
+ * left.
+ *
+ * The page lock prevents anyone starting new async I/O against any of
+ * the buffers.
+ */
inline void set_buffer_async_io(struct buffer_head *bh)
{
bh->b_end_io = end_buffer_io_async;
mark_buffer_async(bh, 1);
}
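/*
 * A condensed sketch of how the writeout path further down
 * (__block_write_full_page) uses set_buffer_async_io() for each dirty,
 * mapped buffer: take the buffer lock, re-check the dirty bit under it,
 * switch to the async completion handler, clean the buffer and submit.
 * The function name is illustrative.
 */
static void example_start_async_write(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_dirty(bh)) {
		set_buffer_async_io(bh);  /* completion -> end_buffer_io_async */
		mark_buffer_clean(bh);
		submit_bh(WRITE, bh);
	} else {
		unlock_buffer(bh);	  /* someone cleaned it meanwhile */
	}
}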
+/*
+ * osync is designed to support O_SYNC io. It waits synchronously for
+ * all already-submitted IO to complete, but does not queue any new
+ * writes to the disk.
+ *
+ * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
+ * you dirty the buffers, and then use osync_inode_buffers to wait for
+ * completion. Any other dirty buffers which are not yet queued for
+ * write will not be flushed to disk by the osync.
+ */
+static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
+{
+ struct buffer_head *bh;
+ struct list_head *p;
+ int err = 0;
+
+ if (lock == NULL)
+ lock = &global_bufferlist_lock;
+
+ spin_lock(lock);
+repeat:
+ for (p = list->prev;
+ bh = BH_ENTRY(p), p != list;
+ p = bh->b_inode_buffers.prev) {
+ if (buffer_locked(bh)) {
+ get_bh(bh);
+ spin_unlock(lock);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ err = -EIO;
+ brelse(bh);
+ spin_lock(lock);
+ goto repeat;
+ }
+ }
+ spin_unlock(lock);
+ return err;
+}
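/*
 * A minimal sketch of the O_SYNC pattern described above: the write is queued
 * with ll_rw_block() once the buffer is dirty, then the list is walked to wait
 * for the IO which was already submitted. Filesystems reach this through a
 * wrapper (the comment names osync_inode_buffers); the direct call below is
 * only possible inside this file because the function is static.
 */
static int example_osync_one(struct inode *inode, struct buffer_head *bh)
{
	ll_rw_block(WRITE, 1, &bh);	/* queue the already-dirty buffer */
	return osync_buffers_list(&inode->i_bufferlist_lock,
					&inode->i_dirty_data_buffers);
}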
+
/*
* Synchronise all the inode's dirty buffers to the disk.
*
* the osync code to catch these locked, dirty buffers without requeuing
* any newly dirty buffers for write.
*/
-
-int fsync_buffers_list(struct list_head *list)
+int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
struct buffer_head *bh;
struct list_head tmp;
int err = 0, err2;
+
+ if (lock == NULL)
+ lock = &global_bufferlist_lock;
INIT_LIST_HEAD(&tmp);
-
- spin_lock(&lru_list_lock);
+ spin_lock(lock);
while (!list_empty(list)) {
bh = BH_ENTRY(list->next);
list_del(&bh->b_inode_buffers);
list_add(&bh->b_inode_buffers, &tmp);
if (buffer_dirty(bh)) {
get_bh(bh);
- spin_unlock(&lru_list_lock);
+ spin_unlock(lock);
ll_rw_block(WRITE, 1, &bh);
brelse(bh);
- spin_lock(&lru_list_lock);
+ spin_lock(lock);
}
}
}
while (!list_empty(&tmp)) {
bh = BH_ENTRY(tmp.prev);
- remove_inode_queue(bh);
+ __remove_inode_queue(bh);
get_bh(bh);
- spin_unlock(&lru_list_lock);
+ spin_unlock(lock);
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
err = -EIO;
brelse(bh);
- spin_lock(&lru_list_lock);
+ spin_lock(lock);
}
- spin_unlock(&lru_list_lock);
- err2 = osync_buffers_list(list);
-
+ spin_unlock(lock);
+ err2 = osync_buffers_list(lock, list);
if (err)
return err;
else
}
/*
- * osync is designed to support O_SYNC io. It waits synchronously for
- * all already-submitted IO to complete, but does not queue any new
- * writes to the disk.
- *
- * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
- * you dirty the buffers, and then use osync_inode_buffers to wait for
- * completion. Any other dirty buffers which are not yet queued for
- * write will not be flushed to disk by the osync.
+ * Invalidate any and all dirty buffers on a given inode. We are
+ * probably unmounting the fs, but that doesn't mean we have already
+ * done a sync(). Just drop the buffers from the inode list.
*/
+void invalidate_inode_buffers(struct inode *inode)
+{
+ struct list_head * entry;
+
+ spin_lock(&inode->i_bufferlist_lock);
+ while ((entry = inode->i_dirty_buffers.next) !=
+ &inode->i_dirty_buffers)
+ __remove_inode_queue(BH_ENTRY(entry));
+ while ((entry = inode->i_dirty_data_buffers.next) !=
+ &inode->i_dirty_data_buffers)
+ __remove_inode_queue(BH_ENTRY(entry));
+ spin_unlock(&inode->i_bufferlist_lock);
+}
-int osync_buffers_list(struct list_head *list)
+static void __put_unused_buffer_head(struct buffer_head * bh)
{
- struct buffer_head *bh;
- struct list_head *p;
- int err = 0;
+ if (bh->b_inode)
+ BUG();
+ if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
+ kmem_cache_free(bh_cachep, bh);
+ } else {
+ bh->b_bdev = NULL;
+ bh->b_blocknr = -1;
+ bh->b_this_page = NULL;
- spin_lock(&lru_list_lock);
-
- repeat:
-
- for (p = list->prev;
- bh = BH_ENTRY(p), p != list;
- p = bh->b_inode_buffers.prev) {
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&lru_list_lock);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- err = -EIO;
- brelse(bh);
- spin_lock(&lru_list_lock);
- goto repeat;
+ nr_unused_buffer_heads++;
+ bh->b_private = unused_list;
+ unused_list = bh;
+ if (waitqueue_active(&buffer_wait))
+ wake_up(&buffer_wait);
+ }
+}
+
+void put_unused_buffer_head(struct buffer_head *bh)
+{
+ spin_lock(&unused_list_lock);
+ __put_unused_buffer_head(bh);
+ spin_unlock(&unused_list_lock);
+}
+EXPORT_SYMBOL(put_unused_buffer_head);
+
+/*
+ * Create the appropriate buffers when given a page for data area and
+ * the size of each buffer.. Use the bh->b_this_page linked list to
+ * follow the buffers created. Return NULL if unable to create more
+ * buffers.
+ * The async flag is used to differentiate async IO (paging, swapping)
+ * from ordinary buffer allocations, and only async requests are allowed
+ * to sleep waiting for buffer heads.
+ */
+static struct buffer_head *
+create_buffers(struct page * page, unsigned long size, int async)
+{
+ struct buffer_head *bh, *head;
+ long offset;
+
+try_again:
+ head = NULL;
+ offset = PAGE_SIZE;
+ while ((offset -= size) >= 0) {
+ bh = get_unused_buffer_head(async);
+ if (!bh)
+ goto no_grow;
+
+ bh->b_bdev = NULL;
+ bh->b_this_page = head;
+ head = bh;
+
+ bh->b_state = 0;
+ atomic_set(&bh->b_count, 0);
+ bh->b_size = size;
+
+ /* Link the buffer to its page */
+ set_bh_page(bh, page, offset);
+
+ bh->b_end_io = NULL;
+ }
+ return head;
+/*
+ * In case anything failed, we just free everything we got.
+ */
+no_grow:
+ if (head) {
+ spin_lock(&unused_list_lock);
+ do {
+ bh = head;
+ head = head->b_this_page;
+ __put_unused_buffer_head(bh);
+ } while (head);
+ spin_unlock(&unused_list_lock);
+ }
+
+ /*
+ * Return failure for non-async IO requests. Async IO requests
+ * are not allowed to fail, so we have to wait until buffer heads
+ * become available. But we don't want tasks sleeping with
+ * partially complete buffers, so all were released above.
+ */
+ if (!async)
+ return NULL;
+
+ /* We're _really_ low on memory. Now we just
+ * wait for old buffer heads to become free due to
+ * finishing IO. Since this is an async request and
+ * the reserve list is empty, we're sure there are
+ * async buffer heads in use.
+ */
+ run_task_queue(&tq_disk);
+
+ free_more_memory();
+ goto try_again;
+}
+
+static inline void
+link_dev_buffers(struct page *page, struct buffer_head *head)
+{
+ struct buffer_head *bh, *tail;
+
+ bh = head;
+ do {
+ tail = bh;
+ bh = bh->b_this_page;
+ } while (bh);
+ tail->b_this_page = head;
+ __set_page_buffers(page, head);
+}
+
+/*
+ * Initialise the state of a blockdev page's buffers.
+ */
+static /*inline*/ void
+init_page_buffers(struct page *page, struct block_device *bdev,
+ int block, int size)
+{
+ struct buffer_head *head = page_buffers(page);
+ struct buffer_head *bh = head;
+ unsigned int b_state;
+
+ b_state = 1 << BH_Mapped;
+ if (Page_Uptodate(page))
+ b_state |= 1 << BH_Uptodate;
+
+ do {
+ if (!(bh->b_state & (1 << BH_Mapped))) {
+ init_buffer(bh, NULL, NULL);
+ bh->b_bdev = bdev;
+ bh->b_blocknr = block;
+ bh->b_state = b_state;
}
+ block++;
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+
+/*
+ * Create the page-cache page that contains the requested block.
+ *
+ * This is used purely for blockdev mappings.
+ */
+static /*inline*/ struct page *
+grow_dev_page(struct block_device *bdev, unsigned long block,
+ unsigned long index, int size)
+{
+ struct inode *inode = bdev->bd_inode;
+ struct page *page;
+ struct buffer_head *bh;
+
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page)
+ return NULL;
+
+ if (!PageLocked(page))
+ BUG();
+
+ if (page_has_buffers(page)) {
+ bh = page_buffers(page);
+ if (bh->b_size == size)
+ return page;
+ if (!try_to_free_buffers(page))
+ goto failed;
}
- spin_unlock(&lru_list_lock);
- return err;
+ /*
+ * Allocate some buffers for this page
+ */
+ bh = create_buffers(page, size, 0);
+ if (!bh)
+ goto failed;
+
+ /*
+ * Link the page to the buffers and initialise them. Take the
+ * lock to be atomic wrt __get_hash_table(), which does not
+ * run under the page lock.
+ */
+ spin_lock(&inode->i_bufferlist_lock);
+ link_dev_buffers(page, bh);
+ init_page_buffers(page, bdev, block, size);
+ spin_unlock(&inode->i_bufferlist_lock);
+ return page;
+
+failed:
+ buffer_error();
+ UnlockPage(page);
+ page_cache_release(page);
+ return NULL;
}
-/*
- * Invalidate any and all dirty buffers on a given inode. We are
- * probably unmounting the fs, but that doesn't mean we have already
- * done a sync(). Just drop the buffers from the inode list.
- */
-void invalidate_inode_buffers(struct inode *inode)
-{
- struct list_head * entry;
-
- spin_lock(&lru_list_lock);
- while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
- remove_inode_queue(BH_ENTRY(entry));
- while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
- remove_inode_queue(BH_ENTRY(entry));
- spin_unlock(&lru_list_lock);
+/*
+ * Create buffers for the specified block device block's page. If
+ * that page was dirty, the buffers are set dirty also.
+ *
+ * Except that's a bug. Attaching dirty buffers to a dirty
+ * blockdev's page can result in filesystem corruption, because
+ * some of those buffers may be aliases of filesystem data.
+ * grow_dev_page() will go BUG() if this happens.
+ */
+static inline int
+grow_buffers(struct block_device *bdev, unsigned long block, int size)
+{
+ struct page *page;
+ unsigned long index;
+ int sizebits;
+
+ /* Size must be multiple of hard sectorsize */
+ if (size & (bdev_hardsect_size(bdev)-1))
+ BUG();
+ if (size < 512 || size > PAGE_SIZE)
+ BUG();
+
+ sizebits = -1;
+ do {
+ sizebits++;
+ } while ((size << sizebits) < PAGE_SIZE);
+
+ index = block >> sizebits;
+ block = index << sizebits;
+
+ /* Create a page with the proper size buffers.. */
+ page = grow_dev_page(bdev, block, index, size);
+ if (!page)
+ return 0;
+ UnlockPage(page);
+ page_cache_release(page);
+ return 1;
}
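/*
 * Worked example of the index arithmetic above, assuming 1024-byte buffers on
 * a 4096-byte page: the loop leaves sizebits == 2, so block 103 lands on page
 * index 103 >> 2 == 25, and the first block of that page is 25 << 2 == 100,
 * which is the value passed down to grow_dev_page().
 */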
-
/*
- * Ok, this is getblk, and it isn't very clear, again to hinder
- * race-conditions. Most of the code is seldom used, (ie repeating),
- * so it should be much more efficient than it looks.
+ * __getblk will locate (and, if necessary, create) the buffer_head
+ * which corresponds to the passed block_device, block and size. The
+ * returned buffer has its reference count incremented.
*
- * The algorithm is changed: hopefully better, and an elusive bug removed.
+ * __getblk() cannot fail - it just keeps trying. If you pass it an
+ * illegal block number, __getblk() will happily return a buffer_head
+ * which represents the non-existent block. Very weird.
*
- * 14.02.92: changed it to sync dirty buffers a bit: better performance
- * when the filesystem starts to get full of dirty blocks (I hope).
+ * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
+ * attempt is failing. FIXME, perhaps?
*/
-struct buffer_head * __getblk(struct block_device *bdev, sector_t block, int size)
+struct buffer_head *
+__getblk(struct block_device *bdev, sector_t block, int size)
{
for (;;) {
struct buffer_head * bh;
bh = __get_hash_table(bdev, block, size);
- if (bh)
+ if (bh) {
+ touch_buffer(bh);
return bh;
+ }
if (!grow_buffers(bdev, block, size))
free_more_memory();
}
}
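/*
 * A minimal sketch of using __getblk() when the caller intends to overwrite
 * the whole block, so no read is needed: get the buffer, fill b_data, mark it
 * uptodate and dirty, and release the reference. The function name and the
 * all-zeroes contents are hypothetical.
 */
static void example_overwrite_block(struct block_device *bdev,
					sector_t block, int size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);	/* new block contents */
	mark_buffer_uptodate(bh, 1);
	mark_buffer_dirty(bh);
	unlock_buffer(bh);
	brelse(bh);
}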
-/* -1 -> no need to flush
- 0 -> async flush
- 1 -> sync flush (wait for I/O completion) */
-static int balance_dirty_state(void)
-{
- unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
-
- dirty = (size_buffers_type[BUF_DIRTY] + size_buffers_type[BUF_LOCKED]) >> PAGE_SHIFT;
- tot = nr_free_buffer_pages();
-
- dirty *= 100;
- soft_dirty_limit = tot * bdf_prm.b_un.nfract;
- hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
-
- /* First, check for the "real" dirty limit. */
- if (dirty > soft_dirty_limit) {
- if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
- return 1;
- return 0;
- }
-
- return -1;
-}
-
/*
- * if a new dirty buffer is created we need to balance bdflush.
+ * The relationship between dirty buffers and dirty pages:
+ *
+ * Whenever a page has any dirty buffers, the page's dirty bit is set, and
+ * the page appears on its address_space.dirty_pages list.
+ *
+ * At all times, the dirtiness of the buffers represents the dirtiness of
+ * subsections of the page. If the page has buffers, the page dirty bit is
+ * merely a hint about the true dirty state.
*
- * in the future we might want to make bdflush aware of different
- * pressures on different devices - thus the (currently unused)
- * 'dev' parameter.
+ * When a page is set dirty in its entirety, all its buffers are marked dirty
+ * (if the page has buffers).
+ *
+ * When a buffer is marked dirty, its page is dirtied, but the page's other
+ * buffers are not.
+ *
+ * Also. When blockdev buffers are explicitly read with bread(), they
+ * individually become uptodate. But their backing page remains not
+ * uptodate - even if all of its buffers are uptodate. A subsequent
+ * block_read_full_page() against that page will discover all the uptodate
+ * buffers, will set the page uptodate and will perform no I/O.
*/
-void balance_dirty(void)
-{
- int state = balance_dirty_state();
-
- if (state < 0)
- return;
-
- /* If we're getting into imbalance, start write-out */
- spin_lock(&lru_list_lock);
- write_some_buffers(NULL);
-
- /*
- * And if we're _really_ out of balance, wait for
- * some of the dirty/locked buffers ourselves and
- * start bdflush.
- * This will throttle heavy writers.
- */
- if (state > 0) {
- wait_for_some_buffers(NULL);
- wakeup_bdflush();
- }
-}
-
-inline void __mark_dirty(struct buffer_head *bh)
+static inline void __mark_dirty(struct buffer_head *bh)
{
- bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
- refile_buffer(bh);
-}
-
-/* atomic version, the user must call balance_dirty() by hand
- as soon as it become possible to block */
-void __mark_buffer_dirty(struct buffer_head *bh)
-{
- if (!atomic_set_buffer_dirty(bh))
- __mark_dirty(bh);
+ __set_page_dirty_nobuffers(bh->b_page);
}
+/**
+ * mark_buffer_dirty - mark a buffer_head as needing writeout
+ *
+ * mark_buffer_dirty() will set the dirty bit against the buffer,
+ * then set its backing page dirty, then attach the page to its
+ * address_space's dirty_pages list and then attach the address_space's
+ * inode to its superblock's dirty inode list.
+ *
+ * mark_buffer_dirty() is atomic. It takes inode->i_bufferlist_lock,
+ * mapping->page_lock and the global inode_lock.
+ */
void mark_buffer_dirty(struct buffer_head *bh)
{
- if (!atomic_set_buffer_dirty(bh)) {
+ if (!atomic_set_buffer_dirty(bh))
__mark_dirty(bh);
- balance_dirty();
- }
-}
-
-void set_buffer_flushtime(struct buffer_head *bh)
-{
- bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
-}
-EXPORT_SYMBOL(set_buffer_flushtime);
-
-/*
- * A buffer may need to be moved from one buffer list to another
- * (e.g. in case it is not shared any more). Handle this.
- */
-static void __refile_buffer(struct buffer_head *bh)
-{
- int dispose = BUF_CLEAN;
- if (buffer_locked(bh))
- dispose = BUF_LOCKED;
- if (buffer_dirty(bh))
- dispose = BUF_DIRTY;
- if (dispose != bh->b_list) {
- __remove_from_lru_list(bh);
- bh->b_list = dispose;
- if (dispose == BUF_CLEAN)
- remove_inode_queue(bh);
- __insert_into_lru_list(bh, dispose);
- }
-}
-
-void refile_buffer(struct buffer_head *bh)
-{
- spin_lock(&lru_list_lock);
- __refile_buffer(bh);
- spin_unlock(&lru_list_lock);
}
/*
- * Release a buffer head
+ * Decrement a buffer_head's reference count. If all buffers against a page
+ * have zero reference count, are clean and unlocked, and if the page is clean
+ * and unlocked then try_to_free_buffers() may strip the buffers from the page
+ * in preparation for freeing it (sometimes, rarely, buffers are removed from
+ * a page but it ends up not being freed, and buffers may later be reattached).
*/
void __brelse(struct buffer_head * buf)
{
return;
}
printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
+ buffer_error(); /* For the stack backtrace */
}
/*
}
/**
- * bread() - reads a specified block and returns the bh
- * @block: number of block
- * @size: size (in bytes) to read
+ * bread() - reads a specified block and returns the bh
+ * @block: number of block
+ * @size: size (in bytes) to read
*
- * Reads a specified block, and returns buffer head that
- * contains it. It returns NULL if the block was unreadable.
+ * Reads a specified block, and returns buffer head that contains it.
+ * It returns NULL if the block was unreadable.
*/
struct buffer_head * __bread(struct block_device *bdev, int block, int size)
{
- struct buffer_head * bh = __getblk(bdev, block, size);
+ struct buffer_head *bh = __getblk(bdev, block, size);
- touch_buffer(bh);
if (buffer_uptodate(bh))
return bh;
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
+ lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
return bh;
- brelse(bh);
- return NULL;
-}
-
-/*
- * Note: the caller should wake up the buffer_wait list if needed.
- */
-static void __put_unused_buffer_head(struct buffer_head * bh)
-{
- if (bh->b_inode)
- BUG();
- if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
- kmem_cache_free(bh_cachep, bh);
} else {
- bh->b_bdev = NULL;
- bh->b_blocknr = -1;
- bh->b_this_page = NULL;
-
- nr_unused_buffer_heads++;
- bh->b_next_free = unused_list;
- unused_list = bh;
+ if (buffer_dirty(bh))
+ buffer_error();
+ get_bh(bh);
+ bh->b_end_io = end_buffer_io_sync;
+ submit_bh(READ, bh);
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ return bh;
}
+ brelse(bh);
+ return NULL;
}
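/*
 * A minimal read-modify-write sketch built on __bread(): read the block,
 * patch one byte, mark the buffer dirty so the writeback machinery picks it
 * up, and drop the reference. The function name and the byte being patched
 * are hypothetical.
 */
static int example_patch_byte(struct block_device *bdev, int block,
				int size, int offset, char val)
{
	struct buffer_head *bh = __bread(bdev, block, size);

	if (!bh)
		return -EIO;		/* the block was unreadable */
	bh->b_data[offset] = val;
	mark_buffer_dirty(bh);
	brelse(bh);
	return 0;
}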
-void put_unused_buffer_head(struct buffer_head *bh)
-{
- spin_lock(&unused_list_lock);
- __put_unused_buffer_head(bh);
- spin_unlock(&unused_list_lock);
-}
-EXPORT_SYMBOL(put_unused_buffer_head);
-
/*
* Reserve NR_RESERVED buffer heads for async IO requests to avoid
* no-buffer-head deadlock. Return NULL on failure; waiting for
spin_lock(&unused_list_lock);
if (nr_unused_buffer_heads > NR_RESERVED) {
bh = unused_list;
- unused_list = bh->b_next_free;
+ unused_list = bh->b_private;
nr_unused_buffer_heads--;
spin_unlock(&unused_list_lock);
return bh;
spin_lock(&unused_list_lock);
if (unused_list) {
bh = unused_list;
- unused_list = bh->b_next_free;
+ unused_list = bh->b_private;
nr_unused_buffer_heads--;
spin_unlock(&unused_list_lock);
return bh;
}
EXPORT_SYMBOL(get_unused_buffer_head);
-void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
+void set_bh_page(struct buffer_head *bh,
+ struct page *page, unsigned long offset)
{
bh->b_page = page;
if (offset >= PAGE_SIZE)
}
EXPORT_SYMBOL(set_bh_page);
-/*
- * Create the appropriate buffers when given a page for data area and
- * the size of each buffer.. Use the bh->b_this_page linked list to
- * follow the buffers created. Return NULL if unable to create more
- * buffers.
- * The async flag is used to differentiate async IO (paging, swapping)
- * from ordinary buffer allocations, and only async requests are allowed
- * to sleep waiting for buffer heads.
- */
-static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
-{
- struct buffer_head *bh, *head;
- long offset;
-
-try_again:
- head = NULL;
- offset = PAGE_SIZE;
- while ((offset -= size) >= 0) {
- bh = get_unused_buffer_head(async);
- if (!bh)
- goto no_grow;
-
- bh->b_bdev = NULL;
- bh->b_this_page = head;
- head = bh;
-
- bh->b_state = 0;
- bh->b_next_free = NULL;
- bh->b_pprev = NULL;
- atomic_set(&bh->b_count, 0);
- bh->b_size = size;
-
- set_bh_page(bh, page, offset);
-
- bh->b_list = BUF_CLEAN;
- bh->b_end_io = NULL;
- }
- return head;
-/*
- * In case anything failed, we just free everything we got.
- */
-no_grow:
- if (head) {
- spin_lock(&unused_list_lock);
- do {
- bh = head;
- head = head->b_this_page;
- __put_unused_buffer_head(bh);
- } while (head);
- spin_unlock(&unused_list_lock);
-
- /* Wake up any waiters ... */
- wake_up(&buffer_wait);
- }
-
- /*
- * Return failure for non-async IO requests. Async IO requests
- * are not allowed to fail, so we have to wait until buffer heads
- * become available. But we don't want tasks sleeping with
- * partially complete buffers, so all were released above.
- */
- if (!async)
- return NULL;
-
- /* We're _really_ low on memory. Now we just
- * wait for old buffer heads to become free due to
- * finishing IO. Since this is an async request and
- * the reserve list is empty, we're sure there are
- * async buffer heads in use.
- */
- run_task_queue(&tq_disk);
-
- free_more_memory();
- goto try_again;
-}
-
/*
* Called when truncating a buffer on a page completely.
*/
mark_buffer_clean(bh);
lock_buffer(bh);
bh->b_bdev = NULL;
- clear_bit(BH_Uptodate, &bh->b_state);
clear_bit(BH_Mapped, &bh->b_state);
clear_bit(BH_Req, &bh->b_state);
clear_bit(BH_New, &bh->b_state);
- remove_from_queues(bh);
unlock_buffer(bh);
}
}
/**
- * try_to_release_page - release old fs-specific metadata on a page
+ * try_to_release_page() - release old fs-specific metadata on a page
+ *
+ * @page: the page which the kernel is trying to free
+ * @gfp_mask: memory allocation flags (and I/O mode)
*
+ * The address_space is to try to release any data against the page
+ * (presumably at page->private). If the release was successful, return `1'.
+ * Otherwise return zero.
+ *
+ * The @gfp_mask argument specifies whether I/O may be performed to release
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ *
+ * NOTE: @gfp_mask may go away, and this function may become non-blocking.
*/
-
-int try_to_release_page(struct page * page, int gfp_mask)
+int try_to_release_page(struct page *page, int gfp_mask)
{
struct address_space * const mapping = page->mapping;
if (mapping && mapping->a_ops->releasepage)
return mapping->a_ops->releasepage(page, gfp_mask);
- return try_to_free_buffers(page, gfp_mask);
+ return try_to_free_buffers(page);
}
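/*
 * A minimal sketch of the a_ops hook which try_to_release_page() dispatches
 * to: a filesystem with no private metadata of its own can simply fall back
 * to stripping the buffers. The function name is hypothetical; the prototype
 * is inferred from the dispatch through mapping->a_ops->releasepage above.
 */
static int example_releasepage(struct page *page, int gfp_mask)
{
	/* nothing fs-specific to drop; just try to free the buffers */
	return try_to_free_buffers(page);
}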
-/*
- * We don't have to release all buffers here, but
- * we have to be sure that no dirty buffer is left
- * and no IO is going on (no buffer is locked), because
- * we have truncated the file and are going to free the
- * blocks on-disk..
+/**
+ * block_flushpage - invalidate part or all of a buffer-backed page
+ *
+ * @page: the page which is affected
+ * @offset: the index of the truncation point
+ *
+ * block_flushpage() should be called block_invalidatepage(). It is
+ * called when all or part of the page has become invalidated by a truncate
+ * operation.
+ *
+ * block_flushpage() does not have to release all buffers, but it must
+ * ensure that no dirty buffer is left outside @offset and that no I/O
+ * is underway against any of the blocks which are outside the truncation
+ * point. Because the caller is about to free (and possibly reuse) those
+ * blocks on-disk.
*/
-int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
+int block_flushpage(struct page *page, unsigned long offset)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
next = bh->b_this_page;
/*
- * is this block fully flushed?
+ * is this block fully invalidated?
*/
if (offset <= curr_off)
discard_buffer(bh);
} while (bh != head);
/*
- * subtle. We release buffer-heads only if this is
- * the 'final' flushpage. We have invalidated the get_block
- * cached value unconditionally, so real IO is not
- * possible anymore.
- *
- * If the free doesn't work out, the buffers can be
- * left around - they just turn into anonymous buffers
- * instead.
+ * We release buffers only if the entire page is being invalidated.
+ * The get_block cached value has been unconditionally invalidated,
+ * so real IO is not possible anymore.
*/
- if (!offset) {
+ if (offset == 0) {
if (!try_to_release_page(page, 0))
return 0;
}
return 1;
}
-void create_empty_buffers(struct page *page, unsigned long blocksize)
+/*
+ * We attach and possibly dirty the buffers atomically wrt
+ * __set_page_dirty_buffers() via i_bufferlist_lock. try_to_free_buffers
+ * is already excluded via the page lock.
+ */
+void create_empty_buffers(struct page *page,
+ unsigned long blocksize, unsigned long b_state)
{
struct buffer_head *bh, *head, *tail;
- /* FIXME: create_buffers should fail if there's no enough memory */
head = create_buffers(page, blocksize, 1);
- if (page_has_buffers(page))
- BUG();
-
bh = head;
do {
bh->b_end_io = NULL;
+ bh->b_state |= b_state;
tail = bh;
bh = bh->b_this_page;
} while (bh);
tail->b_this_page = head;
- set_page_buffers(page, head);
- page_cache_get(page);
+
+ spin_lock(&page->mapping->host->i_bufferlist_lock);
+ if (PageDirty(page)) {
+ bh = head;
+ do {
+ set_bit(BH_Dirty, &bh->b_state);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ }
+ __set_page_buffers(page, head);
+ spin_unlock(&page->mapping->host->i_bufferlist_lock);
}
EXPORT_SYMBOL(create_empty_buffers);
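/*
 * A minimal sketch of how the write paths below use create_empty_buffers():
 * if the page has no buffers yet, attach a clean, unmapped set sized to the
 * inode's block size before walking them. This mirrors what
 * __block_prepare_write() does; the function name is illustrative.
 */
static void example_ensure_buffers(struct inode *inode, struct page *page)
{
	if (!page_has_buffers(page))
		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
}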
* don't want to mark the alias unmapped, for example - it would confuse
* anyone who might pick it with bread() afterwards...
*/
-
-static void unmap_underlying_metadata(struct buffer_head * bh)
+static void unmap_underlying_metadata(struct buffer_head *bh)
{
struct buffer_head *old_bh;
- old_bh = __get_hash_table(bh->b_bdev, bh->b_blocknr, bh->b_size);
+ old_bh = __get_hash_table(bh->b_bdev, bh->b_blocknr, 0);
if (old_bh) {
+#if 0 /* This happens. Later. */
+ if (buffer_dirty(old_bh))
+ buffer_error();
+#endif
mark_buffer_clean(old_bh);
wait_on_buffer(old_bh);
clear_bit(BH_Req, &old_bh->b_state);
*/
/*
- * block_write_full_page() is SMP threaded - the kernel lock is not held.
+ * While block_write_full_page is writing back the dirty buffers under
+ * the page lock, whoever dirtied the buffers may decide to clean them
+ * again at any time. We handle that by only looking at the buffer
+ * state inside lock_buffer().
*/
-static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
+static int __block_write_full_page(struct inode *inode,
+ struct page *page, get_block_t *get_block)
{
- int err, i;
+ int err;
unsigned long block;
+ unsigned long last_block;
struct buffer_head *bh, *head;
- int need_unlock;
+ int nr_underway = 0;
if (!PageLocked(page))
BUG();
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << inode->i_blkbits);
- head = page_buffers(page);
+ last_block = (inode->i_size - 1) >> inode->i_blkbits;
- block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ if (!page_has_buffers(page)) {
+ if (S_ISBLK(inode->i_mode))
+ buffer_error();
+ if (!Page_Uptodate(page))
+ buffer_error();
+ create_empty_buffers(page, 1 << inode->i_blkbits,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
+ }
+ block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ head = page_buffers(page);
bh = head;
- i = 0;
- /* Stage 1: make sure we have all the buffers mapped! */
+ /*
+ * Get all the dirty buffers mapped to disk addresses and
+ * handle any aliases from the underlying blockdev's mapping.
+ */
do {
- /*
- * If the buffer isn't up-to-date, we can't be sure
- * that the buffer has been initialized with the proper
- * block number information etc..
- *
- * Leave it to the low-level FS to make all those
- * decisions (block #0 may actually be a valid block)
- */
- if (!buffer_mapped(bh)) {
+ if (block > last_block) {
+ if (buffer_dirty(bh))
+ buffer_error();
+ if (buffer_mapped(bh))
+ buffer_error();
+ /*
+ * NOTE: this buffer can only be marked uptodate
+ * because we know that block_write_full_page has
+ * zeroed it out. That seems unnecessary and may go
+ * away.
+ */
+ mark_buffer_uptodate(bh, 1);
+ } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+ if (buffer_new(bh))
+ buffer_error();
err = get_block(inode, block, bh, 1);
if (err)
- goto out;
- if (buffer_new(bh))
+ goto recover;
+ if (buffer_new(bh)) {
+ /* blockdev mappings never come here */
+ clear_bit(BH_New, &bh->b_state);
unmap_underlying_metadata(bh);
+ }
}
bh = bh->b_this_page;
block++;
} while (bh != head);
- /* Stage 2: lock the buffers, mark them clean */
do {
- lock_buffer(bh);
- set_buffer_async_io(bh);
- set_bit(BH_Uptodate, &bh->b_state);
- clear_bit(BH_Dirty, &bh->b_state);
+ get_bh(bh);
+ if (buffer_dirty(bh)) {
+ lock_buffer(bh);
+ if (buffer_dirty(bh)) {
+ if (!buffer_mapped(bh))
+ buffer_error();
+ if (!buffer_uptodate(bh))
+ buffer_error();
+ set_buffer_async_io(bh);
+ } else {
+ unlock_buffer(bh);
+ }
+ }
bh = bh->b_this_page;
} while (bh != head);
- /* Stage 3: submit the IO */
+ /*
+ * The page may come unlocked any time after the *first* submit_bh()
+ * call. Be careful with its buffers.
+ */
do {
struct buffer_head *next = bh->b_this_page;
- submit_bh(WRITE, bh);
+ if (buffer_async(bh)) {
+ mark_buffer_clean(bh);
+ submit_bh(WRITE, bh);
+ nr_underway++;
+ }
+ put_bh(bh);
bh = next;
} while (bh != head);
- /* Done - end_buffer_io_async will unlock */
- SetPageUptodate(page);
- return 0;
-
-out:
+ err = 0;
+done:
+ if (nr_underway == 0) {
+ /*
+ * The page was marked dirty, but the buffers were
+ * clean. Someone wrote them back by hand with
+ * ll_rw_block/submit_bh. A rare case.
+ */
+ int uptodate = 1;
+ do {
+ if (!buffer_uptodate(bh)) {
+ uptodate = 0;
+ break;
+ }
+ bh = bh->b_this_page;
+ } while (bh != head);
+ if (uptodate)
+ SetPageUptodate(page);
+ UnlockPage(page);
+ }
+ return err;
+recover:
/*
* ENOSPC, or some other error. We may already have added some
* blocks to the file, so we need to write these out to avoid
*/
ClearPageUptodate(page);
bh = head;
- need_unlock = 1;
/* Recovery: lock and submit the mapped buffers */
do {
if (buffer_mapped(bh)) {
lock_buffer(bh);
set_buffer_async_io(bh);
- need_unlock = 0;
+ } else {
+ /*
+ * The buffer may have been set dirty during
+ * attachment to a dirty page.
+ */
+ mark_buffer_clean(bh);
}
bh = bh->b_this_page;
} while (bh != head);
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_mapped(bh)) {
- set_bit(BH_Uptodate, &bh->b_state);
- clear_bit(BH_Dirty, &bh->b_state);
+ mark_buffer_uptodate(bh, 1);
+ mark_buffer_clean(bh);
submit_bh(WRITE, bh);
+ nr_underway++;
}
bh = next;
} while (bh != head);
- if (need_unlock)
- UnlockPage(page);
- return err;
+ goto done;
}
static int __block_prepare_write(struct inode *inode, struct page *page,
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
char *kaddr = kmap(page);
+ BUG_ON(!PageLocked(page));
+ BUG_ON(from > PAGE_CACHE_SIZE);
+ BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > to);
+
blocksize = 1 << inode->i_blkbits;
if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize);
+ create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);
bbits = inode->i_blkbits;
for(bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
- if (!bh)
- BUG();
- block_end = block_start+blocksize;
- if (block_end <= from)
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (Page_Uptodate(page))
+ mark_buffer_uptodate(bh, 1);
continue;
- if (block_start >= to)
- break;
+ }
clear_bit(BH_New, &bh->b_state);
if (!buffer_mapped(bh)) {
err = get_block(inode, block, bh, 1);
if (err)
goto out;
if (buffer_new(bh)) {
+ clear_bit(BH_New, &bh->b_state);
unmap_underlying_metadata(bh);
if (Page_Uptodate(page)) {
- set_bit(BH_Uptodate, &bh->b_state);
+ if (!buffer_mapped(bh))
+ buffer_error();
+ mark_buffer_uptodate(bh, 1);
continue;
}
if (block_end > to)
memset(kaddr+to, 0, block_end-to);
if (block_start < from)
- memset(kaddr+block_start, 0, from-block_start);
+ memset(kaddr+block_start,
+ 0, from-block_start);
if (block_end > to || block_start < from)
flush_dcache_page(page);
continue;
}
}
if (Page_Uptodate(page)) {
- set_bit(BH_Uptodate, &bh->b_state);
+ mark_buffer_uptodate(bh, 1);
continue;
}
if (!buffer_uptodate(bh) &&
if (block_start >= to)
break;
if (buffer_new(bh)) {
+ clear_bit(BH_New, &bh->b_state);
if (buffer_uptodate(bh))
- printk(KERN_ERR "%s: zeroing uptodate buffer!\n", __FUNCTION__);
+ buffer_error();
memset(kaddr+block_start, 0, bh->b_size);
- set_bit(BH_Uptodate, &bh->b_state);
+ mark_buffer_uptodate(bh, 1);
mark_buffer_dirty(bh);
}
next_bh:
unsigned from, unsigned to)
{
unsigned block_start, block_end;
- int partial = 0, need_balance_dirty = 0;
+ int partial = 0;
unsigned blocksize;
struct buffer_head *bh, *head;
if (!buffer_uptodate(bh))
partial = 1;
} else {
- set_bit(BH_Uptodate, &bh->b_state);
+ mark_buffer_uptodate(bh, 1);
if (!atomic_set_buffer_dirty(bh)) {
__mark_dirty(bh);
buffer_insert_inode_data_queue(bh, inode);
- need_balance_dirty = 1;
}
}
}
- if (need_balance_dirty)
- balance_dirty();
/*
- * is this a partial write that happened to make all buffers
+ * If this is a partial write which happened to make all buffers
* uptodate then we can optimize away a bogus readpage() for
- * the next read(). Here we 'discover' wether the page went
+ * the next read(). Here we 'discover' whether the page went
* uptodate as a result of this (potentially partial) write.
*/
if (!partial)
if (!PageLocked(page))
PAGE_BUG(page);
+ if (Page_Uptodate(page))
+ buffer_error();
blocksize = 1 << inode->i_blkbits;
if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize);
+ create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);
blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
memset(kmap(page) + i*blocksize, 0, blocksize);
flush_dcache_page(page);
kunmap(page);
- set_bit(BH_Uptodate, &bh->b_state);
+ mark_buffer_uptodate(bh, 1);
continue;
}
- /* get_block() might have updated the buffer synchronously */
+ /*
+ * get_block() might have updated the buffer
+ * synchronously
+ */
if (buffer_uptodate(bh))
continue;
}
-
- arr[nr] = bh;
- nr++;
+ arr[nr++] = bh;
} while (i++, iblock++, (bh = bh->b_this_page) != head);
if (!nr) {
for (i = 0; i < nr; i++) {
struct buffer_head * bh = arr[i];
lock_buffer(bh);
+ if (buffer_uptodate(bh))
+ buffer_error();
+ if (buffer_dirty(bh))
+ buffer_error();
set_buffer_async_io(bh);
}
-
- /* Stage 3: start the IO */
- for (i = 0; i < nr; i++)
- submit_bh(READ, arr[i]);
-
+
+ /*
+ * Stage 3: start the IO. Check for uptodateness
+ * inside the buffer lock in case another process reading
+ * the underlying blockdev brought it uptodate (the sct fix).
+ */
+ for (i = 0; i < nr; i++) {
+ struct buffer_head * bh = arr[i];
+ if (buffer_uptodate(bh))
+ end_buffer_io_async(bh, 1);
+ else
+ submit_bh(READ, bh);
+ }
return 0;
}
* We may have to extend the file.
*/
-int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
+int cont_prepare_write(struct page *page, unsigned offset,
+ unsigned to, get_block_t *get_block, unsigned long *bytes)
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
kaddr = page_address(new_page);
memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
flush_dcache_page(new_page);
- __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
+ __block_commit_write(inode, new_page,
+ zerofrom, PAGE_CACHE_SIZE);
kunmap(new_page);
UnlockPage(new_page);
page_cache_release(new_page);
return 0;
}
-int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
+int block_truncate_page(struct address_space *mapping,
+ loff_t from, get_block_t *get_block)
{
unsigned long index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
goto out;
if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize);
+ create_empty_buffers(page, blocksize, 0);
/* Find the buffer that contains "offset" */
bh = page_buffers(page);
/* Ok, it's mapped. Make sure it's up-to-date */
if (Page_Uptodate(page))
- set_bit(BH_Uptodate, &bh->b_state);
+ mark_buffer_uptodate(bh, 1);
if (!buffer_uptodate(bh)) {
err = -EIO;
flush_dcache_page(page);
kunmap(page);
- __mark_buffer_dirty(bh);
+ mark_buffer_dirty(bh);
err = 0;
unlock:
return err;
}
+/*
+ * The generic ->writepage function for buffer-backed address_spaces
+ */
int block_write_full_page(struct page *page, get_block_t *get_block)
{
- struct inode *inode = page->mapping->host;
- unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ struct inode * const inode = page->mapping->host;
+ const unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
unsigned offset;
- int err;
+ char *kaddr;
- /* easy case */
+ /* Is the page fully inside i_size? */
if (page->index < end_index)
return __block_write_full_page(inode, page, get_block);
- /* things got complicated... */
+ /* Is the page fully outside i_size? (truncate in progress) */
offset = inode->i_size & (PAGE_CACHE_SIZE-1);
- /* OK, are we completely out? */
if (page->index >= end_index+1 || !offset) {
UnlockPage(page);
return -EIO;
}
- /* Sigh... will have to work, then... */
- err = __block_prepare_write(inode, page, 0, offset, get_block);
- if (!err) {
- memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
- flush_dcache_page(page);
- __block_commit_write(inode,page,0,offset);
-done:
- kunmap(page);
- UnlockPage(page);
- return err;
- }
- ClearPageUptodate(page);
- goto done;
+ /* The page straddles i_size */
+ kaddr = kmap(page);
+ memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+ flush_dcache_page(page);
+ kunmap(page);
+ return __block_write_full_page(inode, page, get_block);
}
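
For illustration of the three cases above, assume a 4096-byte PAGE_CACHE_SIZE and i_size == 10000: end_index is then 2 and offset is 1808, so pages 0 and 1 are fully inside i_size and take the fast path, page 2 straddles i_size and has bytes 1808..4095 zeroed before writeout, and page 3 onwards is fully outside i_size and gets -EIO.
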
/*
*/
int writeout_one_page(struct page *page)
{
- struct buffer_head *bh, *head = page_buffers(page);
-
- if (!PageLocked(page))
- BUG();
+ struct buffer_head * const head = page_buffers(page);
+ struct buffer_head *arr[MAX_BUF_PER_PAGE];
+ struct buffer_head *bh;
+ int nr = 0;
+ BUG_ON(!PageLocked(page));
bh = head;
do {
- if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
- continue;
-
- bh->b_flushtime = jiffies;
- ll_rw_block(WRITE, 1, &bh);
+ if (!buffer_locked(bh) && buffer_dirty(bh) &&
+ buffer_mapped(bh) && buffer_uptodate(bh))
+ arr[nr++] = bh;
} while ((bh = bh->b_this_page) != head);
+ if (nr)
+ ll_rw_block(WRITE, nr, arr);
return 0;
}
EXPORT_SYMBOL(writeout_one_page);
return tmp.b_blocknr;
}
-int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
+int generic_direct_IO(int rw, struct inode *inode,
+ struct kiobuf *iobuf, unsigned long blocknr,
+ int blocksize, get_block_t *get_block)
{
int i, nr_blocks, retval;
sector_t *blocks = iobuf->blocks;
}
/* This does not understand multi-device filesystems currently */
- retval = brw_kiovec(rw, 1, &iobuf, inode->i_sb->s_bdev, blocks, blocksize);
+ retval = brw_kiovec(rw, 1, &iobuf,
+ inode->i_sb->s_bdev, blocks, blocksize);
out:
return retval;
* before I/O is complete. You then have to check page->locked
* and page->uptodate.
*
- * brw_page() is SMP-safe, although it's being called with the
- * kernel lock held - but the code is ready.
- *
* FIXME: we need a swapper_inode->get_block function to remove
* some of the bmap kludges and interface ugliness here.
*/
-int brw_page(int rw, struct page *page, struct block_device *bdev, sector_t b[], int size)
+int brw_page(int rw, struct page *page,
+ struct block_device *bdev, sector_t b[], int size)
{
struct buffer_head *head, *bh;
- if (!PageLocked(page))
- panic("brw_page: page not locked for I/O");
+ BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
- create_empty_buffers(page, size);
+ create_empty_buffers(page, size, 0);
head = bh = page_buffers(page);
/* Stage 1: lock all the buffers */
bh->b_blocknr = *(b++);
bh->b_bdev = bdev;
set_bit(BH_Mapped, &bh->b_state);
+ if (rw == WRITE) /* To support submit_bh debug tests */
+ mark_buffer_uptodate(bh, 1);
set_buffer_async_io(bh);
bh = bh->b_this_page;
} while (bh != head);
return err;
}
-static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
+/*
+ * Sanity checks for try_to_free_buffers.
+ */
+static void check_ttfb_buffer(struct page *page, struct buffer_head *bh)
{
- struct buffer_head *bh, *tail;
-
- bh = head;
- do {
- tail = bh;
- bh = bh->b_this_page;
- } while (bh);
- tail->b_this_page = head;
- set_page_buffers(page, head);
- page_cache_get(page);
+ if (!buffer_uptodate(bh)) {
+ if (Page_Uptodate(page) && page->mapping
+ && buffer_mapped(bh) /* discard_buffer */
+ && S_ISBLK(page->mapping->host->i_mode))
+ {
+ buffer_error();
+ }
+ }
}
/*
- * Create the page-cache page that contains the requested block
+ * try_to_free_buffers() checks if all the buffers on this particular page
+ * are unused, and releases them if so.
+ *
+ * Exclusion against try_to_free_buffers may be obtained by either
+ * locking the page or by holding its inode's i_bufferlist_lock.
+ *
+ * If the page is dirty but all the buffers are clean then we need to
+ * be sure to mark the page clean as well. This is because the page
+ * may be against a block device, and a later reattachment of buffers
+ * to a dirty page will set *all* buffers dirty. Which would corrupt
+ * filesystem data on the same device.
+ *
+ * The same applies to regular filesystem pages: if all the buffers are
+ * clean then we set the page clean and proceed. To do that, we require
+ * total exclusion from __set_page_dirty_buffers(). That is obtained with
+ * i_bufferlist_lock.
+ *
+ * Nobody should be calling try_to_free_buffers against a page which is
+ * eligible for set_page_dirty() treatment anyway - the page is clearly
+ * not freeable. So we could just test page_count(page) here and complain
+ * then scram if it's wrong.
+ *
+ * If any buffer is not uptodate then the entire page is set not uptodate,
+ * as the partial uptodateness information is about to be lost.
+ *
+ * try_to_free_buffers() is non-blocking.
*/
-static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
+static inline int buffer_busy(struct buffer_head *bh)
{
- struct page * page;
- struct buffer_head *bh;
-
- page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
- if (!page)
- return NULL;
-
- if (!PageLocked(page))
- BUG();
-
- if (page_has_buffers(page)) {
- bh = page_buffers(page);
- if (bh->b_size == size)
- return page;
- if (!try_to_free_buffers(page, GFP_NOFS))
- goto failed;
- }
-
- bh = create_buffers(page, size, 0);
- if (!bh)
- goto failed;
- link_dev_buffers(page, bh);
- return page;
-
-failed:
- UnlockPage(page);
- page_cache_release(page);
- return NULL;
+ return atomic_read(&bh->b_count) |
+ (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
-static void hash_page_buffers(struct page *page, struct block_device *bdev, int block, int size)
+static /*inline*/ int drop_buffers(struct page *page)
{
struct buffer_head *head = page_buffers(page);
- struct buffer_head *bh = head;
- unsigned int uptodate;
-
- uptodate = 1 << BH_Mapped;
- if (Page_Uptodate(page))
- uptodate |= 1 << BH_Uptodate;
+ struct buffer_head *bh;
+ int was_uptodate = 1;
- write_lock(&hash_table_lock);
+ bh = head;
do {
- if (!(bh->b_state & (1 << BH_Mapped))) {
- init_buffer(bh, NULL, NULL);
- bh->b_bdev = bdev;
- bh->b_blocknr = block;
- bh->b_state = uptodate;
- }
-
- /* Insert the buffer into the hash lists if necessary */
- if (!bh->b_pprev)
- __insert_into_hash_list(bh);
-
- block++;
+ check_ttfb_buffer(page, bh);
+ if (buffer_busy(bh))
+ goto failed;
+ if (!buffer_uptodate(bh))
+ was_uptodate = 0;
bh = bh->b_this_page;
} while (bh != head);
- write_unlock(&hash_table_lock);
-}
-
-/*
- * Try to increase the number of buffers available: the size argument
- * is used to determine what kind of buffers we want.
- */
-static int grow_buffers(struct block_device *bdev, unsigned long block, int size)
-{
- struct page * page;
- unsigned long index;
- int sizebits;
- /* Size must be multiple of hard sectorsize */
- if (size & (bdev_hardsect_size(bdev)-1))
- BUG();
- /* Size must be within 512 bytes and PAGE_SIZE */
- if (size < 512 || size > PAGE_SIZE)
- BUG();
+ if (!was_uptodate && Page_Uptodate(page))
+ buffer_error();
- sizebits = -1;
+ spin_lock(&unused_list_lock);
do {
- sizebits++;
- } while ((size << sizebits) < PAGE_SIZE);
-
- index = block >> sizebits;
- block = index << sizebits;
-
- /* Create a page with the proper size buffers.. */
- page = grow_dev_page(bdev, index, size);
-
- if (!page)
- return 0;
-
- /* Hash in the buffers on the hash list */
- hash_page_buffers(page, bdev, block, size);
- UnlockPage(page);
- page_cache_release(page);
+ struct buffer_head *next = bh->b_this_page;
- /* We hashed up this page, so increment buffermem */
- atomic_inc(&buffermem_pages);
+ __remove_inode_queue(bh);
+ __put_unused_buffer_head(bh);
+ bh = next;
+ } while (bh != head);
+ spin_unlock(&unused_list_lock);
+ __clear_page_buffers(page);
return 1;
+failed:
+ return 0;
}
-static int sync_page_buffers(struct buffer_head *head, unsigned int gfp_mask)
-{
- struct buffer_head * bh = head;
- int tryagain = 0;
-
- do {
- if (!buffer_dirty(bh) && !buffer_locked(bh))
- continue;
-
- /* Don't start IO first time around.. */
- if (!test_and_set_bit(BH_Wait_IO, &bh->b_state))
- continue;
-
- /* Second time through we start actively writing out.. */
- if (test_and_set_bit(BH_Lock, &bh->b_state)) {
- if (!test_bit(BH_launder, &bh->b_state))
- continue;
- wait_on_buffer(bh);
- tryagain = 1;
- continue;
- }
-
- if (!atomic_set_buffer_clean(bh)) {
- unlock_buffer(bh);
- continue;
- }
-
- __mark_buffer_clean(bh);
- get_bh(bh);
- set_bit(BH_launder, &bh->b_state);
- bh->b_end_io = end_buffer_io_sync;
- submit_bh(WRITE, bh);
- tryagain = 0;
- } while ((bh = bh->b_this_page) != head);
-
- return tryagain;
-}
-
-/*
- * Can the buffer be thrown out?
- */
-#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock))
-#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
-
-/*
- * try_to_free_buffers() checks if all the buffers on this particular page
- * are unused, and free's the page if so.
- *
- * Wake up bdflush() if this fails - if we're running low on memory due
- * to dirty buffers, we need to flush them out as quickly as possible.
- *
- * NOTE: There are quite a number of ways that threads of control can
- * obtain a reference to a buffer head within a page. So we must
- * lock out all of these paths to cleanly toss the page.
- */
-int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
+int try_to_free_buffers(struct page *page)
{
- struct buffer_head * tmp, * bh = page_buffers(page);
+ struct inode *inode;
+ int ret = 0;
BUG_ON(!PageLocked(page));
- BUG_ON(!bh);
-
-cleaned_buffers_try_again:
- spin_lock(&lru_list_lock);
- write_lock(&hash_table_lock);
- tmp = bh;
- do {
- if (buffer_busy(tmp))
- goto busy_buffer_page;
- tmp = tmp->b_this_page;
- } while (tmp != bh);
-
- spin_lock(&unused_list_lock);
- tmp = bh;
-
- /* if this buffer was hashed, this page counts as buffermem */
- if (bh->b_pprev)
- atomic_dec(&buffermem_pages);
- do {
- struct buffer_head * p = tmp;
- tmp = tmp->b_this_page;
- remove_inode_queue(p);
- __remove_from_queues(p);
- __put_unused_buffer_head(p);
- } while (tmp != bh);
- spin_unlock(&unused_list_lock);
-
- /* Wake up anyone waiting for buffer heads */
- wake_up(&buffer_wait);
- /* And free the page */
- clear_page_buffers(page);
- page_cache_release(page);
- write_unlock(&hash_table_lock);
- spin_unlock(&lru_list_lock);
- return 1;
+ if (page->mapping == NULL) /* swapped-in anon page */
+ return drop_buffers(page);
-busy_buffer_page:
- /* Uhhuh, start writeback so that we don't end up with all dirty pages */
- write_unlock(&hash_table_lock);
- spin_unlock(&lru_list_lock);
- if (gfp_mask & __GFP_IO) {
- if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
- if (sync_page_buffers(bh, gfp_mask)) {
- /* no IO or waiting next time */
- gfp_mask = 0;
- goto cleaned_buffers_try_again;
- }
- }
- }
- if (balance_dirty_state() >= 0)
- wakeup_bdflush();
- return 0;
+ inode = page->mapping->host;
+ spin_lock(&inode->i_bufferlist_lock);
+ ret = drop_buffers(page);
+ if (ret)
+ ClearPageDirty(page);
+ spin_unlock(&inode->i_bufferlist_lock);
+ return ret;
}
EXPORT_SYMBOL(try_to_free_buffers);
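
A minimal sketch of the calling convention for the new try_to_free_buffers() (hypothetical caller, not part of the patch): the caller must hold the page lock, and the i_bufferlist_lock exclusion described above is taken internally.

	/* Illustrative only */
	static int example_release_page(struct page *page)
	{
		int freed = 0;

		lock_page(page);	/* try_to_free_buffers() BUGs on unlocked pages */
		if (page_has_buffers(page))
			freed = try_to_free_buffers(page);
		UnlockPage(page);
		return freed;
	}
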
void show_buffers(void)
{
-#ifdef CONFIG_SMP
- struct buffer_head * bh;
- int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
- int nlist;
- static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
-#endif
-
printk("Buffer memory: %6dkB\n",
atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
-
-#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
- if (!spin_trylock(&lru_list_lock))
- return;
- for(nlist = 0; nlist < NR_LIST; nlist++) {
- found = locked = dirty = used = lastused = 0;
- bh = lru_list[nlist];
- if(!bh) continue;
-
- do {
- found++;
- if (buffer_locked(bh))
- locked++;
- if (buffer_dirty(bh))
- dirty++;
- if (atomic_read(&bh->b_count))
- used++, lastused = found;
- bh = bh->b_next_free;
- } while (bh != lru_list[nlist]);
- {
- int tmp = nr_buffers_type[nlist];
- if (found != tmp)
- printk("%9s: BUG -> found %d, reported %d\n",
- buf_types[nlist], found, tmp);
- }
- printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
- "%d locked, %d dirty\n",
- buf_types[nlist], found, size_buffers_type[nlist]>>10,
- used, lastused, locked, dirty);
- }
- spin_unlock(&lru_list_lock);
-#endif
-}
-
-/* ===================== Init ======================= */
-
-/*
- * allocate the hash table and init the free list
- * Use gfp() for the hash table to decrease TLB misses, use
- * SLAB cache for buffer heads.
- */
-void __init buffer_init(unsigned long mempages)
-{
- int order, i;
- unsigned int nr_hash;
-
- /* The buffer cache hash table is less important these days,
- * trim it a bit.
- */
- mempages >>= 14;
-
- mempages *= sizeof(struct buffer_head *);
-
- for (order = 0; (1 << order) < mempages; order++)
- ;
-
- /* try to allocate something until we get it or we're asking
- for something that is really too small */
-
- do {
- unsigned long tmp;
-
- nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
- bh_hash_mask = (nr_hash - 1);
-
- tmp = nr_hash;
- bh_hash_shift = 0;
- while((tmp >>= 1UL) != 0UL)
- bh_hash_shift++;
-
- hash_table = (struct buffer_head **)
- __get_free_pages(GFP_ATOMIC, order);
- } while (hash_table == NULL && --order > 0);
- printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
- nr_hash, order, (PAGE_SIZE << order));
-
- if (!hash_table)
- panic("Failed to allocate buffer hash table\n");
-
- /* Setup hash chains. */
- for(i = 0; i < nr_hash; i++)
- hash_table[i] = NULL;
-
- /* Setup lru lists. */
- for(i = 0; i < NR_LIST; i++)
- lru_list[i] = NULL;
-
-}
-
-/*
- * Here we attempt to write back old buffers. We also try to flush inodes
- * and supers as well, since this function is essentially "update", and
- * otherwise there would be no way of ensuring that these quantities ever
- * get written back. Ideally, we would have a timestamp on the inodes
- * and superblocks so that we could write back only the old ones as well
- */
-
-static void sync_old_buffers(unsigned long dummy)
-{
- sync_unlocked_inodes();
- sync_supers();
-
- for (;;) {
- struct buffer_head *bh;
-
- spin_lock(&lru_list_lock);
- bh = lru_list[BUF_DIRTY];
- if (!bh || time_before(jiffies, bh->b_flushtime))
- break;
- if (write_some_buffers(NULL))
- continue;
- return;
- }
- spin_unlock(&lru_list_lock);
}
int block_sync_page(struct page *page)
return 0;
}
-/* This is the interface to bdflush. As we get more sophisticated, we can
- * pass tuning parameters to this "process", to adjust how it behaves.
- * We would want to verify each parameter, however, to make sure that it
- * is reasonable. */
-
+/*
+ * There are no bdflush tunables left. But distributions are
+ * still running obsolete flush daemons, so we terminate them here.
+ */
asmlinkage long sys_bdflush(int func, long data)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
-
- if (func == 1) {
- /* do_exit directly and let kupdate to do its work alone. */
+ if (func == 1)
do_exit(0);
-#if 0 /* left here as it's the only example of lazy-mm-stuff used from
- a syscall that doesn't care about the current mm context. */
- int error;
- struct mm_struct *user_mm;
-
- /*
- * bdflush will spend all of it's time in kernel-space,
- * without touching user-space, so we can switch it into
- * 'lazy TLB mode' to reduce the cost of context-switches
- * to and from bdflush.
- */
- user_mm = start_lazy_tlb();
- error = sync_old_buffers();
- end_lazy_tlb(user_mm);
- return error;
-#endif
- }
-
- /* Basically func 1 means read param 1, 2 means write param 1, etc */
- if (func >= 2) {
- int i = (func-2) >> 1;
- if (i >= 0 && i < N_PARAM) {
- if ((func & 1) == 0)
- return put_user(bdf_prm.data[i], (int*)data);
-
- if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
- bdf_prm.data[i] = data;
- return 0;
- }
- }
- return -EINVAL;
- }
-
- /* Having func 0 used to launch the actual bdflush and then never
- * return (unless explicitly killed). We return zero here to
- * remain semi-compatible with present update(8) programs.
- */
return 0;
}
-static void bdflush(unsigned long pexclude)
-{
- while (balance_dirty_state() >= 0) {
- spin_lock(&lru_list_lock);
- if (write_some_buffers(NULL) == 0)
- break;
- }
- clear_bit(0, (unsigned long *)pexclude);
-}
-
void wakeup_bdflush(void)
{
- static unsigned long exclude;
-
- if (!test_and_set_bit(0, &exclude)) {
- if (pdflush_operation(bdflush, (unsigned long)&exclude))
- clear_bit(0, &exclude);
- }
-}
-
-/*
- * kupdate
- */
-static struct timer_list kupdate_timer;
-static void kupdate_handler(unsigned long dummy)
-{
- pdflush_operation(sync_old_buffers, 0);
- mod_timer(&kupdate_timer, jiffies + bdf_prm.b_un.interval);
-}
-
-static int __init kupdate_init(void)
-{
- init_timer(&kupdate_timer);
- kupdate_timer.expires = jiffies + bdf_prm.b_un.interval;
- kupdate_timer.data = 0;
- kupdate_timer.function = kupdate_handler;
- add_timer(&kupdate_timer);
- return 0;
+ pdflush_flush(0);
}
-
-module_init(kupdate_init)
-
/* bget() all the buffers */
if (order_data) {
- if (!page_has_buffers(page))
- create_empty_buffers(page, inode->i_sb->s_blocksize);
+ if (!page_has_buffers(page)) {
+ if (!Page_Uptodate(page))
+ buffer_error();
+ create_empty_buffers(page,
+ inode->i_sb->s_blocksize,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
+ }
page_bufs = page_buffers(page);
walk_page_buffers(handle, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bget_one);
goto out;
if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize);
+ create_empty_buffers(page, blocksize, 0);
/* Find the buffer that contains "offset" */
bh = page_buffers(page);
} else {
if (ext3_should_order_data(inode))
err = ext3_journal_dirty_data(handle, bh, 0);
- __mark_buffer_dirty(bh);
+ mark_buffer_dirty(bh);
}
unlock:
--- /dev/null
+/*
+ * fs/fs-writeback.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * Contains all the functions related to writing back and waiting
+ * upon dirty inodes against superblocks, and writing back dirty
+ * pages against inodes. ie: data writeback. Writeout of the
+ * inode itself is not handled here.
+ *
+ * 10Apr2002 akpm@zip.com.au
+ * Split out of fs/inode.c
+ * Additions for address_space-based writeback
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/writeback.h>
+
+/**
+ * __mark_inode_dirty - internal function
+ * @inode: inode to mark
+ * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ * Mark an inode as dirty. Callers should use mark_inode_dirty or
+ * mark_inode_dirty_sync.
+ *
+ * Put the inode on the super block's dirty list.
+ *
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
+ *
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
+ *
+ * This function *must* be atomic for the I_DIRTY_PAGES case -
+ * set_page_dirty() is called under spinlock in several places.
+ */
+void __mark_inode_dirty(struct inode *inode, int flags)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!sb)
+ return; /* swapper_space */
+
+ /*
+ * Don't do this for I_DIRTY_PAGES - that doesn't actually
+ * dirty the inode itself
+ */
+ if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ if (sb->s_op && sb->s_op->dirty_inode)
+ sb->s_op->dirty_inode(inode);
+ }
+
+ /* avoid the locking if we can */
+ if ((inode->i_state & flags) == flags)
+ return;
+
+ spin_lock(&inode_lock);
+ if ((inode->i_state & flags) != flags) {
+ inode->i_state |= flags;
+
+ /*
+ * If the inode is locked, just update its dirty state.
+ * The unlocker will place the inode on the appropriate
+ * superblock list, based upon its state.
+ */
+ if (inode->i_state & I_LOCK)
+ goto same_list;
+
+ /*
+ * Only add valid (hashed) inode to the superblock's
+ * dirty list. Add blockdev inodes as well.
+ */
+ if (list_empty(&inode->i_hash) && !S_ISBLK(inode->i_mode))
+ goto same_list;
+ if (inode->i_mapping->dirtied_when == 0)
+ inode->i_mapping->dirtied_when = jiffies;
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &sb->s_dirty);
+ }
+same_list:
+ spin_unlock(&inode_lock);
+}
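
The mark_inode_dirty()/mark_inode_dirty_sync() entry points mentioned in the comment above are thin inline wrappers, roughly as in include/linux/fs.h:

	static inline void mark_inode_dirty(struct inode *inode)
	{
		__mark_inode_dirty(inode, I_DIRTY);
	}

	static inline void mark_inode_dirty_sync(struct inode *inode)
	{
		__mark_inode_dirty(inode, I_DIRTY_SYNC);
	}
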
+
+static inline void write_inode(struct inode *inode, int sync)
+{
+ if (inode->i_sb->s_op && inode->i_sb->s_op->write_inode &&
+ !is_bad_inode(inode))
+ inode->i_sb->s_op->write_inode(inode, sync);
+}
+
+/*
+ * Write a single inode's dirty pages and inode data out to disk.
+ * If `sync' is set, wait on the writeout.
+ * If `nr_to_write' is not NULL, subtract the number of written pages
+ * from *nr_to_write.
+ *
+ * Normally it is not legal for a single process to lock more than one
+ * page at a time, due to ab/ba deadlock problems. But writeback_mapping()
+ * does want to lock a large number of pages, without immediately submitting
+ * I/O against them (starting I/O is a "deferred unlock_page").
+ *
+ * However it *is* legal to lock multiple pages, if this is only ever performed
+ * by a single process. We provide that exclusion via locking in the
+ * filesystem's ->writeback_mapping a_op. This ensures that only a single
+ * process is locking multiple pages against this inode. And as I/O is
+ * submitted against all those locked pages, there is no deadlock.
+ *
+ * Called under inode_lock.
+ */
+static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
+{
+ unsigned dirty;
+ struct address_space *mapping = inode->i_mapping;
+
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode->i_sb->s_locked_inodes);
+
+ if (inode->i_state & I_LOCK)
+ BUG();
+
+ /* Set I_LOCK, reset I_DIRTY */
+ dirty = inode->i_state & I_DIRTY;
+ inode->i_state |= I_LOCK;
+ inode->i_state &= ~I_DIRTY;
+ spin_unlock(&inode_lock);
+
+ if (mapping->a_ops->writeback_mapping)
+ mapping->a_ops->writeback_mapping(mapping, nr_to_write);
+ else
+ filemap_fdatasync(mapping);
+
+ /* Don't write the inode if only I_DIRTY_PAGES was set */
+ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
+ write_inode(inode, wait);
+
+ if (wait)
+ filemap_fdatawait(mapping);
+
+ /*
+ * For non-blocking writeout (wait == 0), we still
+ * count the inode as being clean.
+ */
+ spin_lock(&inode_lock);
+
+ /*
+ * Did we write back all the pages?
+ */
+ if (nr_to_write && *nr_to_write == 0) {
+ /*
+ * Maybe not
+ */
+ if (!list_empty(&mapping->dirty_pages)) /* No lock needed */
+ inode->i_state |= I_DIRTY_PAGES;
+ }
+
+ inode->i_state &= ~I_LOCK;
+ if (!(inode->i_state & I_FREEING)) {
+ struct list_head *to;
+ if (inode->i_state & I_DIRTY)
+ to = &inode->i_sb->s_dirty;
+ else if (atomic_read(&inode->i_count))
+ to = &inode_in_use;
+ else
+ to = &inode_unused;
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, to);
+ }
+ wake_up(&inode->i_wait);
+}
+
+/*
+ * Write out an inode's dirty pages. Called under inode_lock.
+ */
+static void
+__writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
+{
+ while (inode->i_state & I_LOCK) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ __wait_on_inode(inode);
+ iput(inode);
+ spin_lock(&inode_lock);
+ }
+ __sync_single_inode(inode, sync, nr_to_write);
+}
+
+void writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
+{
+ spin_lock(&inode_lock);
+ __writeback_single_inode(inode, sync, nr_to_write);
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * Write out a list of inodes' pages, and the inode itself.
+ *
+ * If `sync' is true, wait on writeout of the last mapping
+ * which we write.
+ *
+ * If older_than_this is non-NULL, then only write out mappings which
+ * had their first dirtying at a time earlier than *older_than_this.
+ *
+ * Called under inode_lock.
+ *
+ * FIXME: putting all the inodes on a local list could introduce a
+ * race with umount. Bump the superblock refcount?
+ */
+static void __sync_list(struct list_head *head, int sync_mode,
+ int *nr_to_write, unsigned long *older_than_this)
+{
+ struct list_head * tmp;
+ LIST_HEAD(hold); /* Unready inodes go here */
+
+ while ((tmp = head->next) != head) {
+ struct inode *inode = list_entry(tmp, struct inode, i_list);
+ struct address_space *mapping = inode->i_mapping;
+ int really_sync;
+
+ if (older_than_this && *older_than_this) {
+ if (time_after(mapping->dirtied_when,
+ *older_than_this)) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &hold);
+ continue;
+ }
+ }
+ really_sync = (sync_mode == WB_SYNC_ALL);
+ if ((sync_mode == WB_SYNC_LAST) && (head->prev == head))
+ really_sync = 1;
+ __writeback_single_inode(inode, really_sync, nr_to_write);
+ if (nr_to_write && *nr_to_write == 0)
+ break;
+ }
+ /*
+ * Put the not-ready inodes back
+ */
+ if (!list_empty(&hold))
+ list_splice(&hold, head);
+}
+
+/*
+ * Start writeback of dirty pagecache data against all unlocked inodes.
+ *
+ * Note:
+ * We don't need to grab a reference to the superblock here. If it has a
+ * non-empty ->s_dirty then it hasn't been killed yet, and kill_super() won't
+ * proceed past sync_inodes_sb() until both ->s_dirty and ->s_locked_inodes
+ * are empty. Since __sync_single_inode() regains inode_lock before it finally
+ * moves the inode off the superblock's lists, we are OK.
+ *
+ * If `older_than_this' is non-NULL then only flush inodes which were
+ * first dirtied earlier than *older_than_this.  If *older_than_this is
+ * zero, we flush everything, like the old (dumb) wakeup_bdflush.
+ */
+void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
+ unsigned long *older_than_this)
+{
+ struct super_block * sb;
+ static unsigned short writeback_gen;
+
+ spin_lock(&inode_lock);
+ spin_lock(&sb_lock);
+
+ /*
+ * We could get into livelock here if someone is dirtying
+ * inodes fast enough. writeback_gen is used to avoid that.
+ */
+ writeback_gen++;
+
+ sb = sb_entry(super_blocks.prev);
+ for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
+ if (sb->s_writeback_gen == writeback_gen)
+ continue;
+ sb->s_writeback_gen = writeback_gen;
+
+ if (current->flags & PF_FLUSHER) {
+ if (sb->s_flags & MS_FLUSHING) {
+ /*
+ * There's no point in two pdflush threads
+ * flushing the same device. But for other
+ * callers, we want to perform the flush
+ * because the fdatasync is how we implement
+ * writer throttling.
+ */
+ continue;
+ }
+ sb->s_flags |= MS_FLUSHING;
+ }
+
+ if (!list_empty(&sb->s_dirty)) {
+ spin_unlock(&sb_lock);
+ __sync_list(&sb->s_dirty, sync_mode,
+ nr_to_write, older_than_this);
+ spin_lock(&sb_lock);
+ }
+ if (current->flags & PF_FLUSHER)
+ sb->s_flags &= ~MS_FLUSHING;
+ if (nr_to_write && *nr_to_write == 0)
+ break;
+ }
+ spin_unlock(&sb_lock);
+ spin_unlock(&inode_lock);
+}
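
A sketch of how a flushing thread might drive writeback_unlocked_inodes() (hypothetical caller; the batch quota and age cutoff are arbitrary illustrative values):

	/* Illustrative only */
	static void example_background_writeback(void)
	{
		int nr_to_write = 1024;				/* arbitrary quota */
		unsigned long oldest = jiffies - 30 * HZ;	/* arbitrary cutoff */

		writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest);
		/* nr_to_write now holds whatever quota was left unused */
	}
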
+
+/*
+ * Called under inode_lock
+ */
+static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes)
+{
+ struct list_head *tmp = head;
+ struct inode *inode;
+
+ while (nr_inodes && (tmp = tmp->prev) != head) {
+ inode = list_entry(tmp, struct inode, i_list);
+
+ if (!atomic_read(&inode->i_count)) {
+ __sync_single_inode(inode, 0, NULL);
+ nr_inodes--;
+
+ /*
+ * __sync_single_inode moved the inode to another list,
+ * so we have to start looking from the list head.
+ */
+ tmp = head;
+ }
+ }
+
+ return nr_inodes;
+}
+
+static void __wait_on_locked(struct list_head *head)
+{
+ struct list_head * tmp;
+ while ((tmp = head->prev) != head) {
+ struct inode *inode = list_entry(tmp, struct inode, i_list);
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ __wait_on_inode(inode);
+ iput(inode);
+ spin_lock(&inode_lock);
+ }
+}
+
+/*
+ * writeback and wait upon the filesystem's dirty inodes.
+ * We do it in two passes - one to write, and one to wait.
+ */
+void sync_inodes_sb(struct super_block *sb)
+{
+ spin_lock(&inode_lock);
+ while (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_locked_inodes)) {
+ __sync_list(&sb->s_dirty, WB_SYNC_NONE, NULL, NULL);
+ __sync_list(&sb->s_dirty, WB_SYNC_ALL, NULL, NULL);
+ __wait_on_locked(&sb->s_locked_inodes);
+ }
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * writeback the dirty inodes for this filesystem
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+ spin_lock(&inode_lock);
+ while (!list_empty(&sb->s_dirty))
+ __sync_list(&sb->s_dirty, WB_SYNC_NONE, NULL, NULL);
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * Find a superblock with inodes that need to be synced
+ */
+
+static struct super_block *get_super_to_sync(void)
+{
+ struct list_head *p;
+restart:
+ spin_lock(&inode_lock);
+ spin_lock(&sb_lock);
+ list_for_each(p, &super_blocks) {
+ struct super_block *s = list_entry(p,struct super_block,s_list);
+ if (list_empty(&s->s_dirty) && list_empty(&s->s_locked_inodes))
+ continue;
+ s->s_count++;
+ spin_unlock(&sb_lock);
+ spin_unlock(&inode_lock);
+ down_read(&s->s_umount);
+ if (!s->s_root) {
+ drop_super(s);
+ goto restart;
+ }
+ return s;
+ }
+ spin_unlock(&sb_lock);
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * sync_inodes - write out and wait upon all dirty inodes
+ *
+ * sync_inodes goes through the super block's dirty list,
+ * writes them out, waits on the writeout and puts the inodes
+ * back on the normal list.
+ */
+
+void sync_inodes(void)
+{
+ struct super_block * s;
+ /*
+ * Search the super_blocks array for the device(s) to sync.
+ */
+ while ((s = get_super_to_sync()) != NULL) {
+ sync_inodes_sb(s);
+ drop_super(s);
+ }
+}
+
+void try_to_writeback_unused_inodes(unsigned long pexclusive)
+{
+ struct super_block * sb;
+ int nr_inodes = inodes_stat.nr_unused;
+
+ spin_lock(&inode_lock);
+ spin_lock(&sb_lock);
+ sb = sb_entry(super_blocks.next);
+ for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
+ if (list_empty(&sb->s_dirty))
+ continue;
+ spin_unlock(&sb_lock);
+ nr_inodes = __try_to_writeback_unused_list(&sb->s_dirty, nr_inodes);
+ spin_lock(&sb_lock);
+ }
+ spin_unlock(&sb_lock);
+ spin_unlock(&inode_lock);
+ clear_bit(0, (unsigned long *)pexclusive);
+}
+
+/**
+ * write_inode_now - write an inode to disk
+ * @inode: inode to write to disk
+ * @sync: whether the write should be synchronous or not
+ *
+ * This function commits an inode to disk immediately if it is
+ * dirty. This is primarily needed by knfsd.
+ */
+
+void write_inode_now(struct inode *inode, int sync)
+{
+ spin_lock(&inode_lock);
+ __writeback_single_inode(inode, sync, NULL);
+ spin_unlock(&inode_lock);
+ if (sync)
+ wait_on_inode(inode);
+}
+
+/**
+ * generic_osync_inode - flush all dirty data for a given inode to disk
+ * @inode: inode to write
+ * @what: which OSYNC_* classes to flush (OSYNC_DATA, OSYNC_METADATA, OSYNC_INODE)
+ *
+ * This can be called by file_write functions for files which have the
+ * O_SYNC flag set, to flush dirty writes to disk.
+ */
+
+int generic_osync_inode(struct inode *inode, int what)
+{
+ int err = 0, err2 = 0, need_write_inode_now = 0;
+
+ /*
+ * WARNING
+ *
+ * Currently, the filesystem write path does not pass the
+ * filp down to the low-level write functions. Therefore it
+ * is impossible for (say) __block_commit_write to know if
+ * the operation is O_SYNC or not.
+ *
+ * Ideally, O_SYNC writes would have the filesystem call
+ * ll_rw_block as it went to kick-start the writes, and we
+ * could call osync_inode_buffers() here to wait only for
+ * those IOs which have already been submitted to the device
+ * driver layer. As it stands, if we did this we'd not write
+ * anything to disk since our writes have not been queued by
+ * this point: they are still on the dirty LRU.
+ *
+ * So, currently we will call fsync_inode_buffers() instead,
+ * to flush _all_ dirty buffers for this inode to disk on
+ * every O_SYNC write, not just the synchronous I/Os. --sct
+ */
+
+ if (what & OSYNC_DATA)
+ writeback_single_inode(inode, 0, NULL);
+ if (what & (OSYNC_METADATA|OSYNC_DATA))
+ err = fsync_inode_buffers(inode);
+ if (what & OSYNC_DATA) {
+ err2 = filemap_fdatasync(inode->i_mapping);
+ if (!err)
+ err = err2;
+ }
+
+ spin_lock(&inode_lock);
+ if ((inode->i_state & I_DIRTY) &&
+ ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
+ need_write_inode_now = 1;
+ spin_unlock(&inode_lock);
+
+ if (need_write_inode_now)
+ write_inode_now(inode, 1);
+ else
+ wait_on_inode(inode);
+
+ return err;
+}
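
A sketch of the intended use from a filesystem's fsync/O_SYNC path (hypothetical example; real filesystems choose the OSYNC_* flags to match their own metadata rules):

	/* Illustrative only */
	static int example_fsync(struct file *file, struct dentry *dentry, int datasync)
	{
		struct inode *inode = dentry->d_inode;
		int what = OSYNC_DATA | OSYNC_METADATA;

		if (!datasync)
			what |= OSYNC_INODE;
		return generic_osync_inode(inode, what);
	}
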
#include <linux/config.h>
#include <linux/fs.h>
-#include <linux/string.h>
#include <linux/mm.h>
#include <linux/dcache.h>
#include <linux/init.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
-#include <linux/cache.h>
-#include <linux/swap.h>
-#include <linux/swapctl.h>
-#include <linux/prefetch.h>
-#include <linux/locks.h>
+#include <linux/writeback.h>
/*
* New inode.c implementation.
* allowing for low-overhead inode sync() operations.
*/
-static LIST_HEAD(inode_in_use);
-static LIST_HEAD(inode_unused);
+LIST_HEAD(inode_in_use);
+LIST_HEAD(inode_unused);
static struct list_head *inode_hashtable;
static LIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
* NOTE! You also have to own the lock if you change
* the i_state of an inode while it is in use..
*/
-static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
/*
* Statistics gathering..
inode->i_data.a_ops = &empty_aops;
inode->i_data.host = inode;
inode->i_data.gfp_mask = GFP_HIGHUSER;
+ inode->i_data.dirtied_when = 0;
inode->i_mapping = &inode->i_data;
inode->i_data.ra_pages = &default_ra_pages;
if (sb->s_bdev)
INIT_LIST_HEAD(&inode->i_data.clean_pages);
INIT_LIST_HEAD(&inode->i_data.dirty_pages);
INIT_LIST_HEAD(&inode->i_data.locked_pages);
+ INIT_LIST_HEAD(&inode->i_data.io_pages);
INIT_LIST_HEAD(&inode->i_dentry);
INIT_LIST_HEAD(&inode->i_dirty_buffers);
INIT_LIST_HEAD(&inode->i_dirty_data_buffers);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
rwlock_init(&inode->i_data.page_lock);
spin_lock_init(&inode->i_data.i_shared_lock);
+ spin_lock_init(&inode->i_bufferlist_lock);
INIT_LIST_HEAD(&inode->i_data.i_mmap);
INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
}
inode_init_once(inode);
}
-/*
- * Put the inode on the super block's dirty list.
- *
- * CAREFUL! We mark it dirty unconditionally, but
- * move it onto the dirty list only if it is hashed.
- * If it was not hashed, it will never be added to
- * the dirty list even if it is later hashed, as it
- * will have been marked dirty already.
- *
- * In short, make sure you hash any inodes _before_
- * you start marking them dirty..
- */
-
-/**
- * __mark_inode_dirty - internal function
- * @inode: inode to mark
- * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- * Mark an inode as dirty. Callers should use mark_inode_dirty or
- * mark_inode_dirty_sync.
- */
-
-void __mark_inode_dirty(struct inode *inode, int flags)
-{
- struct super_block * sb = inode->i_sb;
-
- if (!sb)
- return;
-
- /* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */
- if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
- if (sb->s_op && sb->s_op->dirty_inode)
- sb->s_op->dirty_inode(inode);
- }
-
- /* avoid the locking if we can */
- if ((inode->i_state & flags) == flags)
- return;
-
- spin_lock(&inode_lock);
- if ((inode->i_state & flags) != flags) {
- inode->i_state |= flags;
- /* Only add valid (ie hashed) inodes to the dirty list */
- if (!(inode->i_state & I_LOCK) && !list_empty(&inode->i_hash)) {
- list_del(&inode->i_list);
- list_add(&inode->i_list, &sb->s_dirty);
- }
- }
- spin_unlock(&inode_lock);
-}
-
-static void __wait_on_inode(struct inode * inode)
+void __wait_on_inode(struct inode * inode)
{
DECLARE_WAITQUEUE(wait, current);
current->state = TASK_RUNNING;
}
-static inline void wait_on_inode(struct inode *inode)
-{
- if (inode->i_state & I_LOCK)
- __wait_on_inode(inode);
-}
-
-
-static inline void write_inode(struct inode *inode, int sync)
-{
- if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
- inode->i_sb->s_op->write_inode(inode, sync);
-}
-
-static inline void __iget(struct inode * inode)
+/*
+ * inode_lock must be held
+ */
+void __iget(struct inode * inode)
{
if (atomic_read(&inode->i_count)) {
atomic_inc(&inode->i_count);
inodes_stat.nr_unused--;
}
-static inline void __sync_one(struct inode *inode, int sync)
-{
- unsigned dirty;
-
- list_del(&inode->i_list);
- list_add(&inode->i_list, &inode->i_sb->s_locked_inodes);
-
- if (inode->i_state & I_LOCK)
- BUG();
-
- /* Set I_LOCK, reset I_DIRTY */
- dirty = inode->i_state & I_DIRTY;
- inode->i_state |= I_LOCK;
- inode->i_state &= ~I_DIRTY;
- spin_unlock(&inode_lock);
-
- filemap_fdatasync(inode->i_mapping);
-
- /* Don't write the inode if only I_DIRTY_PAGES was set */
- if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
- write_inode(inode, sync);
-
- filemap_fdatawait(inode->i_mapping);
-
- spin_lock(&inode_lock);
- inode->i_state &= ~I_LOCK;
- if (!(inode->i_state & I_FREEING)) {
- struct list_head *to;
- if (inode->i_state & I_DIRTY)
- to = &inode->i_sb->s_dirty;
- else if (atomic_read(&inode->i_count))
- to = &inode_in_use;
- else
- to = &inode_unused;
- list_del(&inode->i_list);
- list_add(&inode->i_list, to);
- }
- wake_up(&inode->i_wait);
-}
-
-static inline void sync_one(struct inode *inode, int sync)
-{
- while (inode->i_state & I_LOCK) {
- __iget(inode);
- spin_unlock(&inode_lock);
- __wait_on_inode(inode);
- iput(inode);
- spin_lock(&inode_lock);
- }
-
- __sync_one(inode, sync);
-}
-
-static inline void sync_list(struct list_head *head)
-{
- struct list_head * tmp;
-
- while ((tmp = head->prev) != head)
- __sync_one(list_entry(tmp, struct inode, i_list), 0);
-}
-
-static inline void wait_on_locked(struct list_head *head)
-{
- struct list_head * tmp;
- while ((tmp = head->prev) != head) {
- struct inode *inode = list_entry(tmp, struct inode, i_list);
- __iget(inode);
- spin_unlock(&inode_lock);
- __wait_on_inode(inode);
- iput(inode);
- spin_lock(&inode_lock);
- }
-}
-
-static inline int try_to_sync_unused_list(struct list_head *head, int nr_inodes)
-{
- struct list_head *tmp = head;
- struct inode *inode;
-
- while (nr_inodes && (tmp = tmp->prev) != head) {
- inode = list_entry(tmp, struct inode, i_list);
-
- if (!atomic_read(&inode->i_count)) {
- __sync_one(inode, 0);
- nr_inodes--;
-
- /*
- * __sync_one moved the inode to another list,
- * so we have to start looking from the list head.
- */
- tmp = head;
- }
- }
-
- return nr_inodes;
-}
-
-void sync_inodes_sb(struct super_block *sb)
-{
- spin_lock(&inode_lock);
- while (!list_empty(&sb->s_dirty)||!list_empty(&sb->s_locked_inodes)) {
- sync_list(&sb->s_dirty);
- wait_on_locked(&sb->s_locked_inodes);
- }
- spin_unlock(&inode_lock);
-}
-
-/*
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until both ->s_dirty and ->s_locked_inodes are
- * empty. Since __sync_one() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
- */
-
-void sync_unlocked_inodes(void)
-{
- struct super_block * sb;
- spin_lock(&inode_lock);
- spin_lock(&sb_lock);
- sb = sb_entry(super_blocks.next);
- for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
- if (!list_empty(&sb->s_dirty)) {
- spin_unlock(&sb_lock);
- sync_list(&sb->s_dirty);
- spin_lock(&sb_lock);
- }
- }
- spin_unlock(&sb_lock);
- spin_unlock(&inode_lock);
-}
-
-/*
- * Find a superblock with inodes that need to be synced
- */
-
-static struct super_block *get_super_to_sync(void)
-{
- struct list_head *p;
-restart:
- spin_lock(&inode_lock);
- spin_lock(&sb_lock);
- list_for_each(p, &super_blocks) {
- struct super_block *s = list_entry(p,struct super_block,s_list);
- if (list_empty(&s->s_dirty) && list_empty(&s->s_locked_inodes))
- continue;
- s->s_count++;
- spin_unlock(&sb_lock);
- spin_unlock(&inode_lock);
- down_read(&s->s_umount);
- if (!s->s_root) {
- drop_super(s);
- goto restart;
- }
- return s;
- }
- spin_unlock(&sb_lock);
- spin_unlock(&inode_lock);
- return NULL;
-}
-
-/**
- * sync_inodes
- * @dev: device to sync the inodes from.
- *
- * sync_inodes goes through the super block's dirty list,
- * writes them out, and puts them back on the normal list.
- */
-
-void sync_inodes(void)
-{
- struct super_block * s;
- /*
- * Search the super_blocks array for the device(s) to sync.
- */
- while ((s = get_super_to_sync()) != NULL) {
- sync_inodes_sb(s);
- drop_super(s);
- }
-}
-
-static void try_to_sync_unused_inodes(unsigned long pexclusive)
-{
- struct super_block * sb;
- int nr_inodes = inodes_stat.nr_unused;
-
- spin_lock(&inode_lock);
- spin_lock(&sb_lock);
- sb = sb_entry(super_blocks.next);
- for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
- if (list_empty(&sb->s_dirty))
- continue;
- spin_unlock(&sb_lock);
- nr_inodes = try_to_sync_unused_list(&sb->s_dirty, nr_inodes);
- spin_lock(&sb_lock);
- }
- spin_unlock(&sb_lock);
- spin_unlock(&inode_lock);
- clear_bit(0, (unsigned long *)pexclusive);
-}
-
-/**
- * write_inode_now - write an inode to disk
- * @inode: inode to write to disk
- * @sync: whether the write should be synchronous or not
- *
- * This function commits an inode to disk immediately if it is
- * dirty. This is primarily needed by knfsd.
- */
-
-void write_inode_now(struct inode *inode, int sync)
-{
- struct super_block * sb = inode->i_sb;
-
- if (sb) {
- spin_lock(&inode_lock);
- sync_one(inode, sync);
- spin_unlock(&inode_lock);
- if (sync)
- wait_on_inode(inode);
- }
- else
- printk(KERN_ERR "write_inode_now: no super block\n");
-}
-
-/**
- * generic_osync_inode - flush all dirty data for a given inode to disk
- * @inode: inode to write
- * @datasync: if set, don't bother flushing timestamps
- *
- * This can be called by file_write functions for files which have the
- * O_SYNC flag set, to flush dirty writes to disk.
- */
-
-int generic_osync_inode(struct inode *inode, int what)
-{
- int err = 0, err2 = 0, need_write_inode_now = 0;
-
- /*
- * WARNING
- *
- * Currently, the filesystem write path does not pass the
- * filp down to the low-level write functions. Therefore it
- * is impossible for (say) __block_commit_write to know if
- * the operation is O_SYNC or not.
- *
- * Ideally, O_SYNC writes would have the filesystem call
- * ll_rw_block as it went to kick-start the writes, and we
- * could call osync_inode_buffers() here to wait only for
- * those IOs which have already been submitted to the device
- * driver layer. As it stands, if we did this we'd not write
- * anything to disk since our writes have not been queued by
- * this point: they are still on the dirty LRU.
- *
- * So, currently we will call fsync_inode_buffers() instead,
- * to flush _all_ dirty buffers for this inode to disk on
- * every O_SYNC write, not just the synchronous I/Os. --sct
- */
-
- if (what & OSYNC_METADATA)
- err = fsync_inode_buffers(inode);
- if (what & OSYNC_DATA)
- err2 = fsync_inode_data_buffers(inode);
- if (!err)
- err = err2;
-
- spin_lock(&inode_lock);
- if ((inode->i_state & I_DIRTY) &&
- ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
- need_write_inode_now = 1;
- spin_unlock(&inode_lock);
-
- if (need_write_inode_now)
- write_inode_now(inode, 1);
- else
- wait_on_inode(inode);
-
- return err;
-}
-
/**
* clear_inode - clear an inode
* @inode: inode to clear
static unsigned long exclusive;
if (!test_and_set_bit(0, &exclusive)) {
- if (pdflush_operation(try_to_sync_unused_inodes,
+ if (pdflush_operation(try_to_writeback_unused_inodes,
(unsigned long)&exclusive))
clear_bit(0, &exclusive);
}
__journal_remove_checkpoint(jh);
__journal_remove_journal_head(bh);
BUFFER_TRACE(bh, "release");
- /* BUF_LOCKED -> BUF_CLEAN (fwiw) */
- refile_buffer(bh);
__brelse(bh);
ret = 1;
}
/*
* We used to test for (jh->b_list != BUF_CLEAN) here.
* But unmap_underlying_metadata() can place buffer onto
- * BUF_CLEAN. Since refile_buffer() no longer takes buffers
- * off checkpoint lists, we cope with it here
+ * BUF_CLEAN.
*/
/*
* AKPM: I think the buffer_jdirty test is redundant - it
BUFFER_TRACE(bh, "remove from checkpoint");
__journal_remove_checkpoint(jh);
__journal_remove_journal_head(bh);
- refile_buffer(bh);
__brelse(bh);
ret = 1;
}
__journal_unfile_buffer(jh);
jh->b_transaction = NULL;
__journal_remove_journal_head(bh);
- refile_buffer(bh);
__brelse(bh);
}
}
jh->b_transaction = NULL;
__journal_remove_journal_head(bh);
BUFFER_TRACE(bh, "finished async writeout: refile");
- /* It can sometimes be on BUF_LOCKED due to migration
- * from syncdata to asyncdata */
- if (bh->b_list != BUF_CLEAN)
- refile_buffer(bh);
__brelse(bh);
}
}
struct buffer_head *bh = wbuf[i];
set_bit(BH_Lock, &bh->b_state);
clear_bit(BH_Dirty, &bh->b_state);
+ mark_buffer_uptodate(bh, 1);
bh->b_end_io = journal_end_buffer_io_sync;
submit_bh(WRITE, bh);
}
JBUFFER_TRACE(descriptor, "write commit block");
{
struct buffer_head *bh = jh2bh(descriptor);
+ mark_buffer_uptodate(bh, 1);
ll_rw_block(WRITE, 1, &bh);
wait_on_buffer(bh);
__brelse(bh); /* One for getblk() */
__journal_unfile_buffer(jh);
jh->b_transaction = NULL;
__journal_remove_journal_head(bh);
- refile_buffer(bh);
__brelse(bh);
goto restart;
}
}
} while (!new_bh);
/* keep subsequent assertions sane */
- new_bh->b_prev_free = 0;
- new_bh->b_next_free = 0;
new_bh->b_state = 0;
init_buffer(new_bh, NULL, NULL);
atomic_set(&new_bh->b_count, 1);
* buffer_head? If so, we'd better make sure we clear the
* revoked status on any hashed alias too, otherwise the revoke
* state machine will get very upset later on. */
- if (need_cancel && !bh->b_pprev) {
+ if (need_cancel) {
struct buffer_head *bh2;
bh2 = __get_hash_table(bh->b_bdev, bh->b_blocknr, bh->b_size);
if (bh2) {
- clear_bit(BH_Revoked, &bh2->b_state);
+ if (bh2 != bh)
+ clear_bit(BH_Revoked, &bh2->b_state);
__brelse(bh2);
}
}
{
struct buffer_head *bh = jh2bh(descriptor);
BUFFER_TRACE(bh, "write");
+ mark_buffer_uptodate(bh, 1);
ll_rw_block (WRITE, 1, &bh);
}
}
JBUFFER_TRACE(jh, "file as BJ_Reserved");
__journal_file_buffer(jh, transaction, BJ_Reserved);
- /* And pull it off BUF_DIRTY, onto BUF_CLEAN */
- refile_buffer(jh2bh(jh));
-
/*
* The buffer is now hidden from bdflush. It is
* metadata against the current transaction.
jh->b_transaction = transaction;
JBUFFER_TRACE(jh, "file as BJ_Reserved");
__journal_file_buffer(jh, transaction, BJ_Reserved);
- JBUFFER_TRACE(jh, "refile");
- refile_buffer(jh2bh(jh));
} else if (jh->b_transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "set next transaction");
jh->b_next_transaction = transaction;
spin_lock(&journal_datalist_lock);
set_bit(BH_JBDDirty, &bh->b_state);
- set_buffer_flushtime(bh);
J_ASSERT_JH(jh, jh->b_transaction != NULL);
out:
ret = 0;
if (call_ttfb)
- ret = try_to_free_buffers(page, gfp_mask);
+ ret = try_to_free_buffers(page);
return ret;
}
if (buffer_dirty(bh))
mark_buffer_clean(bh);
J_ASSERT_BH(bh, !buffer_jdirty(bh));
- clear_bit(BH_Uptodate, &bh->b_state);
+// clear_bit(BH_Uptodate, &bh->b_state);
clear_bit(BH_Mapped, &bh->b_state);
clear_bit(BH_Req, &bh->b_state);
clear_bit(BH_New, &bh->b_state);
unlock_journal(journal);
if (!offset) {
- if (!may_free || !try_to_free_buffers(page, 0))
+ if (!may_free || !try_to_free_buffers(page))
return 0;
J_ASSERT(!page_has_buffers(page));
}
if (jh->b_transaction != NULL) {
__journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
- } else {
- /* Onto BUF_DIRTY for writeback */
- refile_buffer(jh2bh(jh));
}
}
blocksize = 1 << blocksize_bits;
if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize);
+ create_empty_buffers(page, blocksize, 0);
bh = head = page_buffers(page);
if (!bh)
return -ENOMEM;
blocksize_bits = vol->sb->s_blocksize_bits;
if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize);
+ create_empty_buffers(page, blocksize, 0);
bh = head = page_buffers(page);
if (!bh)
return -ENOMEM;
blocksize = 1 << blocksize_bits;
if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize);
+ create_empty_buffers(page, blocksize, 0);
bh = head = page_buffers(page);
if (!bh)
return -ENOMEM;
is interrupting do_balance */
#endif
+/*
+ * AKPM: The __mark_buffer_dirty() call here will not
+ * put the buffer on the dirty buffer LRU because we've just
+ * set BH_Dirty. That's a thinko in reiserfs.
+ *
+ * I'm reluctant to "fix" this bug because that would change
+ * behaviour. Using mark_buffer_dirty() here would make the
+ * buffer eligible for VM and periodic writeback, which may
+ * violate ordering constraints. So I preserve the old behaviour
+ * by simply commenting out the __mark_buffer_dirty call.
+ *
+ * Chris says this code has "probably never been run" anyway.
+ * It is due to go away.
+ */
inline void do_balance_mark_leaf_dirty (struct tree_balance * tb,
struct buffer_head * bh, int flag)
{
if (reiserfs_dont_log(tb->tb_sb)) {
if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
- __mark_buffer_dirty(bh) ;
+// __mark_buffer_dirty(bh) ;
tb->need_balance_dirty = 1;
}
} else {
static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) {
struct list_head *list = &(SB_JOURNAL(inode->i_sb)->j_dirty_buffers) ;
- buffer_insert_list(bh, list) ;
+ buffer_insert_list(NULL, bh, list) ;
}
//
/* mark it dirty now to prevent commit_write from adding
** this buffer to the inode's dirty buffer list
*/
- __mark_buffer_dirty(unbh) ;
+ /*
+ * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
+ * It's still atomic, but it sets the page dirty too,
+ * which makes it eligible for writeback at any time by the
+ * VM (which was also the case with __mark_buffer_dirty())
+ */
+ mark_buffer_dirty(unbh) ;
//inode->i_blocks += inode->i_sb->s_blocksize / 512;
//mark_tail_converted (inode);
** more details.
*/
static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
- if (bh) {
- clear_bit(BH_Dirty, &bh->b_state) ;
- refile_buffer(bh) ;
- }
+ if (bh)
+ mark_buffer_clean(bh);
return 0 ;
}
if (!buffer_uptodate(cn->bh)) {
reiserfs_panic(s, "journal-949: buffer write failed\n") ;
}
- refile_buffer(cn->bh) ;
brelse(cn->bh) ;
}
cn = cn->next ;
SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ;
/* write any buffers that must hit disk before this commit is done */
- fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
+ fsync_buffers_list(NULL, &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
/* honor the flush and async wishes from the caller */
if (flush) {
static void sprintf_buffer_head (char * buf, struct buffer_head * bh)
{
- sprintf (buf, "dev %s, size %d, blocknr %ld, count %d, list %d, state 0x%lx, page %p, (%s, %s, %s)",
- bdevname (bh->b_bdev), bh->b_size, bh->b_blocknr, atomic_read (&(bh->b_count)), bh->b_list,
+ sprintf (buf, "dev %s, size %d, blocknr %ld, count %d, state 0x%lx, page %p, (%s, %s, %s)",
+ bdevname (bh->b_bdev), bh->b_size, bh->b_blocknr,
+ atomic_read (&(bh->b_count)),
bh->b_state, bh->b_page,
buffer_uptodate (bh) ? "UPTODATE" : "!UPTODATE",
buffer_dirty (bh) ? "DIRTY" : "CLEAN",
#define MS_MOVE 8192
#define MS_REC 16384
#define MS_VERBOSE 32768
+#define MS_FLUSHING (1<<16) /* inodes are currently under writeout */
#define MS_ACTIVE (1<<30)
#define MS_NOUSER (1<<31)
#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || ((inode)->i_flags & S_SYNC))
#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK)
+#define IS_FLUSHING(inode) __IS_FLG(inode, MS_FLUSHING)
#define IS_QUOTAINIT(inode) ((inode)->i_flags & S_QUOTA)
#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
BH_Dirty, /* 1 if the buffer is dirty */
BH_Lock, /* 1 if the buffer is locked */
BH_Req, /* 0 if the buffer has been invalidated */
+
BH_Mapped, /* 1 if the buffer has a disk mapping */
BH_New, /* 1 if the buffer is new and not yet written out */
BH_Async, /* 1 if the buffer is under end_buffer_io_async I/O */
- BH_Wait_IO, /* 1 if we should write out this buffer */
- BH_launder, /* 1 if we should throttle on this buffer */
BH_JBD, /* 1 if it has an attached journal_head */
BH_PrivateStart,/* not a state bit, but the first bit available
*/
struct buffer_head {
/* First cache line: */
- struct buffer_head *b_next; /* Hash queue list */
sector_t b_blocknr; /* block number */
unsigned short b_size; /* block size */
- unsigned short b_list; /* List that this buffer appears */
struct block_device *b_bdev;
atomic_t b_count; /* users using this block */
unsigned long b_state; /* buffer state bitmap (see above) */
- unsigned long b_flushtime; /* Time when (dirty) buffer should be written */
-
- struct buffer_head *b_next_free;/* lru/free list linkage */
- struct buffer_head *b_prev_free;/* doubly linked list of buffers */
struct buffer_head *b_this_page;/* circular list of buffers in one page */
- struct buffer_head **b_pprev; /* doubly linked list of hash-queue */
- char * b_data; /* pointer to data block */
struct page *b_page; /* the page this bh is mapped to */
+
+ char * b_data; /* pointer to data block */
void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
void *b_private; /* reserved for b_end_io */
int (*writepage)(struct page *);
int (*readpage)(struct file *, struct page *);
int (*sync_page)(struct page *);
+
+ /* Write back some dirty pages from this mapping. */
+ int (*writeback_mapping)(struct address_space *, int *nr_to_write);
+
+ /* Perform a writeback as a memory-freeing operation. */
+ int (*vm_writeback)(struct page *, int *nr_to_write);
+
+ /* Set a page dirty */
+ int (*set_page_dirty)(struct page *page);
+
/*
* ext3 requires that a successful prepare_write() call be followed
* by a commit_write() call - they must be balanced
struct list_head clean_pages; /* list of clean pages */
struct list_head dirty_pages; /* list of dirty pages */
struct list_head locked_pages; /* list of locked pages */
+ struct list_head io_pages; /* being prepared for I/O */
unsigned long nrpages; /* number of total pages */
struct address_space_operations *a_ops; /* methods */
struct inode *host; /* owner: inode, block_device */
list_t i_mmap; /* list of private mappings */
list_t i_mmap_shared; /* list of private mappings */
spinlock_t i_shared_lock; /* and spinlock protecting it */
+ unsigned long dirtied_when; /* jiffies of first page dirtying */
int gfp_mask; /* how to allocate the pages */
unsigned long *ra_pages; /* device readahead */
};
struct list_head i_hash;
struct list_head i_list;
struct list_head i_dentry;
-
- struct list_head i_dirty_buffers;
+
+ struct list_head i_dirty_buffers; /* uses i_bufferlist_lock */
struct list_head i_dirty_data_buffers;
+ spinlock_t i_bufferlist_lock;
unsigned long i_ino;
atomic_t i_count;
struct list_head s_list; /* Keep this first */
kdev_t s_dev;
unsigned long s_blocksize;
- unsigned char s_blocksize_bits;
unsigned long s_old_blocksize;
+ unsigned short s_writeback_gen;/* To avoid writeback livelock */
+ unsigned char s_blocksize_bits;
unsigned char s_dirt;
unsigned long long s_maxbytes; /* Max file size */
struct file_system_type *s_type;
int (*show_options)(struct seq_file *, struct vfsmount *);
};
-/* Inode state bits.. */
+/* Inode state bits. Protected by inode_lock. */
#define I_DIRTY_SYNC 1 /* Not dirty enough for O_DATASYNC */
#define I_DIRTY_DATASYNC 2 /* Data-related inode changes pending */
#define I_DIRTY_PAGES 4 /* Data-related inode changes pending */
__mark_inode_dirty(inode, I_DIRTY_SYNC);
}
-static inline void mark_inode_dirty_pages(struct inode *inode)
-{
- __mark_inode_dirty(inode, I_DIRTY_PAGES);
-}
-
struct dquot_operations {
void (*initialize) (struct inode *, short);
void (*drop) (struct inode *);
extern int fs_may_remount_ro(struct super_block *);
-extern int try_to_free_buffers(struct page *, unsigned int);
-extern void refile_buffer(struct buffer_head * buf);
-extern void create_empty_buffers(struct page *, unsigned long);
+extern int try_to_free_buffers(struct page *);
+extern void create_empty_buffers(struct page *, unsigned long,
+ unsigned long b_state);
extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
/* reiserfs_writepage needs this */
extern void set_buffer_async_io(struct buffer_head *bh) ;
-#define BUF_CLEAN 0
-#define BUF_LOCKED 1 /* Buffers scheduled for write */
-#define BUF_DIRTY 2 /* Dirty buffers, not yet scheduled for write */
-#define NR_LIST 3
-
static inline void get_bh(struct buffer_head * bh)
{
atomic_inc(&(bh)->b_count);
#define atomic_set_buffer_clean(bh) test_and_clear_bit(BH_Dirty, &(bh)->b_state)
-static inline void __mark_buffer_clean(struct buffer_head *bh)
-{
- refile_buffer(bh);
-}
-
static inline void mark_buffer_clean(struct buffer_head * bh)
{
- if (atomic_set_buffer_clean(bh))
- __mark_buffer_clean(bh);
+ clear_bit(BH_Dirty, &(bh)->b_state);
}
-extern void FASTCALL(__mark_dirty(struct buffer_head *bh));
-extern void FASTCALL(__mark_buffer_dirty(struct buffer_head *bh));
extern void FASTCALL(mark_buffer_dirty(struct buffer_head *bh));
-extern void FASTCALL(buffer_insert_list(struct buffer_head *, struct list_head *));
+extern void buffer_insert_list(spinlock_t *lock,
+ struct buffer_head *, struct list_head *);
-static inline void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
+static inline void
+buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
{
- buffer_insert_list(bh, &inode->i_dirty_buffers);
+ buffer_insert_list(&inode->i_bufferlist_lock,
+ bh, &inode->i_dirty_buffers);
}
-static inline void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
+
+static inline void
+buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
{
- buffer_insert_list(bh, &inode->i_dirty_data_buffers);
+ buffer_insert_list(&inode->i_bufferlist_lock,
+ bh, &inode->i_dirty_data_buffers);
}
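
A minimal sketch (editorial, not part of the patch) of how a filesystem might use the data-queue variant above: dirty an ordinary data buffer and attach it to the inode's data list so that a later fsync_inode_data_buffers() call will find and write it. The helper name is invented; mark_buffer_dirty() and buffer_insert_inode_data_queue() are the interfaces declared in this header.

static inline void example_dirty_data_buffer(struct buffer_head *bh,
					     struct inode *inode)
{
	mark_buffer_dirty(bh);		/* sets BH_Dirty and dirties the page */
	buffer_insert_inode_data_queue(bh, inode);	/* found by fsync of file data */
}
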
#define atomic_set_buffer_dirty(bh) test_and_set_bit(BH_Dirty, &(bh)->b_state)
buffer_insert_inode_queue(bh, inode);
}
-extern void set_buffer_flushtime(struct buffer_head *);
-extern void balance_dirty(void);
extern int check_disk_change(kdev_t);
extern int invalidate_inodes(struct super_block *);
extern int invalidate_device(kdev_t, int);
#define destroy_buffers(dev) __invalidate_buffers((dev), 1)
extern void invalidate_bdev(struct block_device *, int);
extern void __invalidate_buffers(kdev_t dev, int);
-extern void sync_inodes(void);
-extern void sync_unlocked_inodes(void);
extern void write_inode_now(struct inode *, int);
extern int sync_buffers(struct block_device *, int);
extern int fsync_dev(kdev_t);
extern int fsync_super(struct super_block *);
extern int fsync_no_super(struct block_device *);
extern void sync_inodes_sb(struct super_block *);
-extern int osync_buffers_list(struct list_head *);
-extern int fsync_buffers_list(struct list_head *);
+extern int fsync_buffers_list(spinlock_t *lock, struct list_head *);
static inline int fsync_inode_buffers(struct inode *inode)
{
- return fsync_buffers_list(&inode->i_dirty_buffers);
+ return fsync_buffers_list(&inode->i_bufferlist_lock,
+ &inode->i_dirty_buffers);
}
static inline int fsync_inode_data_buffers(struct inode *inode)
{
- return fsync_buffers_list(&inode->i_dirty_data_buffers);
+ return fsync_buffers_list(&inode->i_bufferlist_lock,
+ &inode->i_dirty_data_buffers);
}
extern int inode_has_buffers(struct inode *);
extern int filemap_fdatasync(struct address_space *);
return iget4(sb, ino, NULL, NULL);
}
+extern void __iget(struct inode * inode);
extern void clear_inode(struct inode *);
extern struct inode *new_inode(struct super_block *);
extern void remove_suid(struct dentry *);
bh->b_bdev = sb->s_bdev;
bh->b_blocknr = block;
}
+
extern void wakeup_bdflush(void);
extern void put_unused_buffer_head(struct buffer_head * bh);
extern struct buffer_head * get_unused_buffer_head(int async);
/* Generic buffer handling for block filesystems.. */
extern int try_to_release_page(struct page * page, int gfp_mask);
-extern int discard_bh_page(struct page *, unsigned long, int);
-#define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
-#define block_invalidate_page(page) discard_bh_page(page, 0, 0)
+extern int block_flushpage(struct page *page, unsigned long offset);
extern int block_symlink(struct inode *, const char *, int);
extern int block_write_full_page(struct page*, get_block_t*);
extern int block_read_full_page(struct page*, get_block_t*);
extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
extern int generic_file_open(struct inode * inode, struct file * filp);
+extern int generic_vm_writeback(struct page *page, int *nr_to_write);
+
extern struct file_operations generic_ro_fops;
extern int vfs_readlink(struct dentry *, char *, int, const char *);
return res;
}
+void __buffer_error(char *file, int line);
+#define buffer_error() __buffer_error(__FILE__, __LINE__)
+
#endif /* __KERNEL__ */
#endif /* _LINUX_FS_H */
#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
-extern void FASTCALL(set_page_dirty(struct page *));
-
/*
* Error return values for the *_nopage functions
*/
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+int __set_page_dirty_buffers(struct page *page);
+int __set_page_dirty_nobuffers(struct page *page);
+
+/*
+ * If the mapping doesn't provide a set_page_dirty a_op, then
+ * just fall through and assume that it wants buffer_heads.
+ * FIXME: make the method unconditional.
+ */
+static inline int set_page_dirty(struct page *page)
+{
+ if (page->mapping) {
+ int (*spd)(struct page *);
+
+ spd = page->mapping->a_ops->set_page_dirty;
+ if (spd)
+ return (*spd)(page);
+ }
+ return __set_page_dirty_buffers(page);
+}
+
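
As a hedged illustration (not part of the patch; the helper and its name are hypothetical, and kmap(), memset() and PAGE_CACHE_SIZE are assumed to be available in the caller's context), a caller which has modified the contents of a pagecache page goes through set_page_dirty(), which dispatches to the mapping's ->set_page_dirty method and otherwise falls back to __set_page_dirty_buffers():

static inline void example_zero_tail_and_dirty(struct page *page,
					       unsigned offset)
{
	char *kaddr = kmap(page);

	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
	kunmap(page);
	set_page_dirty(page);	/* ->set_page_dirty, or the buffer_head fallback */
}
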
/*
* On a two-level page table, this ends up being trivial. Thus the
* inlining and the symmetry break with pte_alloc_map() that does all
extern int filemap_sync(struct vm_area_struct *, unsigned long, size_t, unsigned int);
extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int);
+/* mm/page-writeback.c */
+int generic_writeback_mapping(struct address_space *mapping, int *nr_to_write);
+
/* readahead.c */
#define VM_MAX_READAHEAD 128 /* kbytes */
#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
-extern int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
-extern int pdflush_flush(unsigned long nr_pages);
-
extern struct page * vmalloc_to_page(void *addr);
extern unsigned long get_page_cache_size(void);
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */
#define PF_FREE_PAGES 0x00002000 /* per process page freeing */
-#define PF_NOIO 0x00004000 /* avoid generating further I/O */
-#define PF_FLUSHER 0x00008000 /* responsible for disk writeback */
+#define PF_FLUSHER 0x00004000 /* responsible for disk writeback */
/*
* Ptrace flags
extern unsigned int nr_free_pages(void);
extern unsigned int nr_free_buffer_pages(void);
+extern unsigned int nr_free_pagecache_pages(void);
extern int nr_active_pages;
extern int nr_inactive_pages;
extern atomic_t nr_async_pages;
VM_SWAPCTL=1, /* struct: Set vm swapping control */
VM_SWAPOUT=2, /* int: Linear or sqrt() swapout for hogs */
VM_FREEPG=3, /* struct: Set free page thresholds */
- VM_BDFLUSH=4, /* struct: Control buffer cache flushing */
+ VM_BDFLUSH_UNUSED=4, /* Spare */
VM_OVERCOMMIT_MEMORY=5, /* Turn off the virtual memory safety limit */
VM_BUFFERMEM=6, /* struct: Set buffer memory thresholds */
VM_PAGECACHE=7, /* struct: Set cache memory thresholds */
--- /dev/null
+/*
+ * include/linux/writeback.h.
+ *
+ * These declarations are private to fs/ and mm/.
+ * Declarations which are exported to filesystems do not
+ * get placed here.
+ */
+#ifndef WRITEBACK_H
+#define WRITEBACK_H
+
+extern spinlock_t inode_lock;
+extern struct list_head inode_in_use;
+extern struct list_head inode_unused;
+
+/*
+ * fs/fs-writeback.c
+ */
+#define WB_SYNC_NONE 0 /* Don't wait on anything */
+#define WB_SYNC_LAST 1 /* Wait on the last-written mapping */
+#define WB_SYNC_ALL 2 /* Wait on every mapping */
+
+void try_to_writeback_unused_inodes(unsigned long pexclusive);
+void writeback_single_inode(struct inode *inode,
+ int sync, int *nr_to_write);
+void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
+ unsigned long *older_than_this);
+void writeback_inodes_sb(struct super_block *);
+void __wait_on_inode(struct inode * inode);
+void sync_inodes(void);
+
+static inline void wait_on_inode(struct inode *inode)
+{
+ if (inode->i_state & I_LOCK)
+ __wait_on_inode(inode);
+}
+
+/*
+ * mm/page-writeback.c
+ */
+/*
+ * How much data to write out at a time in various places. This isn't
+ * really very important - it's just here to prevent any thread from
+ * locking an inode for too long and blocking other threads which wish
+ * to write the same file, for allocation-throttling purposes.
+ */
+#define WRITEOUT_PAGES ((4096 * 1024) / PAGE_CACHE_SIZE)
+
+void balance_dirty_pages(struct address_space *mapping);
+void balance_dirty_pages_ratelimited(struct address_space *mapping);
+int pdflush_flush(unsigned long nr_pages);
+int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
+
+#endif /* WRITEBACK_H */
fork_init(mempages);
proc_caches_init();
vfs_caches_init(mempages);
- buffer_init(mempages);
radix_tree_init();
#if defined(CONFIG_ARCH_S390)
ccwcache_init();
EXPORT_SYMBOL(mark_buffer_dirty);
EXPORT_SYMBOL(end_buffer_io_sync);
EXPORT_SYMBOL(set_buffer_async_io);
-EXPORT_SYMBOL(__mark_buffer_dirty);
EXPORT_SYMBOL(__mark_inode_dirty);
EXPORT_SYMBOL(get_empty_filp);
EXPORT_SYMBOL(init_private_file);
EXPORT_SYMBOL(__wait_on_buffer);
EXPORT_SYMBOL(___wait_on_page);
EXPORT_SYMBOL(generic_direct_IO);
-EXPORT_SYMBOL(discard_bh_page);
EXPORT_SYMBOL(block_write_full_page);
EXPORT_SYMBOL(block_read_full_page);
EXPORT_SYMBOL(block_prepare_write);
EXPORT_SYMBOL(read_dev_sector);
EXPORT_SYMBOL(tq_disk);
EXPORT_SYMBOL(init_buffer);
-EXPORT_SYMBOL(refile_buffer);
EXPORT_SYMBOL(wipe_partitions);
/* tty routines */
/* External variables not in a header file. */
extern int panic_timeout;
extern int C_A_D;
-extern int bdf_prm[], bdflush_min[], bdflush_max[];
extern int sysctl_overcommit_memory;
extern int max_threads;
extern atomic_t nr_queued_signals;
};
static ctl_table vm_table[] = {
- {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL,
- &proc_dointvec_minmax, &sysctl_intvec, NULL,
- &bdflush_min, &bdflush_max},
{VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
{VM_PAGERDAEMON, "kswapd",
O_TARGET := mm.o
-export-objs := shmem.o filemap.o mempool.o page_alloc.o
+export-objs := shmem.o filemap.o mempool.o page_alloc.o \
+ page-writeback.o
obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
- pdflush.o
+ pdflush.o page-writeback.o
include $(TOPDIR)/Rules.make
*/
#include <linux/module.h>
#include <linux/slab.h>
-#include <linux/shm.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
#include <linux/mman.h>
-#include <linux/locks.h>
#include <linux/pagemap.h>
-#include <linux/swap.h>
-#include <linux/smp_lock.h>
-#include <linux/blkdev.h>
#include <linux/file.h>
-#include <linux/swapctl.h>
-#include <linux/init.h>
-#include <linux/mm.h>
#include <linux/iobuf.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
#include <linux/hash.h>
-#include <linux/blkdev.h>
+#include <linux/writeback.h>
-#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mman.h>
-#include <linux/highmem.h>
-
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
/*
* Lock ordering:
- * pagemap_lru_lock ==> page_lock ==> i_shared_lock
+ *
+ * pagemap_lru_lock
+ * ->i_shared_lock (vmtruncate)
+ * ->i_bufferlist_lock (__free_pte->__set_page_dirty_buffers)
+ * ->unused_list_lock (try_to_free_buffers)
+ * ->mapping->page_lock
+ * ->inode_lock (__mark_inode_dirty)
+ * ->sb_lock (fs/fs-writeback.c)
*/
spinlock_t pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-#define CLUSTER_PAGES (1 << page_cluster)
-#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
-
/*
* Remove a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
return 0;
}
-/*
- * Add a page to the dirty page list.
- */
-void set_page_dirty(struct page *page)
-{
- if (!TestSetPageDirty(page)) {
- struct address_space *mapping = page->mapping;
-
- if (mapping) {
- write_lock(&mapping->page_lock);
- list_del(&page->list);
- list_add(&page->list, &mapping->dirty_pages);
- write_unlock(&mapping->page_lock);
-
- if (mapping->host)
- mark_inode_dirty_pages(mapping->host);
- }
- }
-}
-
/**
* invalidate_inode_pages - Invalidate all the unlocked pages of one inode
* @inode: the inode which pages we want to invalidate
/* Leave it on the LRU if it gets converted into anonymous buffers */
if (!PagePrivate(page) || do_flushpage(page, 0))
lru_cache_del(page);
-
- /*
- * We remove the page from the page cache _after_ we have
- * destroyed all buffer-cache references to it. Otherwise some
- * other process might think this inode page is not in the
- * page cache and creates a buffer-cache alias to it causing
- * all sorts of fun problems ...
- */
ClearPageDirty(page);
ClearPageUptodate(page);
remove_inode_page(page);
page_cache_release(page);
}
+/*
+ * Writeback walks the page list in ->prev order, which is low-to-high file
+ * offsets in the common case where the file was written linearly. So truncate
+ * walks the page list in the opposite (->next) direction, to avoid getting
+ * into lockstep with writeback's cursor and to prune as many pages as possible
+ * before the truncate cursor collides with the writeback cursor.
+ */
static int truncate_list_pages(struct address_space *mapping,
struct list_head *head, unsigned long start, unsigned *partial)
{
int unlocked = 0;
restart:
- curr = head->prev;
+ curr = head->next;
while (curr != head) {
unsigned long offset;
list_del(head);
if (!failed)
/* Restart after this page */
- list_add_tail(head, curr);
+ list_add(head, curr);
else
/* Restart on this page */
- list_add(head, curr);
+ list_add_tail(head, curr);
write_unlock(&mapping->page_lock);
unlocked = 1;
write_lock(&mapping->page_lock);
goto restart;
}
- curr = curr->prev;
+ curr = curr->next;
}
return unlocked;
}
write_lock(&mapping->page_lock);
do {
- unlocked = truncate_list_pages(mapping,
- &mapping->clean_pages, start, &partial);
+ unlocked = truncate_list_pages(mapping,
+ &mapping->io_pages, start, &partial);
unlocked |= truncate_list_pages(mapping,
&mapping->dirty_pages, start, &partial);
+ unlocked |= truncate_list_pages(mapping,
+ &mapping->clean_pages, start, &partial);
unlocked |= truncate_list_pages(mapping,
&mapping->locked_pages, start, &partial);
} while (unlocked);
/*
* The page is locked and we hold the mapping lock as well
* so both page_count(page) and page_buffers stays constant here.
+ * AKPM: fixme: No global lock any more. Is this still OK?
*/
if (page_count(page) == 1 + !!page_has_buffers(page)) {
/* Restart after this page */
page_cache_get(page);
write_unlock(&mapping->page_lock);
- block_invalidate_page(page);
+ block_flushpage(page, 0);
} else
unlocked = 0;
&mapping->clean_pages);
unlocked |= invalidate_list_pages2(mapping,
&mapping->dirty_pages);
+ unlocked |= invalidate_list_pages2(mapping,
+ &mapping->io_pages);
unlocked |= invalidate_list_pages2(mapping,
&mapping->locked_pages);
} while (unlocked);
/* writeout dirty buffers on pages from both clean and dirty lists */
retval = do_buffer_fdatasync(mapping, &mapping->dirty_pages,
start_idx, end_idx, writeout_one_page);
+ retval |= do_buffer_fdatasync(mapping, &mapping->io_pages,
+ start_idx, end_idx, writeout_one_page);
retval |= do_buffer_fdatasync(mapping, &mapping->clean_pages,
start_idx, end_idx, writeout_one_page);
retval |= do_buffer_fdatasync(mapping, &mapping->locked_pages,
/* now wait for locked buffers on pages from both clean and dirty lists */
retval |= do_buffer_fdatasync(mapping, &mapping->dirty_pages,
start_idx, end_idx, waitfor_one_page);
+ retval |= do_buffer_fdatasync(mapping, &mapping->io_pages,
+ start_idx, end_idx, waitfor_one_page);
retval |= do_buffer_fdatasync(mapping, &mapping->clean_pages,
start_idx, end_idx, waitfor_one_page);
retval |= do_buffer_fdatasync(mapping, &mapping->locked_pages,
EXPORT_SYMBOL(fail_writepage);
/**
- * filemap_fdatasync - walk the list of dirty pages of the given address space
- * and writepage() all of them.
- *
- * @mapping: address space structure to write
+ * filemap_fdatasync - walk the list of dirty pages of the given address space
+ * and writepage() all of them.
+ *
+ * @mapping: address space structure to write
*
*/
-int filemap_fdatasync(struct address_space * mapping)
+int filemap_fdatasync(struct address_space *mapping)
{
- int ret = 0;
- int (*writepage)(struct page *) = mapping->a_ops->writepage;
-
- write_lock(&mapping->page_lock);
-
- while (!list_empty(&mapping->dirty_pages)) {
- struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
-
- list_del(&page->list);
- list_add(&page->list, &mapping->locked_pages);
-
- if (!PageDirty(page))
- continue;
-
- page_cache_get(page);
- write_unlock(&mapping->page_lock);
-
- lock_page(page);
-
- if (PageDirty(page)) {
- int err;
- ClearPageDirty(page);
- err = writepage(page);
- if (err && !ret)
- ret = err;
- } else
- UnlockPage(page);
-
- page_cache_release(page);
- write_lock(&mapping->page_lock);
- }
- write_unlock(&mapping->page_lock);
- return ret;
+ if (mapping->a_ops->writeback_mapping)
+ return mapping->a_ops->writeback_mapping(mapping, NULL);
+ return generic_writeback_mapping(mapping, NULL);
}
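
For context, a sketch (not part of the patch) of the usual caller pattern: filemap_fdatasync() starts writeback of everything that is dirty (a NULL nr_to_write means "no page budget"), and the existing filemap_fdatawait() then waits for the resulting I/O to complete. The function name below is invented and the error handling is simplified.

static int example_sync_mapping(struct address_space *mapping)
{
	int err = filemap_fdatasync(mapping);	/* start writeback */
	int err2 = filemap_fdatawait(mapping);	/* wait for completion */

	return err ? err : err2;
}
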
/**
if (status < 0)
break;
+ balance_dirty_pages_ratelimited(mapping);
} while (count);
done:
*ppos = pos;
--- /dev/null
+/*
+ * mm/page-writeback.c.
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * Contains functions related to writing back dirty pages at the
+ * address_space level.
+ *
+ * 10Apr2002 akpm@zip.com.au
+ * Initial version
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/init.h>
+#include <linux/sysrq.h>
+
+/*
+ * Memory thresholds, in percentages
+ * FIXME: expose these via /proc or whatever.
+ */
+
+/*
+ * Start background writeback (via pdflush) at this level
+ */
+static int dirty_background_ratio = 30;
+
+/*
+ * The generator of dirty data starts async writeback at this level
+ */
+static int dirty_async_ratio = 45;
+
+/*
+ * The generator of dirty data performs sync writeout at this level
+ */
+static int dirty_sync_ratio = 60;
+
+/*
+ * balance_dirty_pages() must be called by processes which are
+ * generating dirty data. It looks at the number of dirty pages
+ * in the machine and either:
+ *
+ * - Starts background writeback or
+ * - Causes the caller to perform async writeback or
+ * - Causes the caller to perform synchronous writeback, then
+ * tells a pdflush thread to perform more writeback or
+ * - Does nothing at all.
+ *
+ * balance_dirty_pages() can sleep.
+ */
+void balance_dirty_pages(struct address_space *mapping)
+{
+ const int tot = nr_free_pagecache_pages();
+ struct page_state ps;
+ int background_thresh;
+ int async_thresh;
+ int sync_thresh;
+ int wake_pdflush = 0;
+ unsigned long dirty_and_locked;
+
+ get_page_state(&ps);
+ dirty_and_locked = ps.nr_dirty + ps.nr_locked;
+
+ background_thresh = (dirty_background_ratio * tot) / 100;
+ async_thresh = (dirty_async_ratio * tot) / 100;
+ sync_thresh = (dirty_sync_ratio * tot) / 100;
+
+ if (dirty_and_locked > sync_thresh) {
+ int nr_to_write = dirty_and_locked - async_thresh;
+
+ writeback_unlocked_inodes(&nr_to_write, WB_SYNC_LAST, NULL);
+ wake_pdflush = 1;
+ } else if (dirty_and_locked > async_thresh) {
+ int nr_to_write = dirty_and_locked - async_thresh;
+
+ writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
+ } else if (dirty_and_locked > background_thresh) {
+ wake_pdflush = 1;
+ }
+
+ if (wake_pdflush && !IS_FLUSHING(mapping->host)) {
+ /*
+ * There is no flush thread against this device. Start one now.
+ */
+ get_page_state(&ps);
+ if (ps.nr_dirty > 0) {
+ pdflush_flush(ps.nr_dirty);
+ yield();
+ }
+ }
+}
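
To make the thresholds concrete (illustrative numbers only): if nr_free_pagecache_pages() returns 100000 pages, the background/async/sync cutoffs work out to 30000, 45000 and 60000 pages. A writer which sees 50000 dirty+locked pages takes the async branch and is asked to write back 50000 - 45000 = 5000 pages itself; at 65000 pages it would write back 20000 pages in WB_SYNC_LAST mode and then wake pdflush.
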
+
+/*
+ * Front-end to balance_dirty_pages - just to make sure it's not called
+ * too often.
+ */
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+ static struct rate_limit_struct {
+ int count;
+ } ____cacheline_aligned ratelimits[NR_CPUS];
+ int cpu;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+ if (ratelimits[cpu].count++ >= 32) {
+ ratelimits[cpu].count = 0;
+ preempt_enable();
+ balance_dirty_pages(mapping);
+ return;
+ }
+ preempt_enable();
+}
+
+/*
+ * Here are some applications of the pdflush thread pool
+ */
+
+/*
+ * Start heavy writeback of everything. This is the analogue of the old
+ * wakeup_bdflush(). Returns zero if a thread was successfully launched.
+ *
+ * Is passed in the number of pages to write.
+ *
+ * We yield, to allow page allocators to perform their I/O against large files.
+ */
+
+static void pdflush_bdflush(unsigned long arg)
+{
+ int nr_pages = arg;
+
+ CHECK_EMERGENCY_SYNC
+
+ while (nr_pages) {
+ int nr_to_write = WRITEOUT_PAGES;
+
+ if (nr_to_write > nr_pages)
+ nr_to_write = nr_pages;
+ nr_pages -= nr_to_write;
+ writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
+ yield();
+ }
+ run_task_queue(&tq_disk);
+}
+
+int pdflush_flush(unsigned long nr_pages)
+{
+ return pdflush_operation(pdflush_bdflush, nr_pages);
+}
+
+/*
+ * The interval between `kupdate'-style writebacks.
+ *
+ * Traditional kupdate writes back data which is 30-35 seconds old.
+ * This one does that, but it also writes back just 1/6th of the dirty
+ * data. This is to avoid great I/O storms.
+ *
+ * We chunk the writes up and yield, to permit any throttled page-allocators
+ * to perform their I/O against a large file.
+ */
+static int wb_writeback_jifs = 5 * HZ;
+
+/*
+ * Periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space. So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Spot the bug: at jiffies wraparound, the attempt to set the inode's dirtying
+ * time won't work, because zero means not-dirty. That's OK. The data will get
+ * written out later by the VM (at least).
+ *
+ * We also limit the number of pages which are written out, to avoid writing
+ * huge amounts of data against a single file, which would cause memory
+ * allocators to block for too long.
+ */
+static void wb_kupdate(unsigned long arg)
+{
+ unsigned long oldest_jif = jiffies - 30*HZ;
+ struct page_state ps;
+ int total_to_write;
+ int nr_to_write;
+
+ sync_supers();
+
+ get_page_state(&ps);
+
+ total_to_write = ps.nr_dirty / 6;
+ if (total_to_write < 16384) {
+ total_to_write = 16384;
+ if (total_to_write > ps.nr_dirty)
+ total_to_write = ps.nr_dirty;
+ }
+ while (total_to_write > 0) {
+ nr_to_write = total_to_write;
+ if (nr_to_write > WRITEOUT_PAGES)
+ nr_to_write = WRITEOUT_PAGES;
+ total_to_write -= nr_to_write;
+ writeback_unlocked_inodes(&nr_to_write,
+ WB_SYNC_NONE, &oldest_jif);
+ yield();
+ }
+ run_task_queue(&tq_disk);
+}
+
+/*
+ * The writeback timer, for kupdate-style functionality
+ */
+static struct timer_list wb_timer;
+
+static void wb_timer_fn(unsigned long unused)
+{
+ mod_timer(&wb_timer, jiffies + wb_writeback_jifs);
+ pdflush_operation(wb_kupdate, 0);
+}
+
+static int __init wb_timer_init(void)
+{
+ init_timer(&wb_timer);
+ wb_timer.expires = jiffies + wb_writeback_jifs;
+ wb_timer.data = 0;
+ wb_timer.function = wb_timer_fn;
+ add_timer(&wb_timer);
+ return 0;
+}
+module_init(wb_timer_init);
+
+/*
+ * FIXME: PG_launder gets cleared by accident.
+ */
+static int writeback_mapping(struct page *page, int *nr_to_write)
+{
+ struct inode *inode = page->mapping->host;
+
+ SetPageDirty(page);
+
+ /*
+ * We don't own this inode, so we don't want the address_space
+ * vanishing while writeback is walking the list
+ */
+ inode = igrab(inode);
+ unlock_page(page);
+
+ if (inode) {
+ writeback_single_inode(inode, 0, nr_to_write);
+
+ /*
+ * This iput() will internally call ext2_discard_prealloc(),
+ * which is rather bogus. But there is no other way of
+ * dropping our ref to the inode. However, there's no harm
+ * in dropping the prealloc, because there probably isn't any.
+ * Just a waste of cycles.
+ */
+ iput(inode);
+ }
+ return 0;
+}
+
+/*
+ * A library function, which implements the vm_writeback a_op. It's fairly
+ * lame at this time. The idea is: the VM wants to liberate this page,
+ * so we pass the page to the address_space and give the fs the opportunity
+ * to write out lots of pages around this one. It allows extent-based
+ * filesystems to do intelligent things. It lets delayed-allocate filesystems
+ * perform better file layout. It lets the address_space opportunistically
+ * write back disk-contiguous pages which are in other zones.
+ */
+int generic_vm_writeback(struct page *page, int *nr_to_write)
+{
+ return writeback_mapping(page, nr_to_write);
+}
+EXPORT_SYMBOL(generic_vm_writeback);
+
+/**
+ * generic_writeback_mapping - walk the list of dirty pages of the given
+ * address space and writepage() all of them.
+ *
+ * @mapping: address space structure to write
+ * @nr_to_write: subtract the number of written pages from *@nr_to_write
+ *
+ * This is a library function, which implements the writeback_mapping()
+ * address_space_operation for filesystems which are using multipage BIO
+ * writeback.
+ *
+ * We need to be careful to avoid deadlocks here. mpage_bio_writepage() does
+ * not immediately start I/O against each page. It waits until the bio is
+ * full, or until mpage_bio_flush() is called. So generic_writeback_mapping()
+ * is locking multiple pages without necessarily starting I/O against them.
+ *
+ * AB/BA deadlocks are avoided via locking implemented in the filesystem.
+ * Only one process ever has multiple locked pages against any mapping.
+ *
+ * FIXME: doing the locking in the fs is a bit grotty, but it allows us to
+ * not have to put a new semaphore in struct inode. The fs could
+ * pass its bio_write_state up here, I guess.
+ *
+ * Pages can be moved from clean_pages or locked_pages onto dirty_pages
+ * at any time - it's not possible to lock against that. So pages which
+ * have already been added to a BIO may magically reappear on the dirty_pages
+ * list. And generic_writeback_mapping() will again try to lock those pages.
+ * But I/O has not yet been started against the page. Thus deadlock.
+ *
+ * To avoid this, the entire contents of the dirty_pages list are moved
+ * onto io_pages up-front. We then walk io_pages, locking the
+ * pages and submitting them for I/O, moving them to locked_pages.
+ *
+ * This has the added benefit of preventing a livelock which would otherwise
+ * occur if pages are being dirtied faster than we can write them out.
+ *
+ * Thus generic_writeback_mapping() only makes the guarantee that all pages
+ * which were dirty at the time it was called will have I/O started against
+ * them. And it's not possible to make a stronger guarantee than that.
+ */
+int generic_writeback_mapping(struct address_space *mapping, int *nr_to_write)
+{
+ int ret = 0;
+ int done = 0;
+ int err;
+ int (*writepage)(struct page *) = mapping->a_ops->writepage;
+
+ write_lock(&mapping->page_lock);
+
+ list_splice(&mapping->dirty_pages, &mapping->io_pages);
+ INIT_LIST_HEAD(&mapping->dirty_pages);
+ mapping->dirtied_when = 0;
+
+ while (!list_empty(&mapping->io_pages) && !done) {
+ struct page *page = list_entry(mapping->io_pages.prev,
+ struct page, list);
+ list_del(&page->list);
+ list_add(&page->list, &mapping->locked_pages);
+ if (!PageDirty(page))
+ continue;
+
+ page_cache_get(page);
+ write_unlock(&mapping->page_lock);
+
+ lock_page(page);
+
+ if (TestClearPageDirty(page)) {
+ err = writepage(page);
+ if (!ret)
+ ret = err;
+ if (nr_to_write) {
+ --(*nr_to_write);
+ if (*nr_to_write <= 0)
+ done = 1;
+ }
+ } else
+ UnlockPage(page);
+
+ page_cache_release(page);
+ write_lock(&mapping->page_lock);
+ }
+ if (!list_empty(&mapping->io_pages)) {
+ /*
+ * Put the rest back, in the correct order.
+ */
+ list_splice(&mapping->io_pages, mapping->dirty_pages.prev);
+ INIT_LIST_HEAD(&mapping->io_pages);
+ }
+ write_unlock(&mapping->page_lock);
+ return ret;
+}
+EXPORT_SYMBOL(generic_writeback_mapping);
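
A small usage sketch (editorial, not part of the patch): callers may pass NULL for nr_to_write, as filemap_fdatasync() does above, or supply a page budget which is decremented as pages are written. The function name below is invented; WRITEOUT_PAGES comes from linux/writeback.h.

static int example_writeback_with_budget(struct address_space *mapping)
{
	int nr_to_write = WRITEOUT_PAGES;	/* roughly 4MB worth of pages */

	return generic_writeback_mapping(mapping, &nr_to_write);
}
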
+
+/*
+ * Add a page to the dirty page list.
+ *
+ * It is a sad fact of life that this function is called from several places
+ * deeply under spinlocking. It may not sleep.
+ *
+ * If the page has buffers, the uptodate buffers are set dirty, to preserve
+ * dirty-state coherency between the page and the buffers. If the page does
+ * not have buffers then when they are later attached they will all be set
+ * dirty.
+ *
+ * The buffers are dirtied before the page is dirtied. There's a small race
+ * window in which a writepage caller may see the page cleanness but not the
+ * buffer dirtiness. That's fine. If this code were to set the page dirty
+ * before the buffers, a concurrent writepage caller could clear the page dirty
+ * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
+ * page on the dirty page list.
+ *
+ * There is also a small window where the page is dirty but not yet on dirty_pages,
+ * and a possibility that by the time the page is added to dirty_pages it has
+ * been set clean. The page lists are somewhat approximate in this regard.
+ * It's better to have clean pages accidentally attached to dirty_pages than to
+ * leave dirty pages attached to clean_pages.
+ *
+ * We use i_bufferlist_lock to lock against try_to_free_buffers while using the
+ * page's buffer list. Also use this to protect against clean buffers being
+ * added to the page after it was set dirty.
+ *
+ * FIXME: may need to call ->reservepage here as well. That's rather up to the
+ * address_space though.
+ */
+int __set_page_dirty_buffers(struct page *page)
+{
+ int ret = 0;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode;
+
+ if (mapping == NULL) {
+ SetPageDirty(page);
+ goto out;
+ }
+
+ inode = mapping->host;
+
+ spin_lock(&inode->i_bufferlist_lock);
+
+ if (page_has_buffers(page)) {
+ struct buffer_head *head = page_buffers(page);
+ struct buffer_head *bh = head;
+
+ do {
+ if (buffer_uptodate(bh))
+ set_bit(BH_Dirty, &bh->b_state);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ }
+
+ if (!TestSetPageDirty(page)) {
+ write_lock(&mapping->page_lock);
+ list_del(&page->list);
+ list_add(&page->list, &mapping->dirty_pages);
+ write_unlock(&mapping->page_lock);
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ }
+
+ spin_unlock(&inode->i_bufferlist_lock);
+out:
+ return ret;
+}
+EXPORT_SYMBOL(__set_page_dirty_buffers);
+
+/*
+ * For address_spaces which do not use buffers. Just set the page's dirty bit
+ * and move it to the dirty_pages list. Also perform space reservation if
+ * required.
+ *
+ * __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page
+ * is still safe, as long as it actually manages to find some blocks at
+ * writeback time.
+ *
+ * This is also used when a single buffer is being dirtied: we want to set the
+ * page dirty in that case, but not all the buffers. This is a "bottom-up"
+ * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ */
+int __set_page_dirty_nobuffers(struct page *page)
+{
+ int ret = 0;
+
+ if (!TestSetPageDirty(page)) {
+ struct address_space *mapping = page->mapping;
+
+ if (mapping) {
+ write_lock(&mapping->page_lock);
+ list_del(&page->list);
+ list_add(&page->list, &mapping->dirty_pages);
+ write_unlock(&mapping->page_lock);
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__set_page_dirty_nobuffers);
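
As a hedged sketch of how the two dirtying variants are expected to be wired up (not part of the patch; "examplefs" is a made-up name and all other methods are elided): an address_space which never attaches buffer_heads to its pages points its set_page_dirty method at the nobuffers variant, while buffer-based filesystems can simply omit the method and let set_page_dirty() fall back to __set_page_dirty_buffers().

static struct address_space_operations examplefs_aops = {
	set_page_dirty:		__set_page_dirty_nobuffers,
	/* readpage, writepage, writeback_mapping, ... elided */
};
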
return sum;
}
+/*
+ * Amount of free RAM allocatable as pagecache memory:
+ */
+unsigned int nr_free_pagecache_pages(void)
+{
+ pg_data_t *pgdat = pgdat_list;
+ unsigned int sum = 0;
+
+ do {
+ zonelist_t *zonelist = pgdat->node_zonelists +
+ (GFP_HIGHUSER & GFP_ZONEMASK);
+ zone_t **zonep = zonelist->zones;
+ zone_t *zone;
+
+ for (zone = *zonep++; zone; zone = *zonep++) {
+ unsigned long size = zone->size;
+ unsigned long high = zone->pages_high;
+ if (size > high)
+ sum += size - high;
+ }
+
+ pgdat = pgdat->node_next;
+ } while (pgdat);
+
+ return sum;
+}
+
#if CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
*
* Thread pool management algorithm:
*
- * - The minumum and maximum number of pdflush instances are bound
+ * - The minimum and maximum number of pdflush instances are bound
* by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
*
* - If there have been no idle pdflush instances for 1 second, create
/*
* Of course, my_work wants to be just a local in __pdflush(). It is
* separated out in this manner to hopefully prevent the compiler from
- * performing unfortunate optimisations agains the auto variables. Because
- * there are visible to other tasks and CPUs. (No problem has actually
+ * performing unfortunate optimisations against the auto variables. Because
+ * these are visible to other tasks and CPUs. (No problem has actually
* been observed. This is just paranoia).
*/
static int pdflush(void *dummy)
sync_page: block_sync_page,
};
+/*
+ * swapper_inode is needed only for i_bufferlist_lock. This
+ * avoids special-casing in other parts of the kernel.
+ */
+static struct inode swapper_inode = {
+ i_bufferlist_lock: SPIN_LOCK_UNLOCKED,
+ i_mapping: &swapper_space,
+};
+
struct address_space swapper_space = {
page_tree: RADIX_TREE_INIT(GFP_ATOMIC),
page_lock: RW_LOCK_UNLOCKED,
clean_pages: LIST_HEAD_INIT(swapper_space.clean_pages),
dirty_pages: LIST_HEAD_INIT(swapper_space.dirty_pages),
+ io_pages: LIST_HEAD_INIT(swapper_space.io_pages),
locked_pages: LIST_HEAD_INIT(swapper_space.locked_pages),
+ host: &swapper_inode,
a_ops: &swap_aops,
i_shared_lock: SPIN_LOCK_UNLOCKED,
};
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>
+#include <linux/writeback.h>
#include <linux/compiler.h>
#include <asm/pgalloc.h>
mapping = page->mapping;
- if (PageDirty(page) && is_page_cache_freeable(page) && mapping) {
+ if (PageDirty(page) && is_page_cache_freeable(page) &&
+ page->mapping && (gfp_mask & __GFP_FS)) {
/*
* It is not critical here to write it only if
* the page is unmapped beause any direct writer
* pinned it and after the I/O to the page is finished,
* so the direct writes to the page cannot get lost.
*/
+ struct address_space_operations *a_ops;
+ int (*writeback)(struct page *, int *);
int (*writepage)(struct page *);
- writepage = mapping->a_ops->writepage;
- if ((gfp_mask & __GFP_FS) && writepage) {
+ /*
+ * There's no guarantee that writeback() will actually
+ * start I/O against *this* page. Which is broken if we're
+ * trying to free memory in a particular zone. FIXME.
+ */
+ a_ops = mapping->a_ops;
+ writeback = a_ops->vm_writeback;
+ writepage = a_ops->writepage;
+ if (writeback || writepage) {
ClearPageDirty(page);
SetPageLaunder(page);
page_cache_get(page);
spin_unlock(&pagemap_lru_lock);
- writepage(page);
+ if (writeback) {
+ int nr_to_write = WRITEOUT_PAGES;
+ writeback(page, &nr_to_write);
+ } else {
+ writepage(page);
+ }
page_cache_release(page);
spin_lock(&pagemap_lru_lock);