#include <linux/sysrq.h>
#include <linux/backing-dev.h>
+/*
+ * The maximum number of pages to write out in a single bdflush/kupdate
+ * operation. We do this so we don't hold I_LOCK against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode.
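+ * With 4KB pages this caps a single writeback chunk at 4MB.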
+ */
+#define MAX_WRITEBACK_PAGES 1024
+
/*
* Memory thresholds, in percentages
* FIXME: expose these via /proc or whatever.
*/
static int dirty_sync_ratio = 60;
+static void background_writeout(unsigned long unused);
+
/*
* balance_dirty_pages() must be called by processes which are
* generating dirty data. It looks at the number of dirty pages
* - Does nothing at all.
*
* balance_dirty_pages() can sleep.
+ *
+ * FIXME: WB_SYNC_LAST doesn't actually work. It waits on the last dirty
+ * inode on the superblock list. It should wait when nr_to_write is
+ * exhausted. Doesn't seem to matter.
*/
void balance_dirty_pages(struct address_space *mapping)
{
const int tot = nr_free_pagecache_pages();
struct page_state ps;
- int background_thresh;
- int async_thresh;
- int sync_thresh;
- int wake_pdflush = 0;
+ int background_thresh, async_thresh, sync_thresh;
unsigned long dirty_and_writeback;
get_page_state(&ps);
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_LAST, NULL);
get_page_state(&ps);
- dirty_and_writeback = ps.nr_dirty + ps.nr_writeback;
- wake_pdflush = 1;
} else if (dirty_and_writeback > async_thresh) {
int nr_to_write = 1500;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
- } else if (dirty_and_writeback > background_thresh) {
- wake_pdflush = 1;
+ get_page_state(&ps);
}
- if (wake_pdflush && !writeback_in_progress(mapping->backing_dev_info)) {
- if (dirty_and_writeback > async_thresh) {
- pdflush_flush(dirty_and_writeback - async_thresh);
- yield();
- }
- }
+ if (!writeback_in_progress(mapping->backing_dev_info) &&
+ ps.nr_dirty > background_thresh)
+ pdflush_operation(background_writeout, 0);
}
-/*
- * Front-end to balance_dirty_pages - just to make sure it's not called
- * too often.
+/**
+ * balance_dirty_pages_ratelimited - balance dirty memory state
+ * @mapping - address_space which was dirtied
+ *
+ * Processes which are dirtying memory should call in here once for each page
+ * which was newly dirtied. The function will periodically check the system's
+ * dirty state and will initiate writeback if needed.
+ *
+ * balance_dirty_pages_ratelimited() may sleep.
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
}
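/*
 * Editor's sketch, not part of this patch: the calling convention described
 * above.  A write path calls balance_dirty_pages_ratelimited() once for each
 * page it newly dirties.  example_dirty_pages() is a hypothetical name, not
 * existing kernel API.
 */
static void example_dirty_pages(struct address_space *mapping, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		/* ... dirty one page of this mapping via the usual path ... */
		balance_dirty_pages_ratelimited(mapping);
	}
}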
/*
- * Here are some applications of the pdflush thread pool
+ * Write back at least _min_pages, and keep writing until the amount of dirty
+ * memory is less than the background threshold, or until we're all clean.
*/
-
-/*
- * Start heavy writeback of everything. This is the analogue of the old
- * wakeup_bdflush(). Returns zero if a thread was successfully launched.
- *
- * Is passed in the number of pages to write.
- *
- * We yield, to allow page allocators to perform their I/O against large files.
- */
-
-static void pdflush_bdflush(unsigned long arg)
+static void background_writeout(unsigned long _min_pages)
{
- int nr_pages = arg;
-
- CHECK_EMERGENCY_SYNC
+ const int tot = nr_free_pagecache_pages();
+ const int background_thresh = (dirty_background_ratio * tot) / 100;
+ long min_pages = _min_pages;
+ int nr_to_write;
- while (nr_pages) {
- int nr_to_write = WRITEOUT_PAGES;
+ do {
+ struct page_state ps;
- if (nr_to_write > nr_pages)
- nr_to_write = nr_pages;
- nr_pages -= nr_to_write;
+ get_page_state(&ps);
+ if (ps.nr_dirty < background_thresh && min_pages <= 0)
+ break;
+ nr_to_write = MAX_WRITEBACK_PAGES;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
- yield();
- }
+ min_pages -= MAX_WRITEBACK_PAGES - nr_to_write;
+ } while (nr_to_write <= 0);
run_task_queue(&tq_disk);
}
-int pdflush_flush(unsigned long nr_pages)
+/*
+ * Start heavy writeback of everything.
+ */
+void wakeup_bdflush(void)
{
- return pdflush_operation(pdflush_bdflush, nr_pages);
+ struct page_state ps;
+
+ get_page_state(&ps);
+ pdflush_operation(background_writeout, ps.nr_dirty);
}
/*
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
- * We also limit the number of pages which are written out, to avoid writing
- * huge amounts of data against a single file, which would cause memory
- * allocators to block for too long.
+ * Try to run once per wb_writeback_jifs jiffies. But if a writeback event
+ * takes longer than a wb_writeback_jifs interval, then leave a one-second
+ * gap.
+ *
+ * older_than_this takes precedence over nr_to_write. So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
*/
static void wb_kupdate(unsigned long arg)
{
- unsigned long oldest_jif = jiffies - 30*HZ;
+ unsigned long oldest_jif;
+ unsigned long start_jif;
+ unsigned long next_jif;
struct page_state ps;
- int total_to_write;
int nr_to_write;
sync_supers();
-
get_page_state(&ps);
- total_to_write = ps.nr_dirty / 6;
- if (total_to_write < 16384) {
- total_to_write = 16384;
- if (total_to_write > ps.nr_dirty)
- total_to_write = ps.nr_dirty;
- }
- while (total_to_write > 0) {
- nr_to_write = total_to_write;
- if (nr_to_write > WRITEOUT_PAGES)
- nr_to_write = WRITEOUT_PAGES;
- total_to_write -= nr_to_write;
- writeback_unlocked_inodes(&nr_to_write,
- WB_SYNC_NONE, &oldest_jif);
- yield();
- }
+ oldest_jif = jiffies - 30*HZ;
+ start_jif = jiffies;
+ next_jif = start_jif + wb_writeback_jifs;
+ nr_to_write = ps.nr_dirty;
+ writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest_jif);
run_task_queue(&tq_disk);
- mod_timer(&wb_timer, jiffies + wb_writeback_jifs);
+ yield();
+
+ if (time_before(next_jif, jiffies + HZ))
+ next_jif = jiffies + HZ;
+ mod_timer(&wb_timer, next_jif);
}
static void wb_timer_fn(unsigned long unused)
{
- pdflush_operation(wb_kupdate, 0);
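+ /* Couldn't get a pdflush worker (presumably all busy): retry in a second */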
+ if (pdflush_operation(wb_kupdate, 0) < 0)
+ mod_timer(&wb_timer, jiffies + HZ);
}
static int __init wb_timer_init(void)
module_init(wb_timer_init);
/*
- * FIXME: PG_launder gets cleared by accident.
+ * A library function, which implements the vm_writeback a_op. It's fairly
+ * lame at this time. The idea is: the VM wants to liberate this page,
+ * so we pass the page to the address_space and give the fs the opportunity
+ * to write out lots of pages around this one. It allows extent-based
+ * filesystems to do intelligent things. It lets delayed-allocate filesystems
+ * perform better file layout. It lets the address_space opportunistically
+ * write back disk-contiguous pages which are in other zones.
+ *
+ * FIXME: the VM wants to start I/O against *this* page, because its zone
+ * is under pressure. But this function may start writeout against a
+ * totally different set of pages. Unlikely to be a huge problem, but if it
+ * is, we could just writepage the page if it is still (PageDirty &&
+ * !PageWriteback) (See below).
+ *
+ * Another option is to just reposition page->mapping->dirty_pages so we
+ * *know* that the page will be written. That will work fine, but seems
+ * unpleasant. (If the page is not for-sure on ->dirty_pages we're dead).
+ * Plus it assumes that the address_space is performing writeback in
+ * ->dirty_pages order.
+ *
+ * So. The proper fix is to leave the page locked-and-dirty and to pass
+ * it all the way down.
*/
-static int writeback_mapping(struct page *page, int *nr_to_write)
+int generic_vm_writeback(struct page *page, int *nr_to_write)
{
struct inode *inode = page->mapping->host;
- SetPageDirty(page);
-
/*
- * We don't own this inode, so we don't want the address_space
- * vanishing while writeback is walking the list
+ * We don't own this inode, and we don't want the address_space
+ * vanishing while writeback is walking its pages.
*/
inode = igrab(inode);
unlock_page(page);
if (inode) {
- writeback_single_inode(inode, 0, nr_to_write);
+ writeback_mapping(inode->i_mapping, nr_to_write);
/*
* This iput() will internally call ext2_discard_prealloc(),
* Just a waste of cycles.
*/
iput(inode);
+#if 0
+ if (!PageWriteback(page) && PageDirty(page)) {
+ lock_page(page);
+ if (!PageWriteback(page) && TestClearPageDirty(page))
+ page->mapping->a_ops->writepage(page);
+ else
+ unlock_page(page);
+ }
+#endif
}
return 0;
}
-
-/*
- * A library function, which implements the vm_writeback a_op. It's fairly
- * lame at this time. The idea is: the VM wants to liberate this page,
- * so we pass the page to the address_space and give the fs the opportunity
- * to write out lots of pages around this one. It allows extent-based
- * filesytems to do intelligent things. It lets delayed-allocate filesystems
- * perform better file layout. It lets the address_space opportunistically
- * write back disk-contiguous pages which are in other zones.
- */
-int generic_vm_writeback(struct page *page, int *nr_to_write)
-{
- return writeback_mapping(page, nr_to_write);
-}
EXPORT_SYMBOL(generic_vm_writeback);
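/*
 * Editor's sketch, not part of this patch: wiring the library helpers into a
 * filesystem's address_space_operations.  "examplefs" is hypothetical; the
 * vm_writeback and writeback_mapping slots are the a_ops which
 * generic_vm_writeback() and generic_writeback_mapping() implement.
 */
static struct address_space_operations examplefs_aops = {
	/* ... readpage, writepage and friends as usual ... */
	.vm_writeback		= generic_vm_writeback,
	.writeback_mapping	= generic_writeback_mapping,
};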
/**
* @nr_to_write: subtract the number of written pages from *@nr_to_write
*
* This is a library function, which implements the writeback_mapping()
- * address_space_operation for filesystems which are using multipage BIO
- * writeback.
+ * address_space_operation.
*
* (The next two paragraphs refer to code which isn't here yet, but they
* explain the presence of address_space.io_pages)
*/
int generic_writeback_mapping(struct address_space *mapping, int *nr_to_write)
{
+ int (*writepage)(struct page *) = mapping->a_ops->writepage;
int ret = 0;
int done = 0;
int err;
- int (*writepage)(struct page *) = mapping->a_ops->writepage;
write_lock(&mapping->page_lock);
continue;
}
list_add(&page->list, &mapping->locked_pages);
-
page_cache_get(page);
write_unlock(&mapping->page_lock);
-
lock_page(page);
- if (TestClearPageDirty(page)) {
+ /* It may have been removed from swapcache: check ->mapping */
+ if (page->mapping && TestClearPageDirty(page) &&
+ !PageWriteback(page)) {
+ /* FIXME: batch this up */
+ if (!PageActive(page) && PageLRU(page)) {
+ spin_lock(&pagemap_lru_lock);
+ if (!PageActive(page) && PageLRU(page)) {
+ list_del(&page->lru);
+ list_add(&page->lru, &inactive_list);
+ }
+ spin_unlock(&pagemap_lru_lock);
+ }
if (current->flags & PF_MEMALLOC)
SetPageLaunder(page);
err = writepage(page);
if (!ret)
ret = err;
- if (nr_to_write) {
- --(*nr_to_write);
- if (*nr_to_write <= 0)
- done = 1;
- }
+ if (nr_to_write && --(*nr_to_write) <= 0)
+ done = 1;
} else {
unlock_page(page);
}
}
EXPORT_SYMBOL(generic_writeback_mapping);
+int writeback_mapping(struct address_space *mapping, int *nr_to_write)
+{
+ if (mapping->a_ops->writeback_mapping)
+ return mapping->a_ops->writeback_mapping(mapping, nr_to_write);
+ return generic_writeback_mapping(mapping, nr_to_write);
+}
+
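/*
 * Editor's sketch, not part of this patch: calling the writeback_mapping()
 * dispatcher above with a page budget.  The budget value is arbitrary;
 * *nr_to_write is decremented as pages are written, so whatever remains
 * tells the caller how far under budget the walk finished.
 */
static void example_flush_mapping(struct address_space *mapping)
{
	int nr_to_write = MAX_WRITEBACK_PAGES;	/* arbitrary budget */

	writeback_mapping(mapping, &nr_to_write);
	run_task_queue(&tq_disk);		/* start the queued I/O */
}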
/**
* write_one_page - write out a single page and optionally wait on I/O
*
* @page - the page to write
* @wait - if true, wait on writeout
*
- * The page must be locked by the caller and will come unlocked when I/O
- * completes.
+ * The page must be locked by the caller and will be unlocked upon return.
*
* write_one_page() returns a negative error code if I/O failed.
*/