[PATCH] fs-writeback rework.

author Andrew Morton <akpm@digeo.com>

Sat, 14 Dec 2002 11:17:52 +0000 (03:17 -0800)

committer Jaroslav Kysela <perex@suse.cz>

Sat, 14 Dec 2002 11:17:52 +0000 (03:17 -0800)
author Andrew Morton <akpm@digeo.com>
Sat, 14 Dec 2002 11:17:52 +0000 (03:17 -0800)
committer Jaroslav Kysela <perex@suse.cz>
Sat, 14 Dec 2002 11:17:52 +0000 (03:17 -0800)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking

index fa86d6a18ad1d0aa9a93d7021aabc920b6ea5972..66d727bb9ce770ebddfb6888672fe5f6a4095fa2 100644 (file)
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -211,6 +211,9 @@ written.  The address_space implementation may write more (or less) pages
  than *nr_to_write asks for, but it should try to be reasonably close.  If
  nr_to_write is NULL, all dirty pages must be written.
  
+writepages should _only_ write pages which are present on
+mapping->io_pages.
+
         ->set_page_dirty() is called from various places in the kernel
  when the target page is marked as needing writeback.  It may be called
  under spinlock (it cannot block) and is sometimes called with the page
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c

index 33a59010c5df90c19ca05145a0bd36c4b26a23b4..326c2963ca3d45d6323a54dcc108fba4d81c9392 100644 (file)
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -67,14 +67,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
  
         spin_lock(&inode_lock);
         if ((inode->i_state & flags) != flags) {
-               const int was_dirty = inode->i_state & I_DIRTY;
                 struct address_space *mapping = inode->i_mapping;
  
                 inode->i_state |= flags;
  
-               if (!was_dirty)
-                       mapping->dirtied_when = jiffies;
-
                 /*
                  * If the inode is locked, just update its dirty state. 
                  * The unlocker will place the inode on the appropriate
@@ -84,18 +80,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                         goto out;
  
                 /*
-                * Only add valid (hashed) inode to the superblock's
+                * Only add valid (hashed) inodes to the superblock's
                  * dirty list.  Add blockdev inodes as well.
                  */
                 if (list_empty(&inode->i_hash) && !S_ISBLK(inode->i_mode))
                         goto out;
  
                 /*
-                * If the inode was already on s_dirty, don't reposition
-                * it (that would break s_dirty time-ordering).
+                * If the inode was already on s_dirty or s_io, don't
+                * reposition it (that would break s_dirty time-ordering).
                  */
-               if (!was_dirty)
+               if (!mapping->dirtied_when) {
+                       mapping->dirtied_when = jiffies|1; /* 0 is special */
                         list_move(&inode->i_list, &sb->s_dirty);
+               }
         }
  out:
         spin_unlock(&inode_lock);
@@ -110,19 +108,17 @@ static void write_inode(struct inode *inode, int sync)
  
  /*
   * Write a single inode's dirty pages and inode data out to disk.
- * If `sync' is set, wait on the writeout.
- * Subtract the number of written pages from nr_to_write.
+ * If `wait' is set, wait on the writeout.
   *
- * Normally it is not legal for a single process to lock more than one
- * page at a time, due to ab/ba deadlock problems.  But writepages()
- * does want to lock a large number of pages, without immediately submitting
- * I/O against them (starting I/O is a "deferred unlock_page").
+ * The whole writeout design is quite complex and fragile.  We want to avoid
+ * starvation of particular inodes when others are being redirtied, prevent
+ * livelocks, etc.
   *
- * However it *is* legal to lock multiple pages, if this is only ever performed
- * by a single process.  We provide that exclusion via locking in the
- * filesystem's ->writepages a_op. This ensures that only a single
- * process is locking multiple pages against this inode.  And as I/O is
- * submitted against all those locked pages, there is no deadlock.
+ * So what we do is to move all pages which are to be written from dirty_pages
+ * onto io_pages.  And keep on writing io_pages until it's empty.  Refusing to
+ * move more pages onto io_pages until io_pages is empty.  Once that point has
+ * been reached, we are ready to take another pass across the inode's dirty
+ * pages.
   *
   * Called under inode_lock.
   */
@@ -131,7 +127,6 @@ __sync_single_inode(struct inode *inode, int wait,
                         struct writeback_control *wbc)
  {
         unsigned dirty;
-       unsigned long orig_dirtied_when;
         struct address_space *mapping = inode->i_mapping;
         struct super_block *sb = inode->i_sb;
  
@@ -141,8 +136,11 @@ __sync_single_inode(struct inode *inode, int wait,
         dirty = inode->i_state & I_DIRTY;
         inode->i_state |= I_LOCK;
         inode->i_state &= ~I_DIRTY;
-       orig_dirtied_when = mapping->dirtied_when;
-       mapping->dirtied_when = 0;      /* assume it's whole-file writeback */
+
+       write_lock(&mapping->page_lock);
+       if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages))
+               list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
+       write_unlock(&mapping->page_lock);
         spin_unlock(&inode_lock);
  
         do_writepages(mapping, wbc);
@@ -155,24 +153,26 @@ __sync_single_inode(struct inode *inode, int wait,
                 filemap_fdatawait(mapping);
  
         spin_lock(&inode_lock);
-
         inode->i_state &= ~I_LOCK;
         if (!(inode->i_state & I_FREEING)) {
-               list_del(&inode->i_list);
-               if (inode->i_state & I_DIRTY) {         /* Redirtied */
-                       list_add(&inode->i_list, &sb->s_dirty);
+               if (!list_empty(&mapping->io_pages)) {
+                       /* Needs more writeback */
+                       inode->i_state |= I_DIRTY_PAGES;
+               } else if (!list_empty(&mapping->dirty_pages)) {
+                       /* Redirtied */
+                       inode->i_state |= I_DIRTY_PAGES;
+                       mapping->dirtied_when = jiffies|1;
+                       list_move(&inode->i_list, &sb->s_dirty);
+               } else if (inode->i_state & I_DIRTY) {
+                       /* Redirtied */
+                       mapping->dirtied_when = jiffies|1;
+                       list_move(&inode->i_list, &sb->s_dirty);
+               } else if (atomic_read(&inode->i_count)) {
+                       mapping->dirtied_when = 0;
+                       list_move(&inode->i_list, &inode_in_use);
                 } else {
-                       if (!list_empty(&mapping->dirty_pages) ||
-                                       !list_empty(&mapping->io_pages)) {
-                               /* Not a whole-file writeback */
-                               mapping->dirtied_when = orig_dirtied_when;
-                               inode->i_state |= I_DIRTY_PAGES;
-                               list_add_tail(&inode->i_list, &sb->s_dirty);
-                       } else if (atomic_read(&inode->i_count)) {
-                               list_add(&inode->i_list, &inode_in_use);
-                       } else {
-                               list_add(&inode->i_list, &inode_unused);
-                       }
+                       mapping->dirtied_when = 0;
+                       list_move(&inode->i_list, &inode_unused);
                 }
         }
         wake_up_inode(inode);
@@ -185,8 +185,10 @@ static void
  __writeback_single_inode(struct inode *inode, int sync,
                         struct writeback_control *wbc)
  {
-       if (current_is_pdflush() && (inode->i_state & I_LOCK))
+       if (current_is_pdflush() && (inode->i_state & I_LOCK)) {
+               list_move(&inode->i_list, &inode->i_sb->s_dirty);
                 return;
+       }
  
         while (inode->i_state & I_LOCK) {
                 __iget(inode);
@@ -233,7 +235,9 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
  {
         const unsigned long start = jiffies;    /* livelock avoidance */
  
-       list_splice_init(&sb->s_dirty, &sb->s_io);
+       if (!wbc->for_kupdate || list_empty(&sb->s_io))
+               list_splice_init(&sb->s_dirty, &sb->s_io);
+
         while (!list_empty(&sb->s_io)) {
                 struct inode *inode = list_entry(sb->s_io.prev,
                                                 struct inode, i_list);
@@ -275,7 +279,6 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
                 really_sync = (wbc->sync_mode == WB_SYNC_ALL);
                 BUG_ON(inode->i_state & I_FREEING);
                 __iget(inode);
-               list_move(&inode->i_list, &sb->s_dirty);
                 __writeback_single_inode(inode, really_sync, wbc);
                 if (wbc->sync_mode == WB_SYNC_HOLD) {
                         mapping->dirtied_when = jiffies;
diff --git a/fs/mpage.c b/fs/mpage.c

index 59cf471aae632a45a923ca499ce55987a1bbb074..7f3043c7ee9054b76b754e97e57e776de829ddb6 100644 (file)
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -525,12 +525,12 @@ out:
   * Pages can be moved from clean_pages or locked_pages onto dirty_pages
   * at any time - it's not possible to lock against that.  So pages which
   * have already been added to a BIO may magically reappear on the dirty_pages
- * list.  And generic_writepages() will again try to lock those pages.
+ * list.  And mpage_writepages() will again try to lock those pages.
   * But I/O has not yet been started against the page.  Thus deadlock.
   *
- * To avoid this, the entire contents of the dirty_pages list are moved
- * onto io_pages up-front.  We then walk io_pages, locking the
- * pages and submitting them for I/O, moving them to locked_pages.
+ * To avoid this, mpage_writepages() will only write pages from io_pages. The
+ * caller must place them there.  We walk io_pages, locking the pages and
+ * submitting them for I/O, moving them to locked_pages.
   *
   * This has the added benefit of preventing a livelock which would otherwise
   * occur if pages are being dirtied faster than we can write them out.
@@ -539,8 +539,8 @@ out:
   * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
   * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
   * and msync() need to guarentee that all the data which was dirty at the time
- * the call was made get new I/O started against them.  The way to do this is
- * to run filemap_fdatawait() before calling filemap_fdatawrite().
+ * the call was made get new I/O started against them.  So if called_for_sync()
+ * is true, we must wait for existing IO to complete.
   *
   * It's fairly rare for PageWriteback pages to be on ->dirty_pages.  It
   * means that someone redirtied the page while it was under I/O.
@@ -570,10 +570,7 @@ mpage_writepages(struct address_space *mapping,
  
         pagevec_init(&pvec, 0);
         write_lock(&mapping->page_lock);
-
-       list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
-
-        while (!list_empty(&mapping->io_pages) && !done) {
+       while (!list_empty(&mapping->io_pages) && !done) {
                 struct page *page = list_entry(mapping->io_pages.prev,
                                         struct page, list);
                 list_del(&page->list);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h

index 42569bc0bb612a90ac4fda73b315017079c113a6..351e5851c04167dc5d7536e1d271fb65c3afc7c2 100644 (file)
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -41,6 +41,7 @@ struct writeback_control {
                                            this for each page written */
         int nonblocking;                /* Don't get stuck on request queues */
         int encountered_congestion;     /* An output: a queue is full */
+       int for_kupdate;                /* A kupdate writeback */
  };
  
  /*
diff --git a/mm/filemap.c b/mm/filemap.c

index 04b94af71ccff36f78d4adb2d4d937f4f85b6b16..50b05fe9a2e0748ea81a4eab46cec2e2425b5eec 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -62,6 +62,7 @@
   *          ->mapping->page_lock
   *  ->inode_lock
   *    ->sb_lock                        (fs/fs-writeback.c)
+ *    ->mapping->page_lock     (__sync_single_inode)
   *  ->page_table_lock
   *    ->swap_device_lock       (try_to_unmap_one)
   *    ->private_lock           (try_to_unmap_one)
@@ -133,6 +134,9 @@ int filemap_fdatawrite(struct address_space *mapping)
                 return 0;
  
         current->flags |= PF_SYNC;
+       write_lock(&mapping->page_lock);
+       list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
+       write_unlock(&mapping->page_lock);
         ret = do_writepages(mapping, &wbc);
         current->flags &= ~PF_SYNC;
         return ret;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

index 9813676561576aa98d962434a64f2268e09f6513..3880394c8562bdcb8bf4f37ccb7a7b0e7ed222d9 100644 (file)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -286,6 +286,7 @@ static void wb_kupdate(unsigned long arg)
                 .older_than_this = &oldest_jif,
                 .nr_to_write    = 0,
                 .nonblocking    = 1,
+               .for_kupdate    = 1,
         };
  
         sync_supers();
@@ -299,7 +300,7 @@ static void wb_kupdate(unsigned long arg)
                 wbc.encountered_congestion = 0;
                 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                 writeback_inodes(&wbc);
-               if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
+               if (wbc.nr_to_write > 0) {
                         if (wbc.encountered_congestion)
                                 blk_congestion_wait(WRITE, HZ);
                         else
author	Andrew Morton <akpm@digeo.com>
	Sat, 14 Dec 2002 11:17:52 +0000 (03:17 -0800)
committer	Jaroslav Kysela <perex@suse.cz>
	Sat, 14 Dec 2002 11:17:52 +0000 (03:17 -0800)
Documentation/filesystems/Locking		patch \| blob \| history
fs/fs-writeback.c		patch \| blob \| history
fs/mpage.c		patch \| blob \| history
include/linux/writeback.h		patch \| blob \| history
mm/filemap.c		patch \| blob \| history
mm/page-writeback.c		patch \| blob \| history