[PATCH] Convert i_shared_sem back to a spinlock
author    Andrew Morton <akpm@osdl.org>
          Sat, 22 May 2004 15:02:36 +0000 (08:02 -0700)
committer Linus Torvalds <torvalds@ppc970.osdl.org>
          Sat, 22 May 2004 15:02:36 +0000 (08:02 -0700)
Having a semaphore in there causes modest performance regressions on heavily
mmap-intensive workloads on some hardware.  Specifically, up to 30% in SDET on
NUMAQ and big PPC64.

So switch it back to being a spinlock.  This does mean that unmap_vmas() needs
to be told whether or not it is allowed to schedule away; that's simple to do
via the zap_details structure.

This change means that there will be high scheduling latencies when someone
truncates a large file which is currently mmapped, but nobody does that
anyway.  The scheduling points in unmap_vmas() are mainly for munmap() and
exit(), and they will still work OK for those.
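
In outline, the new plumbing (condensed from the include/linux/mm.h and
mm/memory.c hunks below) looks like this:

	struct zap_details {
		struct address_space *check_mapping;	/* Check page->mapping if set */
		pgoff_t first_index;			/* Lowest page->index to unmap */
		pgoff_t last_index;			/* Highest page->index to unmap */
		int atomic;				/* May not schedule() */
	};

	/* unmap_vmas() reschedules only when the caller permits it: */
	int atomic = details && details->atomic;
	...
	if (!atomic && need_resched()) {
		/* finish the TLB gather, cond_resched_lock(), restart */
	}

	/* unmap_mapping_range() forbids it, since i_mmap_lock is now held: */
	details.atomic = 1;	/* A spinlock is held */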

From: Hugh Dickins <hugh@veritas.com>

  Sorry, my premature optimizations (trying to pass down NULL zap_details
  except when needed) have caught you out doubly: unmap_mapping_range_list was
  NULLing the details even though atomic was set; and if it hadn't, then
  zap_pte_range would have missed free_swap_and_cache and pte_clear when pte
  not present.  Moved the optimization into zap_pte_range itself.  Plus
  massive documentation update.
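
Moving it there means unmap_mapping_range_list() can simply pass details
through; the check at the top of zap_pte_range() (the two added lines in the
mm/memory.c hunk below, shown here with an explanatory comment) is:

	/*
	 * If the caller passed details, but none that this level acts on,
	 * drop them: the loop then takes its cheap details==NULL path, and
	 * free_swap_and_cache()/pte_clear() still run for not-present ptes.
	 */
	if (details && !details->check_mapping && !details->nonlinear_vma)
		details = NULL;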

From: Hugh Dickins <hugh@veritas.com>

  Here's a second patch to add to the first: mremap's cows can't come home
  without releasing the i_mmap_lock, so better to move the whole "Subtle point"
  locking from move_vma into move_page_tables.  And it's possible for the file
  that was behind an anonymous page to be truncated while we drop that lock, so
  we don't want to abort mremap because of VM_FAULT_SIGBUS.

  (Eek, should we be checking do_swap_page of a vm_file area against the
  truncate_count sequence?  Technically yes, but I doubt we need bother.)

- We cannot hold i_mmap_lock across move_one_page() because
  move_one_page() needs to perform __GFP_WAIT allocations of pagetable pages.

- Move the cond_resched() out so we test it once per page rather than only
  when move_one_page() returns -EAGAIN, as sketched below.
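
A rough sketch of the resulting per-page locking (the mm/mremap.c hunks below
show only the move_vma() side; names like alloc_one_pte_map() follow the
mremap.c of the era, and this structure is an approximation, not the literal
patch):

	spin_lock(&mapping->i_mmap_lock);	/* the "Subtle point" lock */
	/* move the pte here, unless the destination pagetable is missing: */
	if (!dst) {
		/* can't hold a spinlock across a __GFP_WAIT allocation */
		spin_unlock(&mapping->i_mmap_lock);
		dst = alloc_one_pte_map(mm, new_addr);
		spin_lock(&mapping->i_mmap_lock);
		/* revalidate: truncate may have run while we dropped it */
	}
	spin_unlock(&mapping->i_mmap_lock);
	cond_resched();			/* tested once per page, -EAGAIN or not */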

13 files changed:
Documentation/vm/locking
fs/hugetlbfs/inode.c
fs/inode.c
include/linux/fs.h
include/linux/mm.h
include/linux/rmap.h
kernel/fork.c
mm/filemap.c
mm/madvise.c
mm/memory.c
mm/mmap.c
mm/mremap.c
mm/rmap.c

diff --git a/Documentation/vm/locking b/Documentation/vm/locking
index d29992e40aa28d789ab7b80bb0def499e2a78f39..c3ef09ae3bb114f484247389776415ce401ed405 100644
--- a/Documentation/vm/locking
+++ b/Documentation/vm/locking
@@ -66,7 +66,7 @@ in some cases it is not really needed. Eg, vm_start is modified by
 expand_stack(), it is hard to come up with a destructive scenario without 
 having the vmlist protection in this case.
 
-The page_table_lock nests with the inode i_shared_sem and the kmem cache
+The page_table_lock nests with the inode i_mmap_lock and the kmem cache
 c_spinlock spinlocks.  This is okay, since the kmem code asks for pages after
 dropping c_spinlock.  The page_table_lock also nests with pagecache_lock and
 pagemap_lru_lock spinlocks, and no code asks for memory with these locks
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4290213f401004bff5ebf469a22e9846b6d391bb..f83ccf8530d807e547e4275fa705f56709a45b66 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -321,12 +321,12 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
        pgoff = offset >> HPAGE_SHIFT;
 
        inode->i_size = offset;
-       down(&mapping->i_shared_sem);
+       spin_lock(&mapping->i_mmap_lock);
        if (!list_empty(&mapping->i_mmap))
                hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
        if (!list_empty(&mapping->i_mmap_shared))
                hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
-       up(&mapping->i_shared_sem);
+       spin_unlock(&mapping->i_mmap_lock);
        truncate_hugepages(mapping, offset);
        return 0;
 }
diff --git a/fs/inode.c b/fs/inode.c
index 4e8cbca353f71fb9adebff5e2281dd81264de551..4897c3a35c848810c5f09b43a8f555a8ae2aa27c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -196,7 +196,7 @@ void inode_init_once(struct inode *inode)
        init_rwsem(&inode->i_alloc_sem);
        INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
        spin_lock_init(&inode->i_data.tree_lock);
-       init_MUTEX(&inode->i_data.i_shared_sem);
+       spin_lock_init(&inode->i_data.i_mmap_lock);
        atomic_set(&inode->i_data.truncate_count, 0);
        INIT_LIST_HEAD(&inode->i_data.private_list);
        spin_lock_init(&inode->i_data.private_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a10ba0ee9c1ae86f0e6da33f273879a3ab0f32ed..c30ed84a5f7f60e7ab18d2c67917212349f0f0d4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -331,7 +331,7 @@ struct address_space {
        struct address_space_operations *a_ops; /* methods */
        struct list_head        i_mmap;         /* list of private mappings */
        struct list_head        i_mmap_shared;  /* list of shared mappings */
-       struct semaphore        i_shared_sem;   /* protect both above lists */
+       spinlock_t              i_mmap_lock;    /* protect both above lists */
        atomic_t                truncate_count; /* Cover race condition with truncate */
        unsigned long           flags;          /* error bits/gfp mask */
        struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 905e87989dd6b6e9a34273f6074ff13ea03c8f52..75cdd2d98da6b71f870a6c266f79fdfd26068db8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -477,6 +477,7 @@ struct zap_details {
        struct address_space *check_mapping;    /* Check page->mapping if set */
        pgoff_t first_index;                    /* Lowest page->index to unmap */
        pgoff_t last_index;                     /* Highest page->index to unmap */
+       int atomic;                             /* May not schedule() */
 };
 
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 33bfd51cf009da2521ce2c003b4a0825f88ccdb0..2e48763371be5e9214cdc7b77efacc8509d51fe9 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -69,15 +69,9 @@ static inline int mremap_moved_anon_rmap(struct page *page, unsigned long addr)
 static inline int make_page_exclusive(struct vm_area_struct *vma,
                                        unsigned long addr)
 {
-       switch (handle_mm_fault(vma->vm_mm, vma, addr, 1)) {
-       case VM_FAULT_MINOR:
-       case VM_FAULT_MAJOR:
+       if (handle_mm_fault(vma->vm_mm, vma, addr, 1) != VM_FAULT_OOM)
                return 0;
-       case VM_FAULT_OOM:
-               return -ENOMEM;
-       default:
-               return -EFAULT;
-       }
+       return -ENOMEM;
 }
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 47b8f8e6f78730960bfe4bcbe1885793f0fb846e..3cb3bc41b0c0858ee45f812df7fdf987a747a5f4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -324,9 +324,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
                                atomic_dec(&inode->i_writecount);
       
                        /* insert tmp into the share list, just after mpnt */
-                       down(&file->f_mapping->i_shared_sem);
+                       spin_lock(&file->f_mapping->i_mmap_lock);
                        list_add(&tmp->shared, &mpnt->shared);
-                       up(&file->f_mapping->i_shared_sem);
+                       spin_unlock(&file->f_mapping->i_mmap_lock);
                }
 
                /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 1dcf80bf791818a966d47feaf1d9196e6812396f..804d96e88fd78a9ec31662ef1661af2c3d3f9dd5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
 /*
  * Lock ordering:
  *
- *  ->i_shared_sem             (vmtruncate)
+ *  ->i_mmap_lock              (vmtruncate)
  *    ->private_lock           (__free_pte->__set_page_dirty_buffers)
  *      ->swap_list_lock
  *        ->swap_device_lock   (exclusive_swap_page, others)
  *          ->mapping->tree_lock
  *
  *  ->i_sem
- *    ->i_shared_sem           (truncate->unmap_mapping_range)
+ *    ->i_mmap_lock            (truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_shared_sem           (various places)
+ *    ->i_mmap_lock            (various places)
  *
  *  ->mmap_sem
  *    ->lock_page              (access_process_vm)
diff --git a/mm/madvise.c b/mm/madvise.c
index 4b1360317c7cf7db720048d804fe4e5a7a858777..e54f3906d8ccaf0645495bebb8c57345a8980ae8 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -92,16 +92,14 @@ static long madvise_willneed(struct vm_area_struct * vma,
 static long madvise_dontneed(struct vm_area_struct * vma,
                             unsigned long start, unsigned long end)
 {
-       struct zap_details details;
-
        if (vma->vm_flags & VM_LOCKED)
                return -EINVAL;
 
        if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
-               details.check_mapping = NULL;
-               details.nonlinear_vma = vma;
-               details.first_index = 0;
-               details.last_index = ULONG_MAX;
+               struct zap_details details = {
+                       .nonlinear_vma = vma,
+                       .last_index = ULONG_MAX,
+               };
                zap_page_range(vma, start, end - start, &details);
        } else
                zap_page_range(vma, start, end - start, NULL);
diff --git a/mm/memory.c b/mm/memory.c
index 1c2ce1f326e596f36a8b9f3d622368cea0bb9fcf..b36b907bcaa04092bca483c1e1f23a1cdbe89f91 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -365,6 +365,8 @@ static void zap_pte_range(struct mmu_gather *tlb,
        if (offset + size > PMD_SIZE)
                size = PMD_SIZE - offset;
        size &= PAGE_MASK;
+       if (details && !details->check_mapping && !details->nonlinear_vma)
+               details = NULL;
        for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
                pte_t pte = *ptep;
                if (pte_none(pte))
@@ -518,6 +520,7 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
        unsigned long tlb_start = 0;    /* For tlb_finish_mmu */
        int tlb_start_valid = 0;
        int ret = 0;
+       int atomic = details && details->atomic;
 
        for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
                unsigned long start;
@@ -555,7 +558,7 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
                        zap_bytes -= block;
                        if ((long)zap_bytes > 0)
                                continue;
-                       if (need_resched()) {
+                       if (!atomic && need_resched()) {
                                int fullmm = tlb_is_full_mm(*tlbp);
                                tlb_finish_mmu(*tlbp, tlb_start, start);
                                cond_resched_lock(&mm->page_table_lock);
@@ -583,8 +586,6 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address,
        unsigned long end = address + size;
        unsigned long nr_accounted = 0;
 
-       might_sleep();
-
        if (is_vm_hugetlb_page(vma)) {
                zap_hugepage_range(vma, address, size);
                return;
@@ -1133,8 +1134,7 @@ static void unmap_mapping_range_list(struct list_head *head,
                        zea = vea;
                zap_page_range(vma,
                        ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
-                       (zea - zba + 1) << PAGE_SHIFT,
-                       details->check_mapping? details: NULL);
+                       (zea - zba + 1) << PAGE_SHIFT, details);
        }
 }
 
@@ -1155,7 +1155,7 @@ static void unmap_mapping_range_list(struct list_head *head,
  * but 0 when invalidating pagecache, don't throw away private data.
  */
 void unmap_mapping_range(struct address_space *mapping,
-       loff_t const holebegin, loff_t const holelen, int even_cows)
+               loff_t const holebegin, loff_t const holelen, int even_cows)
 {
        struct zap_details details;
        pgoff_t hba = holebegin >> PAGE_SHIFT;
@@ -1173,10 +1173,11 @@ void unmap_mapping_range(struct address_space *mapping,
        details.nonlinear_vma = NULL;
        details.first_index = hba;
        details.last_index = hba + hlen - 1;
+       details.atomic = 1;     /* A spinlock is held */
        if (details.last_index < details.first_index)
                details.last_index = ULONG_MAX;
 
-       down(&mapping->i_shared_sem);
+       spin_lock(&mapping->i_mmap_lock);
        /* Protect against page fault */
        atomic_inc(&mapping->truncate_count);
        if (unlikely(!list_empty(&mapping->i_mmap)))
@@ -1187,7 +1188,7 @@ void unmap_mapping_range(struct address_space *mapping,
 
        if (unlikely(!list_empty(&mapping->i_mmap_shared)))
                unmap_mapping_range_list(&mapping->i_mmap_shared, &details);
-       up(&mapping->i_shared_sem);
+       spin_unlock(&mapping->i_mmap_lock);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 54aab8d30d4eaf6fc42e03378d0aaccb5be272f2..0f66a034273d9b7b67970c68288bc584a4977078 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -63,7 +63,7 @@ EXPORT_SYMBOL(sysctl_max_map_count);
 EXPORT_SYMBOL(vm_committed_space);
 
 /*
- * Requires inode->i_mapping->i_shared_sem
+ * Requires inode->i_mapping->i_mmap_lock
  */
 static inline void
 __remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
@@ -84,9 +84,9 @@ static void remove_shared_vm_struct(struct vm_area_struct *vma)
 
        if (file) {
                struct address_space *mapping = file->f_mapping;
-               down(&mapping->i_shared_sem);
+               spin_lock(&mapping->i_mmap_lock);
                __remove_shared_vm_struct(vma, file->f_dentry->d_inode);
-               up(&mapping->i_shared_sem);
+               spin_unlock(&mapping->i_mmap_lock);
        }
 }
 
@@ -286,12 +286,12 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
                mapping = vma->vm_file->f_mapping;
 
        if (mapping)
-               down(&mapping->i_shared_sem);
+               spin_lock(&mapping->i_mmap_lock);
        spin_lock(&mm->page_table_lock);
        __vma_link(mm, vma, prev, rb_link, rb_parent);
        spin_unlock(&mm->page_table_lock);
        if (mapping)
-               up(&mapping->i_shared_sem);
+               spin_unlock(&mapping->i_mmap_lock);
 
        mark_mm_hugetlb(mm, vma);
        mm->map_count++;
@@ -301,7 +301,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 /*
  * Insert vm structure into process list sorted by address and into the inode's
  * i_mmap ring. The caller should hold mm->page_table_lock and
- * ->f_mappping->i_shared_sem if vm_file is non-NULL.
+ * ->f_mappping->i_mmap_lock if vm_file is non-NULL.
  */
 static void
 __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
@@ -391,7 +391,7 @@ static struct vm_area_struct *vma_merge(struct mm_struct *mm,
 {
        spinlock_t *lock = &mm->page_table_lock;
        struct inode *inode = file ? file->f_dentry->d_inode : NULL;
-       struct semaphore *i_shared_sem;
+       spinlock_t *i_mmap_lock;
 
        /*
         * We later require that vma->vm_flags == vm_flags, so this tests
@@ -400,7 +400,7 @@ static struct vm_area_struct *vma_merge(struct mm_struct *mm,
        if (vm_flags & VM_SPECIAL)
                return NULL;
 
-       i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;
+       i_mmap_lock = file ? &file->f_mapping->i_mmap_lock : NULL;
 
        if (!prev) {
                prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
@@ -417,7 +417,7 @@ static struct vm_area_struct *vma_merge(struct mm_struct *mm,
 
                if (unlikely(file && prev->vm_next &&
                                prev->vm_next->vm_file == file)) {
-                       down(i_shared_sem);
+                       spin_lock(i_mmap_lock);
                        need_up = 1;
                }
                spin_lock(lock);
@@ -435,7 +435,7 @@ static struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        __remove_shared_vm_struct(next, inode);
                        spin_unlock(lock);
                        if (need_up)
-                               up(i_shared_sem);
+                               spin_unlock(i_mmap_lock);
                        if (file)
                                fput(file);
 
@@ -445,7 +445,7 @@ static struct vm_area_struct *vma_merge(struct mm_struct *mm,
                }
                spin_unlock(lock);
                if (need_up)
-                       up(i_shared_sem);
+                       spin_unlock(i_mmap_lock);
                return prev;
        }
 
@@ -460,13 +460,13 @@ static struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        return NULL;
                if (end == prev->vm_start) {
                        if (file)
-                               down(i_shared_sem);
+                               spin_lock(i_mmap_lock);
                        spin_lock(lock);
                        prev->vm_start = addr;
                        prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
                        spin_unlock(lock);
                        if (file)
-                               up(i_shared_sem);
+                               spin_unlock(i_mmap_lock);
                        return prev;
                }
        }
@@ -1232,7 +1232,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
                 mapping = vma->vm_file->f_mapping;
 
        if (mapping)
-               down(&mapping->i_shared_sem);
+               spin_lock(&mapping->i_mmap_lock);
        spin_lock(&mm->page_table_lock);
 
        if (new_below) {
@@ -1245,7 +1245,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 
        spin_unlock(&mm->page_table_lock);
        if (mapping)
-               up(&mapping->i_shared_sem);
+               spin_unlock(&mapping->i_mmap_lock);
 
        return 0;
 }
@@ -1479,7 +1479,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap ring.  If vm_file is non-NULL
- * then i_shared_sem is taken here.
+ * then i_mmap_lock is taken here.
  */
 void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
diff --git a/mm/mremap.c b/mm/mremap.c
index b54bbfb5cb795edbdbb774ca1b0b0722650b3189..d7a9b0f3d7cbb10aa08a6a5cc50e851f7c288041 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -180,7 +180,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                unsigned long new_len, unsigned long new_addr)
 {
        struct mm_struct *mm = vma->vm_mm;
-       struct address_space *mapping = NULL;
        struct vm_area_struct *new_vma;
        unsigned long vm_flags = vma->vm_flags;
        unsigned long new_pgoff;
@@ -201,16 +200,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        if (!new_vma)
                return -ENOMEM;
 
-       if (vma->vm_file) {
-               /*
-                * Subtle point from Rajesh Venkatasubramanian: before
-                * moving file-based ptes, we must lock vmtruncate out,
-                * since it might clean the dst vma before the src vma,
-                * and we propagate stale pages into the dst afterward.
-                */
-               mapping = vma->vm_file->f_mapping;
-               down(&mapping->i_shared_sem);
-       }
        moved_len = move_page_tables(vma, new_addr, old_addr, old_len, &cows);
        if (moved_len < old_len) {
                /*
@@ -227,8 +216,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        if (cows)       /* Downgrade or remove this message later */
                printk(KERN_WARNING "%s: mremap moved %d cows\n",
                                                        current->comm, cows);
-       if (mapping)
-               up(&mapping->i_shared_sem);
 
        /* Conceal VM_ACCOUNT so old reservation is not undone */
        if (vm_flags & VM_ACCOUNT) {
diff --git a/mm/rmap.c b/mm/rmap.c
index a8267f6945ce4d5283694dd4bdce89caf58011e7..e0290854e30a5e03176b39b06e78c5e665bcdc88 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -300,7 +300,7 @@ migrate:
  *
  * This function is only called from page_referenced for object-based pages.
  *
- * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * The semaphore address_space->i_mmap_lock is tried.  If it can't be gotten,
  * assume a reference count of 0, so try_to_unmap will then have a go.
  */
 static inline int page_referenced_file(struct page *page)
@@ -313,7 +313,7 @@ static inline int page_referenced_file(struct page *page)
        int referenced = 0;
        int failed = 0;
 
-       if (down_trylock(&mapping->i_shared_sem))
+       if (!spin_trylock(&mapping->i_mmap_lock))
                return 0;
 
        list_for_each_entry(vma, &mapping->i_mmap, shared) {
@@ -355,7 +355,7 @@ static inline int page_referenced_file(struct page *page)
 
        WARN_ON(!failed);
 out:
-       up(&mapping->i_shared_sem);
+       spin_unlock(&mapping->i_mmap_lock);
        return referenced;
 }
 
@@ -725,7 +725,7 @@ out:
  *
  * This function is only called from try_to_unmap for object-based pages.
  *
- * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * The semaphore address_space->i_mmap_lock is tried.  If it can't be gotten,
  * return a temporary error.
  */
 static inline int try_to_unmap_file(struct page *page)
@@ -740,7 +740,7 @@ static inline int try_to_unmap_file(struct page *page)
        unsigned long max_nl_cursor = 0;
        unsigned long max_nl_size = 0;
 
-       if (down_trylock(&mapping->i_shared_sem))
+       if (!spin_trylock(&mapping->i_mmap_lock))
                return ret;
 
        list_for_each_entry(vma, &mapping->i_mmap, shared) {
@@ -839,7 +839,7 @@ static inline int try_to_unmap_file(struct page *page)
 relock:
        page_map_lock(page);
 out:
-       up(&mapping->i_shared_sem);
+       spin_unlock(&mapping->i_mmap_lock);
        return ret;
 }