[PATCH] hot-n-cold pages: page allocator core

author Andrew Morton <akpm@digeo.com>

Wed, 30 Oct 2002 07:35:53 +0000 (23:35 -0800)

committer Linus Torvalds <torvalds@penguin.transmeta.com>

Wed, 30 Oct 2002 07:35:53 +0000 (23:35 -0800)
author Andrew Morton <akpm@digeo.com>
Wed, 30 Oct 2002 07:35:53 +0000 (23:35 -0800)
committer Linus Torvalds <torvalds@penguin.transmeta.com>
Wed, 30 Oct 2002 07:35:53 +0000 (23:35 -0800)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h

index c340b447a96339e2214b194d38b5eac89a003260..8e093813e4f7a05772bb5f7c7258b9879b304b76 100644 (file)
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -17,6 +17,7 @@
  #define __GFP_IO       0x40    /* Can start low memory physical IO? */
  #define __GFP_HIGHIO   0x80    /* Can start high mem physical IO? */
  #define __GFP_FS       0x100   /* Can call down to low-level FS? */
+#define __GFP_COLD     0x200   /* Cache-cold page required */
  
  #define GFP_NOHIGHIO   (             __GFP_WAIT | __GFP_IO)
  #define GFP_NOIO       (             __GFP_WAIT)
@@ -32,6 +33,7 @@
  
  #define GFP_DMA                __GFP_DMA
  
+
  /*
   * There is only one page-allocator function, and two main namespaces to
   * it. The alloc_page*() variants return 'struct page *' and as such
@@ -77,11 +79,10 @@ extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask));
  #define __get_dma_pages(gfp_mask, order) \
                 __get_free_pages((gfp_mask) | GFP_DMA,(order))
  
-/*
- * There is only one 'core' page-freeing function.
- */
  extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
  extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
+extern void FASTCALL(free_hot_page(struct page *page));
+extern void FASTCALL(free_cold_page(struct page *page));
  
  #define __free_page(page) __free_pages((page), 0)
  #define free_page(addr) free_pages((addr),0)
diff --git a/include/linux/mm.h b/include/linux/mm.h

index cab2c4342047b1e28046ac8896bbc5c5b968b8f2..d9d2f20732d4cee737f61f9a5536ceaa73644cdc 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -211,7 +211,6 @@ struct page {
  #define set_page_count(p,v)    atomic_set(&(p)->count, v)
  
  extern void FASTCALL(__page_cache_release(struct page *));
-void FASTCALL(__free_pages_ok(struct page *page, unsigned int order));
  
  static inline void put_page(struct page *page)
  {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 10c4ee96802036e711fdb281369946c59534f829..d80490b1265c3edda1800fa1582ebbefa49dba72 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -9,6 +9,7 @@
  #include <linux/list.h>
  #include <linux/wait.h>
  #include <linux/cache.h>
+#include <linux/threads.h>
  #include <asm/atomic.h>
  #ifdef CONFIG_DISCONTIGMEM
  #include <asm/numnodes.h>
@@ -46,6 +47,18 @@ struct zone_padding {
  #define ZONE_PADDING(name)
  #endif
  
+struct per_cpu_pages {         /* one list of order-0 pages, kept per CPU within a zone */
+       int count;              /* number of pages in the list */
+       int low;                /* low watermark: buffered_rmqueue() refills when count <= low */
+       int high;               /* high watermark: free_hot_cold_page() drains when count >= high */
+       int batch;              /* chunk size for buddy add/remove (pages moved per refill or drain) */
+       struct list_head list;  /* the list of pages */
+};
+
+struct per_cpu_pageset {       /* struct zone embeds one of these per CPU: pageset[NR_CPUS] */
+       struct per_cpu_pages pcp[2];    /* 0: hot.  1: cold (indexed by the 'cold' flag) */
+} ____cacheline_aligned_in_smp;
+
  /*
   * On machines where it is needed (eg PCs) we divide physical memory
   * into multiple physical zones. On a PC we have 3 zones:
@@ -107,6 +120,10 @@ struct zone {
         unsigned long           wait_table_size;
         unsigned long           wait_table_bits;
  
+       ZONE_PADDING(_pad3_)
+
+       struct per_cpu_pageset  pageset[NR_CPUS];
+
         /*
          * Discontig memory support fields.
          */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index dd35f4d7ac498df6554d9320f82daecd35340562..f46471b255863a8081aa61f893919dd34e7a65af 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -10,6 +10,8 @@
   *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
   *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
   *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
   */
  
  #include <linux/config.h>
@@ -151,13 +153,14 @@ static inline void free_pages_check(const char *function, struct page *page)
   * Assumes all pages on list are in same zone, and of same order.
   * count is the number of pages to free, or 0 for all on the list.
   */
-static void
+static int
  free_pages_bulk(struct zone *zone, int count,
                 struct list_head *list, unsigned int order)
  {
         unsigned long mask, flags;
         struct free_area *area;
         struct page *base, *page = NULL;
+       int ret = 0;
  
         mask = (~0UL) << order;
         base = zone->zone_mem_map;
@@ -169,8 +172,10 @@ free_pages_bulk(struct zone *zone, int count,
                 list_del(&page->list);
                 __free_pages_bulk(page, base, zone, area, mask, order);
                 mod_page_state(pgfree, count<<order);
+               ret++;
         }
         spin_unlock_irqrestore(&zone->lock, flags);
+       return ret;
  }
  
  void __free_pages_ok(struct page *page, unsigned int order)
@@ -201,14 +206,13 @@ expand(struct zone *zone, struct page *page,
                 index += size;
                 page += size;
         }
-       BUG_ON(bad_range(zone, page));
         return page;
  }
  
  /*
   * This page is about to be returned from the page allocator
   */
-static inline void prep_new_page(struct page *page)
+static void prep_new_page(struct page *page)
  {
         if (    page->mapping ||
                 page_mapped(page) ||
@@ -248,36 +252,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
                         continue;
  
                 page = list_entry(curr, struct page, list);
-               BUG_ON(bad_range(zone, page));
                 list_del(curr);
                 index = page - zone->zone_mem_map;
                 if (current_order != MAX_ORDER-1)
                         MARK_USED(index, current_order, area);
                 zone->free_pages -= 1UL << order;
-               page = expand(zone, page, index, order, current_order, area);
-               return page;
+               return expand(zone, page, index, order, current_order, area);
         }
  
         return NULL;
  }
  
-/* Obtain a single element from the buddy allocator */
-static struct page *rmqueue(struct zone *zone, unsigned int order)
-{
-       unsigned long flags;
-       struct page *page;
-
-       spin_lock_irqsave(&zone->lock, flags);
-       page = __rmqueue(zone, order);
-       spin_unlock_irqrestore(&zone->lock, flags);
-
-       if (page != NULL) {
-               BUG_ON(bad_range(zone, page));
-               prep_new_page(page);
-       }
-       return page;
-}
-
  /* 
   * Obtain a specified number of elements from the buddy allocator, all under
   * a single hold of the lock, for efficiency.  Add them to the supplied list.
@@ -340,6 +325,72 @@ int is_head_of_free_region(struct page *page)
  }
  #endif /* CONFIG_SOFTWARE_SUSPEND */
  
+/*
+ * Free a 0-order page onto one of the current CPU's lists for the page's
+ * zone, rather than inserting it into the buddy free lists directly.
+ *
+ * @page: the page being freed; it is run through free_pages_check() first.
+ * @cold: index into pcp[]: 0 selects the hot list, 1 the cold list.
+ *
+ * If the chosen list has already reached its high watermark, one batch of
+ * pages is first handed back to the buddy lists via free_pages_bulk().
+ */
+static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
+static void free_hot_cold_page(struct page *page, int cold)
+{
+       struct zone *zone = page_zone(page);
+       struct per_cpu_pages *pcp;
+       unsigned long flags;
+
+       free_pages_check(__FUNCTION__, page);
+       pcp = &zone->pageset[get_cpu()].pcp[cold];      /* get_cpu() is paired with put_cpu() below */
+       local_irq_save(flags);  /* NOTE(review): presumably guards the list against interrupt-context users on this CPU - confirm */
+       if (pcp->count >= pcp->high)
+               pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); /* subtract however many pages were actually freed */
+       list_add(&page->list, &pcp->list);      /* buffered_rmqueue() takes pages from list.next */
+       pcp->count++;
+       local_irq_restore(flags);
+       put_cpu();
+}
+
+void free_hot_page(struct page *page)  /* free one 0-order page to the hot list */
+{
+       free_hot_cold_page(page, 0);    /* cold == 0 selects pcp[0], the hot list */
+}
+       
+void free_cold_page(struct page *page) /* free one 0-order page to the cold list */
+{
+       free_hot_cold_page(page, 1);    /* cold == 1 selects pcp[1], the cold list */
+}
+
+static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
+{      /* Returns a prepared page from @zone, or NULL if none could be obtained. */
+       unsigned long flags;
+       struct page *page = NULL;       /* stays NULL until a page is found */
+
+       if (order == 0) {       /* only single pages go through the per-cpu lists */
+               struct per_cpu_pages *pcp;
+
+               pcp = &zone->pageset[get_cpu()].pcp[cold];      /* cold: 0 = hot list, 1 = cold list */
+               local_irq_save(flags);
+               if (pcp->count <= pcp->low)     /* at or below the low watermark: refill one batch */
+                       pcp->count += rmqueue_bulk(zone, 0,
+                                               pcp->batch, &pcp->list);        /* NOTE(review): return value is presumably the number of pages added - confirm */
+               if (pcp->count) {
+                       page = list_entry(pcp->list.next, struct page, list);   /* first entry on the list */
+                       list_del(&page->list);
+                       pcp->count--;
+               }
+               local_irq_restore(flags);
+               put_cpu();
+       }
+
+       if (page == NULL) {     /* order != 0, or the per-cpu list yielded nothing */
+               spin_lock_irqsave(&zone->lock, flags);
+               page = __rmqueue(zone, order);  /* take directly from the buddy lists under zone->lock */
+               spin_unlock_irqrestore(&zone->lock, flags);
+       }
+
+       if (page != NULL) {
+               BUG_ON(bad_range(zone, page));
+               prep_new_page(page);    /* applied on both the per-cpu and the buddy path */
+       }
+       return page;
+}
+
  /*
   * This is the 'heart' of the zoned buddy allocator:
   */
@@ -349,13 +400,18 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
  {
         unsigned long min;
         struct zone **zones, *classzone;
-       struct page * page;
+       struct page *page;
         int cflags;
         int i;
+       int cold;
  
         if (gfp_mask & __GFP_WAIT)
                 might_sleep();
  
+       cold = 0;
+       if (gfp_mask & __GFP_COLD)
+               cold = 1;
+
         mod_page_state(pgalloc, 1<<order);
  
         zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
@@ -371,7 +427,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
                 /* the incremental min is allegedly to discourage fallback */
                 min += z->pages_low;
                 if (z->free_pages > min || z->free_pages >= z->pages_high) {
-                       page = rmqueue(z, order);
+                       page = buffered_rmqueue(z, order, cold);
                         if (page)
                                 return page;
                 }
@@ -396,7 +452,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
                         local_min >>= 2;
                 min += local_min;
                 if (z->free_pages > min || z->free_pages >= z->pages_high) {
-                       page = rmqueue(z, order);
+                       page = buffered_rmqueue(z, order, cold);
                         if (page)
                                 return page;
                 }
@@ -410,7 +466,7 @@ rebalance:
                 for (i = 0; zones[i] != NULL; i++) {
                         struct zone *z = zones[i];
  
-                       page = rmqueue(z, order);
+                       page = buffered_rmqueue(z, order, cold);
                         if (page)
                                 return page;
                 }
@@ -440,7 +496,7 @@ nopage:
  
                 min += z->pages_min;
                 if (z->free_pages > min || z->free_pages >= z->pages_high) {
-                       page = rmqueue(z, order);
+                       page = buffered_rmqueue(z, order, cold);
                         if (page)
                                 return page;
                 }
@@ -492,13 +548,17 @@ void __pagevec_free(struct pagevec *pvec)
         int i = pagevec_count(pvec);
  
         while (--i >= 0)
-               __free_pages_ok(pvec->pages[i], 0);
+               free_hot_page(pvec->pages[i]);
  }
  
  void __free_pages(struct page *page, unsigned int order)
  {
-       if (!PageReserved(page) && put_page_testzero(page))
-               __free_pages_ok(page, order);
+       if (!PageReserved(page) && put_page_testzero(page)) {
+               if (order == 0)
+                       free_hot_page(page);
+               else
+                       __free_pages_ok(page, order);
+       }
  }
  
  void free_pages(unsigned long addr, unsigned int order)
@@ -899,7 +959,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
         unsigned long i, j;
         unsigned long local_offset;
         const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
-       int nid = pgdat->node_id;
+       int cpu, nid = pgdat->node_id;
         struct page *lmem_map = pgdat->node_mem_map;
         unsigned long zone_start_pfn = pgdat->node_start_pfn;
  
@@ -911,13 +971,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                 struct zone *zone = pgdat->node_zones + j;
                 unsigned long mask;
                 unsigned long size, realsize;
+               unsigned long batch;
  
                 zone_table[nid * MAX_NR_ZONES + j] = zone;
                 realsize = size = zones_size[j];
                 if (zholes_size)
                         realsize -= zholes_size[j];
  
-               printk("  %s zone: %lu pages\n", zone_names[j], realsize);
                 zone->spanned_pages = size;
                 zone->present_pages = realsize;
                 zone->name = zone_names[j];
@@ -925,6 +985,40 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                 spin_lock_init(&zone->lru_lock);
                 zone->zone_pgdat = pgdat;
                 zone->free_pages = 0;
+
+               /*
+                * The per-cpu-pages pools are set to around 1/1000th of the
+                * size of the zone.  But no more than 1/4 of a meg - there's
+                * no point in going beyond the size of L2 cache.
+                *
+                * OK, so we don't know how big the cache is.  So guess.
+                */
+               batch = zone->present_pages / 1024;
+               if (batch * PAGE_SIZE > 256 * 1024)
+                       batch = (256 * 1024) / PAGE_SIZE;
+               batch /= 4;             /* We effectively *= 4 below */
+               if (batch < 1)
+                       batch = 1;
+
+               for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                       struct per_cpu_pages *pcp;
+
+                       pcp = &zone->pageset[cpu].pcp[0];       /* hot */
+                       pcp->count = 0;
+                       pcp->low = 2 * batch;
+                       pcp->high = 6 * batch;
+                       pcp->batch = 1 * batch;
+                       INIT_LIST_HEAD(&pcp->list);
+
+                       pcp = &zone->pageset[cpu].pcp[1];       /* cold */
+                       pcp->count = 0;
+                       pcp->low = 0;
+                       pcp->high = 2 * batch;
+                       pcp->batch = 1 * batch;
+                       INIT_LIST_HEAD(&pcp->list);
+               }
+               printk("  %s zone: %lu pages, LIFO batch:%lu\n",
+                               zone_names[j], realsize, batch);
                 INIT_LIST_HEAD(&zone->active_list);
                 INIT_LIST_HEAD(&zone->inactive_list);
                 atomic_set(&zone->refill_counter, 0);
diff --git a/mm/swap.c b/mm/swap.c

index 72f4c9cdd5c4f8a89be980f99c082f619035cc0f..225e24f1973dbcd3bc95a1d969f5a073e442a31e 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -69,7 +69,8 @@ void lru_add_drain(void)
  }
  
  /*
- * This path almost never happens - pages are normally freed via pagevecs.
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs.  But it gets used by networking.
   */
  void __page_cache_release(struct page *page)
  {
@@ -83,7 +84,7 @@ void __page_cache_release(struct page *page)
                 page = NULL;
         spin_unlock_irqrestore(&zone->lru_lock, flags);
         if (page)
-               __free_pages_ok(page, 0);
+               free_hot_page(page);
  }
  
  /*
author	Andrew Morton <akpm@digeo.com>
	Wed, 30 Oct 2002 07:35:53 +0000 (23:35 -0800)
committer	Linus Torvalds <torvalds@penguin.transmeta.com>
	Wed, 30 Oct 2002 07:35:53 +0000 (23:35 -0800)
include/linux/gfp.h		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
include/linux/mmzone.h		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/swap.c		patch \| blob \| history