
[Xen-devel] [PATCH] 2/7 xen: Add basic NUMA support - Page Allocator



This patch modifies the Xen page allocator to provide a simple
NUMA-aware API.  The goal is a good first effort at preferring local
memory over any other memory that fits the request.

To help determine whether an allocation request is local, we require a
cpu argument.  We've added two new alloc functions, __alloc_heap_pages()
and __alloc_domheap_pages(), which take a cpu argument.  We've retained
the original alloc() calls for callers that have no cpu preference.
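
For illustration, a call site with a placement preference might look
like the sketch below.  This is a hypothetical caller, not part of the
patch; 'd', 'order', and 'flags' are assumed to already be in scope,
and 'v' stands for the vcpu being serviced:

    struct pfn_info *pg;

    /* Caller with a placement preference passes an explicit cpu. */
    pg = __alloc_domheap_pages(d, order, flags, v->processor);

    /* Caller with no preference keeps the existing API; the wrapper
     * simply forwards current->processor. */
    pg = alloc_domheap_pages(d, order, flags);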

The bulk of the work of finding a local page is in
__find_local_page(), which scans the pfn list of the indicated order
and tests whether each pfn falls entirely within a memory chunk that is
local to the requesting cpu.  If none of the pfns in the list are
local, we use the first pfn in the list.  This approach is very simple
and will likely be tuned after more testing has occurred.
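
In essence, the locality test for a candidate page reduces to the check
below.  This is simplified from __find_local_page() in the patch; 'c'
is one node_memory_chunk_s entry and 'nid' is cpu_to_node[cpu]:

    start = page_to_phys(pg);
    end   = start + (PAGE_SIZE << order);
    /* Local iff the page's whole range lies inside a chunk that
     * belongs to the requesting cpu's node. */
    if ( (c->nid == nid) && (c->start_paddr <= start) &&
         (end <= c->end_paddr) )
        /* use this page */;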

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@xxxxxxxxxx


diffstat output:
 common/page_alloc.c |  206 +++++++++++++++++++++++++++++++++++++++++++++-------
 include/xen/sched.h |    6 +
 2 files changed, 186 insertions(+), 26 deletions(-)

Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
Signed-off-by: Ryan Grimm <grimm@xxxxxxxxxx>

---
diff -r d7322c375c6e xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Mon Nov 14 22:50:19 2005
+++ b/xen/common/page_alloc.c   Tue Nov 15 12:28:51 2005
@@ -4,6 +4,7 @@
  * Simple buddy heap allocator for Xen.
  * 
  * Copyright (c) 2002-2004 K A Fraser
+ * Copyright (c) 2005 IBM
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -33,6 +34,14 @@
 #include <xen/shadow.h>
 #include <xen/domain_page.h>
 #include <asm/page.h>
+#ifdef CONFIG_NUMA
+#include <xen/numa.h>
+#endif
+
+#define MEMZONE_XEN 0
+#define MEMZONE_DOM 1
+#define MEMZONE_DMADOM 2
+#define NR_ZONES    3
 
 /*
  * Comma-separated list of hexadecimal page numbers containing bad bytes.
@@ -135,6 +144,14 @@
 physaddr_t init_boot_allocator(physaddr_t bitmap_start)
 {
     unsigned long bitmap_size;
+
+#ifdef CONFIG_NUMA
+    /* clear stats */
+    memset(page_alloc, 0, NR_ZONES * sizeof(page_alloc[0]));
+    memset(page_free, 0, NR_ZONES * sizeof(page_free[0]));
+    memset(numa_hit, 0, NR_ZONES * sizeof(numa_hit[0]));
+    memset(numa_miss, 0, NR_ZONES * sizeof(numa_miss[0]));
+#endif
 
     bitmap_start = round_pgup(bitmap_start);
 
@@ -210,12 +227,6 @@
  * BINARY BUDDY ALLOCATOR
  */
 
-#define MEMZONE_XEN 0
-#define MEMZONE_DOM 1
-#define MEMZONE_DMADOM 2
-#define NR_ZONES    3
-
-
 #define MAX_DMADOM_PFN 0x7FFFFUL /* 31 addressable bits */
 #define pfn_dom_zone_type(_pfn)                                 \
     (((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM)
@@ -226,6 +237,26 @@
 
 static unsigned long avail[NR_ZONES];
 
+#ifdef CONFIG_NUMA
+/* min and max paddr per node */
+extern u64 node_min_paddr[];
+extern u64 node_max_paddr[];
+extern int num_memory_chunks;
+extern struct node_memory_chunk_s node_memory_chunk[];
+extern int cpu_to_node[];
+extern cpumask_t node_to_cpumask[];
+
+/* NUMA/Alloc stats per zone */
+unsigned long page_alloc[NR_ZONES];  /* total page allocs */
+unsigned long page_free[NR_ZONES];   /* total page frees */
+unsigned long numa_hit[NR_ZONES];    /* allocated in intended node */
+unsigned long numa_miss[NR_ZONES];   /* allocated in non intended node */
+EXPORT_SYMBOL(page_alloc);
+EXPORT_SYMBOL(page_free);
+EXPORT_SYMBOL(numa_hit);
+EXPORT_SYMBOL(numa_miss);
+#endif
+ 
 static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED;
 
 void end_boot_allocator(void)
@@ -263,31 +294,122 @@
         free_heap_pages(zone, pg+i, 0);
 }
 
-
-/* Allocate 2^@order contiguous pages. */
-struct pfn_info *alloc_heap_pages(unsigned int zone, unsigned int order)
+#ifdef CONFIG_NUMA
+/* A local pfn is one whose physical address range falls within
+ * a node_memory_chunk which belongs to the designated cpu's node.
+ * Returns NULL if no pfns in the list are local.
+ * Assumes the caller holds the heap lock.
+ */
+struct pfn_info *__find_local_page(struct list_head *pfn_list,
+                                   int list_order, int cpu)
+{
+    int i, nid;
+    u64 cpu_start; /* lowest start addr near cpu */
+    u64 cpu_end;   /* highest end addr near cpu */
+    u64 start = 0, end = 0, length = (PAGE_SIZE << list_order);
+    struct pfn_info *pg = NULL;
+    struct node_memory_chunk_s *c;
+
+    nid = cpu_to_node[cpu];
+    cpu_start = node_min_paddr[nid];
+    cpu_end = node_max_paddr[nid];
+
+    list_for_each_entry ( pg, pfn_list, list )
+    {
+        /* find starting/ending pa of page */
+        start = page_to_phys(pg);
+        end = start + length;
+
+        /* If the current page is within the cpu's [min, max] range,
+         * dig deeper to see whether it falls entirely within a single
+         * chunk rather than spanning two chunks.
+         */
+        if ( (cpu_start <= start) && (end <= cpu_end) )
+        {
+            /* Walk the chunk array; bail if we find a chunk bound to
+             * this cpu's node that contains the whole page range.
+             */
+            for ( i = 0; i < num_memory_chunks; i++ ) {
+                c = &node_memory_chunk[i];
+                if ( (c->nid == nid) && (c->start_paddr <= start) &&
+                     (end <= c->end_paddr) ) {
+                    DPRINTK("NUMA hit: (%"PRIx64",%"PRIx64") is in "
+                            "chunk(%"PRIx64",%"PRIx64")\n",
+                            start, end, c->start_paddr, c->end_paddr);
+                    goto out;
+                }
+            }
+        }
+    }
+    /* No local page found; return NULL. */
+    DPRINTK("NUMA miss: (%"PRIx64",%"PRIx64") not in CPU%d chunk "
+            "max range(%"PRIx64",%"PRIx64")\n",
+            start, end, cpu, cpu_start, cpu_end);
+    pg = NULL;
+
+ out:
+    return pg;
+}
+#endif
+
+/* Encapsulate the work of finding a pfn that satisfies the request
+ * in a NUMA-aware manner if configured.  For now, we attempt to find a
+ * local page (via __find_local_page) and, if that fails, we allocate
+ * the first pfn in the first non-empty list for the given order.
+ *
+ * A second pass at this should try to find the *next* closest page to
+ * the requestor.  Even without *next* closest, we could search up the
+ * order list, though that would possibly increase memory fragmentation.
+ *
+ * Without CONFIG_NUMA, __get_page() returns the first page in the
+ * first non-empty bucket of order >= the requested order.
+ *
+ * Assumes the caller is holding heap_lock.
+ */
+struct pfn_info *__get_page(unsigned int zone, 
+                            unsigned int order, 
+                            int cpu, 
+                            struct domain *d)
 {
     int i;
-    struct pfn_info *pg;
+    struct pfn_info *pg = NULL;
 
     ASSERT(zone < NR_ZONES);
 
     if ( unlikely(order > MAX_ORDER) )
         return NULL;
 
-    spin_lock(&heap_lock);
-
     /* Find smallest order which can satisfy the request. */
-    for ( i = order; i <= MAX_ORDER; i++ )
-        if ( !list_empty(&heap[zone][i]) )
-            goto found;
-
+    for ( i = order; i <= MAX_ORDER; i++ ) {
+        if ( !list_empty(&heap[zone][i]) ) {
+#ifdef CONFIG_NUMA
+            /* see if we can get a local one if we have multiple chunks */
+            if ( num_memory_chunks > 1 ) {
+                pg = __find_local_page(&heap[zone][i], i, cpu);
+                /* increment counters based on pg */
+                (pg) ?  numa_hit[zone]++ : numa_miss[zone]++;
+                (pg) ?  d->numa_hit++ : d->numa_miss++;
+            }
+            if ( pg == NULL ) {
+                /* No local pages, just take the first */
+                pg = list_entry(heap[zone][i].next, struct pfn_info, list);
+            }
+            if ( pg != NULL ) {
+                d->page_alloc++;
+                page_alloc[zone]++; /* count successful allocs */
+                goto found;
+            }
+#else
+            if( (pg = list_entry(heap[zone][i].next, struct pfn_info, list)) )
+                goto found;
+#endif
+        }
+    }
     /* No suitable memory blocks. Fail the request. */
-    spin_unlock(&heap_lock);
     return NULL;
 
  found: 
-    pg = list_entry(heap[zone][i].next, struct pfn_info, list);
+    ASSERT(pg != NULL);
     list_del(&pg->list);
 
     /* We may have to halve the chunk a number of times. */
@@ -297,15 +419,34 @@
         list_add_tail(&pg->list, &heap[zone][i]);
         pg += 1 << i;
     }
-    
-    map_alloc(page_to_pfn(pg), 1 << order);
-    avail[zone] -= 1 << order;
-
+ 
+    return pg;
+}
+
+/* Allocate 2^@order contiguous pages. */
+struct pfn_info *__alloc_heap_pages(unsigned int zone, 
+                                    unsigned int order, 
+                                    int cpu, 
+                                    struct domain *d)
+{
+    struct pfn_info *pg;
+
+    spin_lock(&heap_lock);
+    pg = __get_page(zone, order, cpu, d);
+    if (likely(pg != NULL)) {
+        map_alloc(page_to_pfn(pg), 1 << order);
+        avail[zone] -= 1 << order;
+    }
     spin_unlock(&heap_lock);
 
     return pg;
 }
 
+/* wrapper for previous API */
+inline struct pfn_info *alloc_heap_pages(unsigned int zone, unsigned int order)
+{
+    return __alloc_heap_pages(zone, order, current->processor,
+                              current->domain);
+}
 
 /* Free 2^@order set of pages. */
 void free_heap_pages(
@@ -317,6 +458,11 @@
     ASSERT(order <= MAX_ORDER);
 
     spin_lock(&heap_lock);
+
+#ifdef CONFIG_NUMA
+    /* increment page_free counter */
+    page_free[zone]++;
+#endif
 
     map_free(page_to_pfn(pg), 1 << order);
     avail[zone] += 1 << order;
@@ -507,8 +653,9 @@
 }
 
 
-struct pfn_info *alloc_domheap_pages(
-    struct domain *d, unsigned int order, unsigned int flags)
+struct pfn_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int order, unsigned int flags, 
+    int cpu)
 {
     struct pfn_info *pg = NULL;
     cpumask_t mask;
@@ -517,10 +664,10 @@
     ASSERT(!in_irq());
 
     if ( !(flags & ALLOC_DOM_DMA) )
-        pg = alloc_heap_pages(MEMZONE_DOM, order);
+        pg = __alloc_heap_pages(MEMZONE_DOM, order, cpu, d);
 
     if ( pg == NULL )
-        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL )
+        if ( (pg = __alloc_heap_pages(MEMZONE_DMADOM, order, cpu, d)) == NULL )
             return NULL;
 
     mask = pg->u.free.cpumask;
@@ -584,6 +731,13 @@
     return pg;
 }
 
+/* old api wrapper */
+inline struct pfn_info *alloc_domheap_pages(struct domain *d, 
+                                            unsigned int order, 
+                                            unsigned int flags)
+{
+    return __alloc_domheap_pages(d, order, flags, current->processor);
+}
 
 void free_domheap_pages(struct pfn_info *pg, unsigned int order)
 {
diff -r d7322c375c6e xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Mon Nov 14 22:50:19 2005
+++ b/xen/include/xen/sched.h   Tue Nov 15 12:28:51 2005
@@ -102,6 +102,12 @@
     unsigned int     tot_pages;       /* number of pages currently possesed */
     unsigned int     max_pages;       /* maximum value for tot_pages        */
     unsigned int     xenheap_pages;   /* # pages allocated from Xen heap    */
+
+#ifdef CONFIG_NUMA
+    unsigned long    numa_hit;
+    unsigned long    numa_miss;
+    unsigned long    page_alloc;
+#endif
 
     /* Scheduling. */
     int              shutdown_code; /* code value from OS (if DOMF_shutdown) */



 

