WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 2/6] xen: NUMA-aware page allocator

To: xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-devel] [PATCH 2/6] xen: NUMA-aware page allocator
From: Ryan Harper <ryanh@xxxxxxxxxx>
Date: Tue, 11 Jul 2006 10:36:22 -0500
Delivery-date: Tue, 11 Jul 2006 08:39:05 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mutt/1.5.6+20040907i
This patch changes the heap structure to include a per-NODE bucket where
memory from each ZONE is deposited.  When the heap is initialized, on
insertion, we determine to which node each page belong and add it to
appropriate place.  Using reservation pages, we mark any non-MAX_ORDER
aligned boundaries which would otherwise allow the allocator to
coalesce chucks across two nodes.  New heap allocator functions now
take a cpu parameter which will be used to determine which node bucket
will be used to satisfy the request.

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@xxxxxxxxxx


diffstat output:
 common/page_alloc.c |  199 ++++++++++++++++++++++++++++++++++++++++------------
 include/xen/mm.h    |    7 +
 2 files changed, 162 insertions(+), 44 deletions(-)

Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
---
diff -r 52ceac03bf85 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Mon Jul  3 13:53:18 2006
+++ b/xen/common/page_alloc.c   Mon Jul  3 09:42:41 2006
@@ -4,6 +4,7 @@
  * Simple buddy heap allocator for Xen.
  * 
  * Copyright (c) 2002-2004 K A Fraser
+ * Copyright (c) 2006 IBM Ryan Harper <ryanh@xxxxxxxxxx>
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -33,6 +34,7 @@
 #include <xen/shadow.h>
 #include <xen/domain_page.h>
 #include <xen/keyhandler.h>
+#include <asm/numa.h>
 #include <asm/page.h>
 
 /*
@@ -247,22 +249,23 @@
 #define pfn_dom_zone_type(_pfn)                                 \
     (((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM)
 
-static struct list_head heap[NR_ZONES][MAX_ORDER+1];
-
-static unsigned long avail[NR_ZONES];
+static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1];
+
+static unsigned long avail[NR_ZONES][MAX_NUMNODES];
 
 static DEFINE_SPINLOCK(heap_lock);
 
 void end_boot_allocator(void)
 {
-    unsigned long i, j;
+    unsigned long i, j, k;
     int curr_free = 0, next_free = 0;
 
     memset(avail, 0, sizeof(avail));
 
     for ( i = 0; i < NR_ZONES; i++ )
-        for ( j = 0; j <= MAX_ORDER; j++ )
-            INIT_LIST_HEAD(&heap[i][j]);
+        for ( j = 0; j < MAX_NUMNODES; j++ )
+            for ( k = 0; k <= MAX_ORDER; k++ )
+                INIT_LIST_HEAD(&heap[i][j][k]);
 
     /* Pages that are free now go to the domain sub-allocator. */
     for ( i = 0; i < max_page; i++ )
@@ -272,29 +275,58 @@
         if ( next_free )
             map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
         if ( curr_free )
-            free_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 0);
-    }
-}
-
-/* Hand the specified arbitrary page range to the specified heap zone. */
+            init_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 1);
+    }
+}
+
+/* 
+ * Hand the specified arbitrary page range to the specified heap zone
+ * checking the node_id of the previous page.  If they differ and the
+ * latter is not on a MAX_ORDER boundary, then we reserve the page by
+ * not freeing it to the buddy allocator.
+ */
+#define MAX_ORDER_ALIGNED (1UL << (MAX_ORDER))
 void init_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned long nr_pages)
 {
+    unsigned int nid_curr,nid_prev;
     unsigned long i;
 
     ASSERT(zone < NR_ZONES);
 
+    if ( likely(page_to_mfn(pg) != 0) )
+        nid_prev = phys_to_nid(page_to_maddr(pg-1));
+    else
+        nid_prev = phys_to_nid(page_to_maddr(pg));
+
     for ( i = 0; i < nr_pages; i++ )
-        free_heap_pages(zone, pg+i, 0);
-}
-
+    {
+        nid_curr = phys_to_nid(page_to_maddr(pg+i));
+
+        /*
+         * free pages of the same node, or if they differ, but are on a
+         * MAX_ORDER alignement boundary (which already get reserved)
+         */
+         if ( (nid_curr == nid_prev) || (page_to_maddr(pg+i) &
+                                         MAX_ORDER_ALIGNED) )
+             free_heap_pages(zone, pg+i, 0);
+         else
+             printk("Reserving non-aligned node boundary @ mfn %lu\n",
+                    page_to_mfn(pg+i));
+
+        nid_prev = nid_curr;
+    }
+}
 
 /* Allocate 2^@order contiguous pages. */
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order)
-{
-    int i;
+struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu,
+                                   unsigned int order)
+{
+    int i,j, node;
     struct page_info *pg;
 
+    ASSERT(cpu_to_node[cpu] >= 0);
+    ASSERT(cpu_to_node[cpu] < num_online_nodes());
     ASSERT(zone < NR_ZONES);
 
     if ( unlikely(order > MAX_ORDER) )
@@ -302,29 +334,38 @@
 
     spin_lock(&heap_lock);
 
-    /* Find smallest order which can satisfy the request. */
-    for ( i = order; i <= MAX_ORDER; i++ )
-        if ( !list_empty(&heap[zone][i]) )
-            goto found;
+    /* start with requested node, but exhaust all node memory
+     * in requested zone before failing */
+    for ( i = 0; i < num_online_nodes(); i++ )
+    {
+        node = (cpu_to_node[cpu]+i) % num_online_nodes();
+        /* Find smallest order which can satisfy the request. */
+        for ( j = order; j <= MAX_ORDER; j++ )
+        {
+            if ( !list_empty(&heap[zone][node][j]) )
+                goto found;
+        }
+    }
 
     /* No suitable memory blocks. Fail the request. */
     spin_unlock(&heap_lock);
     return NULL;
 
  found: 
-    pg = list_entry(heap[zone][i].next, struct page_info, list);
+    pg = list_entry(heap[zone][node][j].next, struct page_info, list);
     list_del(&pg->list);
 
     /* We may have to halve the chunk a number of times. */
-    while ( i != order )
-    {
-        PFN_ORDER(pg) = --i;
-        list_add_tail(&pg->list, &heap[zone][i]);
-        pg += 1 << i;
+    while ( j != order )
+    {
+        PFN_ORDER(pg) = --j;
+        list_add_tail(&pg->list, &heap[zone][node][j]);
+        pg += 1 << j;
     }
     
     map_alloc(page_to_mfn(pg), 1 << order);
-    avail[zone] -= 1 << order;
+    ASSERT(avail[zone][node] >= (1 << order));
+    avail[zone][node] -= 1 << order;
 
     spin_unlock(&heap_lock);
 
@@ -337,14 +378,17 @@
     unsigned int zone, struct page_info *pg, unsigned int order)
 {
     unsigned long mask;
+    int node = phys_to_nid(page_to_maddr(pg));
 
     ASSERT(zone < NR_ZONES);
     ASSERT(order <= MAX_ORDER);
+    ASSERT(node >= 0);
+    ASSERT(node < num_online_nodes());
 
     spin_lock(&heap_lock);
 
     map_free(page_to_mfn(pg), 1 << order);
-    avail[zone] += 1 << order;
+    avail[zone][node] += 1 << order;
     
     /* Merge chunks as far as possible. */
     while ( order < MAX_ORDER )
@@ -370,10 +414,13 @@
         }
         
         order++;
+
+        /* after merging, pg should be in the same node */
+        ASSERT(phys_to_nid(page_to_maddr(pg)) == node );
     }
 
     PFN_ORDER(pg) = order;
-    list_add_tail(&pg->list, &heap[zone][order]);
+    list_add_tail(&pg->list, &heap[zone][node][order]);
 
     spin_unlock(&heap_lock);
 }
@@ -466,7 +513,7 @@
     int i;
 
     local_irq_save(flags);
-    pg = alloc_heap_pages(MEMZONE_XEN, order);
+    pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order);
     local_irq_restore(flags);
 
     if ( unlikely(pg == NULL) )
@@ -580,8 +627,9 @@
 }
 
 
-struct page_info *alloc_domheap_pages(
-    struct domain *d, unsigned int order, unsigned int memflags)
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, 
+    unsigned int memflags)
 {
     struct page_info *pg = NULL;
     cpumask_t mask;
@@ -591,17 +639,17 @@
 
     if ( !(memflags & MEMF_dma) )
     {
-        pg = alloc_heap_pages(MEMZONE_DOM, order);
+        pg = alloc_heap_pages(MEMZONE_DOM, cpu, order);
         /* Failure? Then check if we can fall back to the DMA pool. */
         if ( unlikely(pg == NULL) &&
              ((order > MAX_ORDER) ||
-              (avail[MEMZONE_DMADOM] <
+              (avail_heap_pages(MEMZONE_DMADOM,-1) <
                (lowmem_emergency_pool_pages + (1UL << order)))) )
             return NULL;
     }
 
     if ( pg == NULL )
-        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL )
+        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL )
             return NULL;
 
     mask = pg->u.free.cpumask;
@@ -638,6 +686,13 @@
     }
     
     return pg;
+}
+
+inline struct page_info *alloc_domheap_pages(
+    struct domain *d, unsigned int order, unsigned int flags)
+{
+    return __alloc_domheap_pages(d, smp_processor_id(), order, flags);
+
 }
 
 
@@ -714,13 +769,27 @@
 }
 
 
+unsigned long avail_heap_pages(int zone, int node)
+{
+    int i,j;
+    unsigned long free_pages = 0;
+   
+    for (i=0; i<NR_ZONES; i++)
+        if ( (zone == -1) || (zone == i) )
+            for (j=0; j<num_online_nodes(); j++)
+                if ( (node == -1) || (node == j) )
+                    free_pages += avail[i][j];            
+
+    return free_pages;
+}
+
 unsigned long avail_domheap_pages(void)
 {
     unsigned long avail_nrm, avail_dma;
-
-    avail_nrm = avail[MEMZONE_DOM];
-
-    avail_dma = avail[MEMZONE_DMADOM];
+    
+    avail_nrm = avail_heap_pages(MEMZONE_DOM,-1);
+
+    avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1);
     if ( avail_dma > lowmem_emergency_pool_pages )
         avail_dma -= lowmem_emergency_pool_pages;
     else
@@ -729,6 +798,10 @@
     return avail_nrm + avail_dma;
 }
 
+unsigned long avail_nodeheap_pages(int node)
+{
+    return avail_heap_pages(-1, node);
+}
 
 static void pagealloc_keyhandler(unsigned char key)
 {
@@ -736,9 +809,9 @@
     printk("    Xen heap: %lukB free\n"
            "    DMA heap: %lukB free\n"
            "    Dom heap: %lukB free\n",
-           avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
-           avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
-           avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
+           avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10), 
+           avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10), 
+           avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10));
 }
 
 
@@ -805,6 +878,46 @@
 {
     return scrub_pages;
 }
+
+static unsigned long count_bucket(struct list_head* l, int order)
+{
+    unsigned long total_pages = 0;
+    int pages = 1 << order;
+    struct page_info *pg;
+
+    list_for_each_entry(pg, l, list)
+        total_pages += pages;
+
+    return total_pages;
+}
+
+static void dump_heap(unsigned char key)
+{
+    s_time_t       now = NOW();
+    int i,j,k;
+    unsigned long total;
+
+    printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
+           (u32)(now>>32), (u32)now);
+
+    for (i=0; i<NR_ZONES; i++ )
+        for (j=0;j<MAX_NUMNODES;j++)
+            for (k=0;k<=MAX_ORDER;k++)
+                if ( !list_empty(&heap[i][j][k]) )
+                {
+                    total = count_bucket(&heap[i][j][k], k);
+                    printk("heap[%d][%d][%d]-> %lu pages\n",
+                            i, j, k, total);
+                }
+}
+
+static __init int register_heap_trigger(void)
+{
+    register_keyhandler('H', dump_heap, "dump heap info");
+    return 0;
+}
+__initcall(register_heap_trigger);
+
 
 static __init int page_scrub_init(void)
 {
diff -r 52ceac03bf85 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h      Mon Jul  3 13:53:18 2006
+++ b/xen/include/xen/mm.h      Mon Jul  3 09:42:41 2006
@@ -45,7 +45,8 @@
 /* Generic allocator. These functions are *not* interrupt-safe. */
 void init_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned long nr_pages);
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
+struct page_info *alloc_heap_pages(
+    unsigned int zone, unsigned int cpu, unsigned int order);
 void free_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned int order);
 void scrub_heap_pages(void);
@@ -61,8 +62,12 @@
 void init_domheap_pages(paddr_t ps, paddr_t pe);
 struct page_info *alloc_domheap_pages(
     struct domain *d, unsigned int order, unsigned int memflags);
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, 
+    unsigned int memflags);
 void free_domheap_pages(struct page_info *pg, unsigned int order);
 unsigned long avail_domheap_pages(void);
+unsigned long avail_heap_pages(int zone, int node);
 #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
 #define free_domheap_page(p)  (free_domheap_pages(p,0))
 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

<Prev in Thread] Current Thread [Next in Thread>