Same as [1]previous post:
> This patch changes the heap structure to include a per-NODE bucket
> where
> memory from each ZONE is deposited. When the heap is initialized, on
> insertion, we determine to which node each page belong and add it to
> appropriate place. Using reservation pages, we mark any non-MAX_ORDER
> aligned boundaries which would otherwise allow the allocator to
> coalesce chucks across two nodes. New heap allocator functions now
> take a cpu parameter which will be used to determine which node bucket
> will be used to satisfy the request.
Update includes some optimization in alloc_heap_pages():
1. Move num_online_nodes() out of in-loop target node calc.
2. Move target node calculation after inner loop; only calc next node
when needed.
3. Check if target node can support request via checking avail array
skipping a node if it can't satisfy the request.
4. Replacing modulo to remove the div from loop when wrapping while
picking the next node.
[1] http://lists.xensource.com/archives/html/xen-devel/2006-07/msg00543.html
--
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253 T/L: 678-9253
ryanh@xxxxxxxxxx
diffstat output:
common/page_alloc.c | 210 +++++++++++++++++++++++++++++++++++++++++-----------
include/xen/mm.h | 7 +
2 files changed, 172 insertions(+), 45 deletions(-)
Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
---
diff -r d10f60fbc760 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c Mon Jul 31 10:06:12 2006 -0500
+++ b/xen/common/page_alloc.c Mon Jul 31 10:29:51 2006 -0500
@@ -4,6 +4,7 @@
* Simple buddy heap allocator for Xen.
*
* Copyright (c) 2002-2004 K A Fraser
+ * Copyright (c) 2006 IBM Ryan Harper <ryanh@xxxxxxxxxx>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -33,6 +34,7 @@
#include <xen/shadow.h>
#include <xen/domain_page.h>
#include <xen/keyhandler.h>
+#include <asm/numa.h>
#include <asm/page.h>
/*
@@ -247,22 +249,23 @@ unsigned long alloc_boot_pages(unsigned
#define pfn_dom_zone_type(_pfn) \
(((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM)
-static struct list_head heap[NR_ZONES][MAX_ORDER+1];
-
-static unsigned long avail[NR_ZONES];
+static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1];
+
+static unsigned long avail[NR_ZONES][MAX_NUMNODES];
static DEFINE_SPINLOCK(heap_lock);
void end_boot_allocator(void)
{
- unsigned long i, j;
+ unsigned long i, j, k;
int curr_free = 0, next_free = 0;
memset(avail, 0, sizeof(avail));
for ( i = 0; i < NR_ZONES; i++ )
- for ( j = 0; j <= MAX_ORDER; j++ )
- INIT_LIST_HEAD(&heap[i][j]);
+ for ( j = 0; j < MAX_NUMNODES; j++ )
+ for ( k = 0; k <= MAX_ORDER; k++ )
+ INIT_LIST_HEAD(&heap[i][j][k]);
/* Pages that are free now go to the domain sub-allocator. */
for ( i = 0; i < max_page; i++ )
@@ -272,29 +275,59 @@ void end_boot_allocator(void)
if ( next_free )
map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
if ( curr_free )
- free_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 0);
- }
-}
-
-/* Hand the specified arbitrary page range to the specified heap zone. */
+ init_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 1);
+ }
+}
+
+/*
+ * Hand the specified arbitrary page range to the specified heap zone
+ * checking the node_id of the previous page. If they differ and the
+ * latter is not on a MAX_ORDER boundary, then we reserve the page by
+ * not freeing it to the buddy allocator.
+ */
+#define MAX_ORDER_ALIGNED (1UL << (MAX_ORDER))
void init_heap_pages(
unsigned int zone, struct page_info *pg, unsigned long nr_pages)
{
+ unsigned int nid_curr,nid_prev;
unsigned long i;
ASSERT(zone < NR_ZONES);
+ if ( likely(page_to_mfn(pg) != 0) )
+ nid_prev = phys_to_nid(page_to_maddr(pg-1));
+ else
+ nid_prev = phys_to_nid(page_to_maddr(pg));
+
for ( i = 0; i < nr_pages; i++ )
- free_heap_pages(zone, pg+i, 0);
-}
-
+ {
+ nid_curr = phys_to_nid(page_to_maddr(pg+i));
+
+ /*
+ * free pages of the same node, or if they differ, but are on a
+ * MAX_ORDER alignement boundary (which already get reserved)
+ */
+ if ( (nid_curr == nid_prev) || (page_to_maddr(pg+i) &
+ MAX_ORDER_ALIGNED) )
+ free_heap_pages(zone, pg+i, 0);
+ else
+ printk("Reserving non-aligned node boundary @ mfn %lu\n",
+ page_to_mfn(pg+i));
+
+ nid_prev = nid_curr;
+ }
+}
/* Allocate 2^@order contiguous pages. */
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order)
-{
- int i;
+struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu,
+ unsigned int order)
+{
+ unsigned int i,j, node = cpu_to_node[cpu], num_nodes = num_online_nodes();
+ unsigned int request = (1UL << order);
struct page_info *pg;
+ ASSERT(node >= 0);
+ ASSERT(node < num_nodes);
ASSERT(zone < NR_ZONES);
if ( unlikely(order > MAX_ORDER) )
@@ -302,29 +335,46 @@ struct page_info *alloc_heap_pages(unsig
spin_lock(&heap_lock);
- /* Find smallest order which can satisfy the request. */
- for ( i = order; i <= MAX_ORDER; i++ )
- if ( !list_empty(&heap[zone][i]) )
- goto found;
+ /* start with requested node, but exhaust all node memory
+ * in requested zone before failing, only calc new node
+ * value if we fail to find memory in target node, this avoids
+ * needless computation on fast-path */
+ for ( i = 0; i < num_nodes; i++ )
+ {
+ /* check if target node can support the allocation */
+ if ( avail[zone][node] >= request )
+ {
+ /* Find smallest order which can satisfy the request. */
+ for ( j = order; j <= MAX_ORDER; j++ )
+ {
+ if ( !list_empty(&heap[zone][node][j]) )
+ goto found;
+ }
+ }
+ /* pick next node, wrapping around if needed */
+ if ( ++node == num_nodes )
+ node = 0;
+ }
/* No suitable memory blocks. Fail the request. */
spin_unlock(&heap_lock);
return NULL;
found:
- pg = list_entry(heap[zone][i].next, struct page_info, list);
+ pg = list_entry(heap[zone][node][j].next, struct page_info, list);
list_del(&pg->list);
/* We may have to halve the chunk a number of times. */
- while ( i != order )
- {
- PFN_ORDER(pg) = --i;
- list_add_tail(&pg->list, &heap[zone][i]);
- pg += 1 << i;
+ while ( j != order )
+ {
+ PFN_ORDER(pg) = --j;
+ list_add_tail(&pg->list, &heap[zone][node][j]);
+ pg += 1 << j;
}
- map_alloc(page_to_mfn(pg), 1 << order);
- avail[zone] -= 1 << order;
+ map_alloc(page_to_mfn(pg), request);
+ ASSERT(avail[zone][node] >= request);
+ avail[zone][node] -= request;
spin_unlock(&heap_lock);
@@ -337,14 +387,17 @@ void free_heap_pages(
unsigned int zone, struct page_info *pg, unsigned int order)
{
unsigned long mask;
+ int node = phys_to_nid(page_to_maddr(pg));
ASSERT(zone < NR_ZONES);
ASSERT(order <= MAX_ORDER);
+ ASSERT(node >= 0);
+ ASSERT(node < num_online_nodes());
spin_lock(&heap_lock);
map_free(page_to_mfn(pg), 1 << order);
- avail[zone] += 1 << order;
+ avail[zone][node] += 1 << order;
/* Merge chunks as far as possible. */
while ( order < MAX_ORDER )
@@ -370,10 +423,13 @@ void free_heap_pages(
}
order++;
+
+ /* after merging, pg should be in the same node */
+ ASSERT(phys_to_nid(page_to_maddr(pg)) == node );
}
PFN_ORDER(pg) = order;
- list_add_tail(&pg->list, &heap[zone][order]);
+ list_add_tail(&pg->list, &heap[zone][node][order]);
spin_unlock(&heap_lock);
}
@@ -466,7 +522,7 @@ void *alloc_xenheap_pages(unsigned int o
int i;
local_irq_save(flags);
- pg = alloc_heap_pages(MEMZONE_XEN, order);
+ pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order);
local_irq_restore(flags);
if ( unlikely(pg == NULL) )
@@ -580,8 +636,9 @@ int assign_pages(
}
-struct page_info *alloc_domheap_pages(
- struct domain *d, unsigned int order, unsigned int memflags)
+struct page_info *__alloc_domheap_pages(
+ struct domain *d, unsigned int cpu, unsigned int order,
+ unsigned int memflags)
{
struct page_info *pg = NULL;
cpumask_t mask;
@@ -591,17 +648,17 @@ struct page_info *alloc_domheap_pages(
if ( !(memflags & MEMF_dma) )
{
- pg = alloc_heap_pages(MEMZONE_DOM, order);
+ pg = alloc_heap_pages(MEMZONE_DOM, cpu, order);
/* Failure? Then check if we can fall back to the DMA pool. */
if ( unlikely(pg == NULL) &&
((order > MAX_ORDER) ||
- (avail[MEMZONE_DMADOM] <
+ (avail_heap_pages(MEMZONE_DMADOM,-1) <
(lowmem_emergency_pool_pages + (1UL << order)))) )
return NULL;
}
if ( pg == NULL )
- if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL )
+ if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL )
return NULL;
mask = pg->u.free.cpumask;
@@ -638,6 +695,13 @@ struct page_info *alloc_domheap_pages(
}
return pg;
+}
+
+inline struct page_info *alloc_domheap_pages(
+ struct domain *d, unsigned int order, unsigned int flags)
+{
+ return __alloc_domheap_pages(d, smp_processor_id(), order, flags);
+
}
@@ -714,13 +778,27 @@ void free_domheap_pages(struct page_info
}
+unsigned long avail_heap_pages(int zone, int node)
+{
+ int i,j, num_nodes = num_online_nodes();
+ unsigned long free_pages = 0;
+
+ for (i=0; i<NR_ZONES; i++)
+ if ( (zone == -1) || (zone == i) )
+ for (j=0; j < num_nodes; j++)
+ if ( (node == -1) || (node == j) )
+ free_pages += avail[i][j];
+
+ return free_pages;
+}
+
unsigned long avail_domheap_pages(void)
{
unsigned long avail_nrm, avail_dma;
-
- avail_nrm = avail[MEMZONE_DOM];
-
- avail_dma = avail[MEMZONE_DMADOM];
+
+ avail_nrm = avail_heap_pages(MEMZONE_DOM,-1);
+
+ avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1);
if ( avail_dma > lowmem_emergency_pool_pages )
avail_dma -= lowmem_emergency_pool_pages;
else
@@ -729,6 +807,10 @@ unsigned long avail_domheap_pages(void)
return avail_nrm + avail_dma;
}
+unsigned long avail_nodeheap_pages(int node)
+{
+ return avail_heap_pages(-1, node);
+}
static void pagealloc_keyhandler(unsigned char key)
{
@@ -736,9 +818,9 @@ static void pagealloc_keyhandler(unsigne
printk(" Xen heap: %lukB free\n"
" DMA heap: %lukB free\n"
" Dom heap: %lukB free\n",
- avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
- avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
- avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
+ avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10),
+ avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10),
+ avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10));
}
@@ -805,6 +887,46 @@ unsigned long avail_scrub_pages(void)
{
return scrub_pages;
}
+
+static unsigned long count_bucket(struct list_head* l, int order)
+{
+ unsigned long total_pages = 0;
+ int pages = 1 << order;
+ struct page_info *pg;
+
+ list_for_each_entry(pg, l, list)
+ total_pages += pages;
+
+ return total_pages;
+}
+
+static void dump_heap(unsigned char key)
+{
+ s_time_t now = NOW();
+ int i,j,k;
+ unsigned long total;
+
+ printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
+ (u32)(now>>32), (u32)now);
+
+ for (i=0; i<NR_ZONES; i++ )
+ for (j=0;j<MAX_NUMNODES;j++)
+ for (k=0;k<=MAX_ORDER;k++)
+ if ( !list_empty(&heap[i][j][k]) )
+ {
+ total = count_bucket(&heap[i][j][k], k);
+ printk("heap[%d][%d][%d]-> %lu pages\n",
+ i, j, k, total);
+ }
+}
+
+static __init int register_heap_trigger(void)
+{
+ register_keyhandler('H', dump_heap, "dump heap info");
+ return 0;
+}
+__initcall(register_heap_trigger);
+
static __init int page_scrub_init(void)
{
diff -r d10f60fbc760 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h Mon Jul 31 10:06:12 2006 -0500
+++ b/xen/include/xen/mm.h Mon Jul 31 10:29:51 2006 -0500
@@ -45,7 +45,8 @@ void end_boot_allocator(void);
/* Generic allocator. These functions are *not* interrupt-safe. */
void init_heap_pages(
unsigned int zone, struct page_info *pg, unsigned long nr_pages);
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
+struct page_info *alloc_heap_pages(
+ unsigned int zone, unsigned int cpu, unsigned int order);
void free_heap_pages(
unsigned int zone, struct page_info *pg, unsigned int order);
void scrub_heap_pages(void);
@@ -61,8 +62,12 @@ void init_domheap_pages(paddr_t ps, padd
void init_domheap_pages(paddr_t ps, paddr_t pe);
struct page_info *alloc_domheap_pages(
struct domain *d, unsigned int order, unsigned int memflags);
+struct page_info *__alloc_domheap_pages(
+ struct domain *d, unsigned int cpu, unsigned int order,
+ unsigned int memflags);
void free_domheap_pages(struct page_info *pg, unsigned int order);
unsigned long avail_domheap_pages(void);
+unsigned long avail_heap_pages(int zone, int node);
#define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
#define free_domheap_page(p) (free_domheap_pages(p,0))
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|