[Xen-changelog] [xen-unstable] numa: Attempt more efficient NUMA allocation in hypervisor by default.

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] numa: Attempt more efficient NUMA allocation in hypervisor by default.
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Thu, 05 Aug 2010 01:35:25 -0700
Delivery-date: Thu, 05 Aug 2010 01:36:55 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1280932528 -3600
# Node ID 581ebaa7e2da17c23a2dd890943572837a02b29f
# Parent  39448a99227b61abb463c91e7e7c93763ddb3dce
numa: Attempt more efficient NUMA allocation in hypervisor by default.

 1. Try to allocate from nodes containing CPUs on which the guest can be
 scheduled.
 2. Remember which node we allocated from last, and round-robin
 allocations among the above-mentioned nodes.

Signed-off-by: Keir Fraser <keir.fraser@xxxxxxxxxx>
---
 xen/common/domain.c     |   29 +++++++++++++++++++
 xen/common/memory.c     |    9 ++----
 xen/common/page_alloc.c |   72 ++++++++++++++++++++++++++++++++----------------
 xen/common/schedule.c   |    3 ++
 xen/include/xen/sched.h |    9 ++++++
 5 files changed, 93 insertions(+), 29 deletions(-)
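
For orientation before reading the diff: the patch keeps a per-domain
node_affinity mask in sync with the union of the vcpus' CPU-affinity masks,
and a last_alloc_node field that the page allocator uses to rotate
allocations among those nodes.  The fragment below is a minimal,
self-contained sketch of that node-selection policy only.  It is not the
hypervisor code: the 64-bit bitmaps, the fake_domain structure and the
pick_start_node()/fake_first_node()/fake_next_node() helpers are invented,
simplified stand-ins for Xen's nodemask_t/cpumask_t API, and in the real
patch alloc_heap_pages() additionally falls back to the remaining online
nodes (and honours MEMF_exact_node) once the affinity nodes are exhausted.

#include <stdint.h>
#include <stdio.h>

#define MAX_FAKE_NODES 64

typedef uint64_t fake_nodemask_t;       /* bit n set => node n is a candidate */

struct fake_domain {
    fake_nodemask_t node_affinity;      /* nodes holding CPUs the guest may run on */
    unsigned int last_alloc_node;       /* node the previous allocation came from */
};

/* Lowest set bit, or MAX_FAKE_NODES if the mask is empty. */
static unsigned int fake_first_node(fake_nodemask_t mask)
{
    for ( unsigned int n = 0; n < MAX_FAKE_NODES; n++ )
        if ( mask & (1ULL << n) )
            return n;
    return MAX_FAKE_NODES;
}

/* Next set bit strictly after @node, or MAX_FAKE_NODES if there is none. */
static unsigned int fake_next_node(unsigned int node, fake_nodemask_t mask)
{
    for ( unsigned int n = node + 1; n < MAX_FAKE_NODES; n++ )
        if ( mask & (1ULL << n) )
            return n;
    return MAX_FAKE_NODES;
}

/* Pick the node to try first: round-robin within the domain's affinity set. */
static unsigned int pick_start_node(struct fake_domain *d, fake_nodemask_t online)
{
    fake_nodemask_t candidates = d->node_affinity ? d->node_affinity : online;
    unsigned int node = fake_next_node(d->last_alloc_node, candidates);

    if ( node >= MAX_FAKE_NODES )       /* wrap around to the first candidate */
        node = fake_first_node(candidates);
    if ( node >= MAX_FAKE_NODES )       /* empty mask: arbitrary last resort */
        node = 0;

    d->last_alloc_node = node;          /* remembered for the next allocation */
    return node;
}

int main(void)
{
    fake_nodemask_t online = 0xF;                     /* nodes 0-3 online */
    struct fake_domain d = { .node_affinity = 0xA,    /* vcpus pinned to nodes 1 and 3 */
                             .last_alloc_node = 1 };

    for ( int i = 0; i < 4; i++ )
        printf("allocation %d starts on node %u\n", i, pick_start_node(&d, online));
    /* Prints nodes 3, 1, 3, 1: allocations round-robin among the affinity nodes. */
    return 0;
}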

diff -r 39448a99227b -r 581ebaa7e2da xen/common/domain.c
--- a/xen/common/domain.c       Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/domain.c       Wed Aug 04 15:35:28 2010 +0100
@@ -191,6 +191,8 @@ struct vcpu *alloc_vcpu(
     /* Must be called after making new vcpu visible to for_each_vcpu(). */
     vcpu_check_shutdown(v);
 
+    domain_update_node_affinity(d);
+
     return v;
 }
 
@@ -234,6 +236,8 @@ struct domain *domain_create(
     spin_lock_init(&d->hypercall_deadlock_mutex);
     INIT_PAGE_LIST_HEAD(&d->page_list);
     INIT_PAGE_LIST_HEAD(&d->xenpage_list);
+
+    spin_lock_init(&d->node_affinity_lock);
 
     spin_lock_init(&d->shutdown_lock);
     d->shutdown_code = -1;
@@ -338,6 +342,31 @@ struct domain *domain_create(
     xfree(d->pirq_to_evtchn);
     free_domain_struct(d);
     return NULL;
+}
+
+
+void domain_update_node_affinity(struct domain *d)
+{
+    cpumask_t cpumask = CPU_MASK_NONE;
+    nodemask_t nodemask = NODE_MASK_NONE;
+    struct vcpu *v;
+    unsigned int node;
+
+    spin_lock(&d->node_affinity_lock);
+
+    for_each_vcpu ( d, v )
+        cpus_or(cpumask, cpumask, v->cpu_affinity);
+
+    for_each_online_node ( node )
+    {
+        if ( cpus_intersects(node_to_cpumask(node), cpumask) )
+            node_set(node, nodemask);
+        else
+            node_clear(node, nodemask);
+    }
+
+    d->node_affinity = nodemask;
+    spin_unlock(&d->node_affinity_lock);
 }
 
 
diff -r 39448a99227b -r 581ebaa7e2da xen/common/memory.c
--- a/xen/common/memory.c       Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/memory.c       Wed Aug 04 15:35:28 2010 +0100
@@ -259,7 +259,7 @@ static long memory_exchange(XEN_GUEST_HA
     unsigned long in_chunk_order, out_chunk_order;
     xen_pfn_t     gpfn, gmfn, mfn;
     unsigned long i, j, k;
-    unsigned int  node, memflags = 0;
+    unsigned int  memflags = 0;
     long          rc = 0;
     struct domain *d;
     struct page_info *page;
@@ -324,10 +324,7 @@ static long memory_exchange(XEN_GUEST_HA
         d,
         XENMEMF_get_address_bits(exch.out.mem_flags) ? :
         (BITS_PER_LONG+PAGE_SHIFT)));
-    node = XENMEMF_get_node(exch.out.mem_flags);
-    if ( node == NUMA_NO_NODE )
-        node = domain_to_node(d);
-    memflags |= MEMF_node(node);
+    memflags |= MEMF_node(XENMEMF_get_node(exch.out.mem_flags));
 
     for ( i = (exch.nr_exchanged >> in_chunk_order);
           i < (exch.in.nr_extents >> in_chunk_order);
@@ -545,7 +542,7 @@ long do_memory_op(unsigned long cmd, XEN
         }
 
         args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags));
-        if (reservation.mem_flags & XENMEMF_exact_node_request)
+        if ( reservation.mem_flags & XENMEMF_exact_node_request )
             args.memflags |= MEMF_exact_node;
 
         if ( op == XENMEM_populate_physmap
diff -r 39448a99227b -r 581ebaa7e2da xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/page_alloc.c   Wed Aug 04 15:35:28 2010 +0100
@@ -295,20 +295,29 @@ static unsigned long init_node_heap(int 
 /* Allocate 2^@order contiguous pages. */
 static struct page_info *alloc_heap_pages(
     unsigned int zone_lo, unsigned int zone_hi,
-    unsigned int node, unsigned int order, unsigned int memflags)
-{
-    unsigned int i, j, zone = 0;
-    unsigned int num_nodes = num_online_nodes();
+    unsigned int order, unsigned int memflags,
+    struct domain *d)
+{
+    unsigned int first_node, i, j, zone = 0, nodemask_retry = 0;
+    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1);
     unsigned long request = 1UL << order;
-    bool_t exact_node_request = !!(memflags & MEMF_exact_node);
     cpumask_t extra_cpus_mask, mask;
     struct page_info *pg;
+    nodemask_t nodemask = (d != NULL) ? d->node_affinity : node_online_map;
 
     if ( node == NUMA_NO_NODE )
     {
-        node = cpu_to_node(smp_processor_id());
-        exact_node_request = 0;
-    }
+        memflags &= ~MEMF_exact_node;
+        if ( d != NULL )
+        {
+            node = next_node(d->last_alloc_node, nodemask);
+            if ( node >= MAX_NUMNODES )
+                node = first_node(nodemask);
+        }
+        if ( node >= MAX_NUMNODES )
+            node = cpu_to_node(smp_processor_id());
+    }
+    first_node = node;
 
     ASSERT(node >= 0);
     ASSERT(zone_lo <= zone_hi);
@@ -335,7 +344,7 @@ static struct page_info *alloc_heap_page
      * zone before failing, only calc new node value if we fail to find memory 
      * in target node, this avoids needless computation on fast-path.
      */
-    for ( i = 0; i < num_nodes; i++ )
+    for ( ; ; )
     {
         zone = zone_hi;
         do {
@@ -349,18 +358,35 @@ static struct page_info *alloc_heap_page
                     goto found;
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
-        if ( exact_node_request )
+        if ( memflags & MEMF_exact_node )
             goto not_found;
 
-        /* Pick next node, wrapping around if needed. */
-        node = next_node(node, node_online_map);
-        if (node == MAX_NUMNODES)
-            node = first_node(node_online_map);
+        /* Pick next node. */
+        if ( !node_isset(node, nodemask) )
+        {
+            /* Very first node may be caller-specified and outside nodemask. */
+            ASSERT(!nodemask_retry);
+            first_node = node = first_node(nodemask);
+            if ( node < MAX_NUMNODES )
+                continue;
+        }
+        else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
+            node = first_node(nodemask);
+        if ( node == first_node )
+        {
+            /* When we have tried all in nodemask, we fall back to others. */
+            if ( nodemask_retry++ )
+                goto not_found;
+            nodes_andnot(nodemask, node_online_map, nodemask);
+            first_node = node = first_node(nodemask);
+            if ( node >= MAX_NUMNODES )
+                goto not_found;
+        }
     }
 
  try_tmem:
     /* Try to free memory from tmem */
-    if ( (pg = tmem_relinquish_pages(order,memflags)) != NULL )
+    if ( (pg = tmem_relinquish_pages(order, memflags)) != NULL )
     {
         /* reassigning an already allocated anonymous heap page */
         spin_unlock(&heap_lock);
@@ -385,6 +411,9 @@ static struct page_info *alloc_heap_page
     avail[node][zone] -= request;
     total_avail_pages -= request;
     ASSERT(total_avail_pages >= 0);
+
+    if ( d != NULL )
+        d->last_alloc_node = node;
 
     spin_unlock(&heap_lock);
 
@@ -1010,7 +1039,7 @@ void *alloc_xenheap_pages(unsigned int o
     ASSERT(!in_irq());
 
     pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
-        cpu_to_node(smp_processor_id()), order, memflags);
+                          order, memflags, NULL);
     if ( unlikely(pg == NULL) )
         return NULL;
 
@@ -1153,24 +1182,21 @@ struct page_info *alloc_domheap_pages(
 {
     struct page_info *pg = NULL;
     unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
-    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;
+    unsigned int dma_zone;
 
     ASSERT(!in_irq());
-
-    if ( (node == NUMA_NO_NODE) && (d != NULL) )
-        node = domain_to_node(d);
 
     bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
     if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
         return NULL;
 
     if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
-        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
+        pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
 
     if ( (pg == NULL) &&
          ((memflags & MEMF_no_dma) ||
-          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
-                                  node, order, memflags)) == NULL)) )
+          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
+                                  memflags, d)) == NULL)) )
          return NULL;
 
     if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
diff -r 39448a99227b -r 581ebaa7e2da xen/common/schedule.c
--- a/xen/common/schedule.c     Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/schedule.c     Wed Aug 04 15:35:28 2010 +0100
@@ -270,6 +270,7 @@ int sched_move_domain(struct domain *d, 
         SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
 
         cpus_setall(v->cpu_affinity);
+        domain_update_node_affinity(d);
         v->processor = new_p;
         v->sched_priv = vcpu_priv[v->vcpu_id];
         evtchn_move_pirqs(v);
@@ -477,6 +478,7 @@ int cpu_disable_scheduler(unsigned int c
                 printk("Breaking vcpu affinity for domain %d vcpu %d\n",
                         v->domain->domain_id, v->vcpu_id);
                 cpus_setall(v->cpu_affinity);
+                domain_update_node_affinity(d);
             }
 
             if ( v->processor == cpu )
@@ -519,6 +521,7 @@ int vcpu_set_affinity(struct vcpu *v, cp
 
     old_affinity = v->cpu_affinity;
     v->cpu_affinity = *affinity;
+    domain_update_node_affinity(v->domain);
     *affinity = old_affinity;
     if ( !cpu_isset(v->processor, v->cpu_affinity) )
         set_bit(_VPF_migrating, &v->pause_flags);
diff -r 39448a99227b -r 581ebaa7e2da xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/include/xen/sched.h   Wed Aug 04 15:35:28 2010 +0100
@@ -23,6 +23,8 @@
 #include <xen/mm.h>
 #include <xen/tasklet.h>
 #include <public/mem_event.h>
+#include <xen/cpumask.h>
+#include <xen/nodemask.h>
 
 #ifdef CONFIG_COMPAT
 #include <compat/vcpu.h>
@@ -326,6 +328,11 @@ struct domain
 
     /* Memory paging support */
     struct mem_event_domain mem_event;
+
+    /* Currently computed from union of all vcpu cpu-affinity masks. */
+    nodemask_t node_affinity;
+    unsigned int last_alloc_node;
+    spinlock_t node_affinity_lock;
 };
 
 struct domain_setup_info
@@ -393,6 +400,8 @@ static inline void get_knownalive_domain
     ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED));
 }
 
+void domain_update_node_affinity(struct domain *d);
+
 struct domain *domain_create(
     domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
  /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
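
A second, equally simplified fragment models what the new
domain_update_node_affinity() computes: the union of all vcpu CPU-affinity
masks is projected onto the set of NUMA nodes owning at least one of those
CPUs.  The bitmaps, the example cpu-to-node table and compute_node_affinity()
are invented for illustration only; the real function walks
for_each_vcpu()/for_each_online_node() under d->node_affinity_lock, as shown
in the domain.c hunk above.

#include <stdint.h>
#include <stdio.h>

#define NR_FAKE_CPUS 8

/* Invented example topology: CPUs 0-3 on node 0, CPUs 4-7 on node 1. */
static const unsigned int fake_cpu_to_node[NR_FAKE_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };

/*
 * Union the per-vcpu CPU-affinity masks, then set a bit for every node
 * owning at least one CPU in that union.
 */
static uint64_t compute_node_affinity(const uint64_t *vcpu_cpu_affinity,
                                      unsigned int nr_vcpus)
{
    uint64_t cpumask = 0, nodemask = 0;

    for ( unsigned int v = 0; v < nr_vcpus; v++ )
        cpumask |= vcpu_cpu_affinity[v];

    for ( unsigned int cpu = 0; cpu < NR_FAKE_CPUS; cpu++ )
        if ( cpumask & (1ULL << cpu) )
            nodemask |= 1ULL << fake_cpu_to_node[cpu];

    return nodemask;
}

int main(void)
{
    /* Two vcpus, both pinned to CPUs 4 and 5, i.e. node 1 only. */
    uint64_t affinities[2] = { 0x30, 0x30 };

    printf("node_affinity = %#llx\n",
           (unsigned long long)compute_node_affinity(affinities, 2));
    /*
     * Prints 0x2: node 1 is the only preferred allocation target.  The
     * schedule.c hunks above recompute this whenever a vcpu's CPU affinity
     * changes (vcpu_set_affinity, sched_move_domain, cpu_disable_scheduler).
     */
    return 0;
}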

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
