WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 1/2]: hvm: NUMA guest: allocate memory and pin cpus a

To: Keir Fraser <keir.fraser@xxxxxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-devel] [PATCH 1/2]: hvm: NUMA guest: allocate memory and pin cpus according to guestnodes number (resend)
From: Andre Przywara <andre.przywara@xxxxxxx>
Date: Fri, 11 Jul 2008 16:11:54 +0200
Cc:
Delivery-date: Fri, 11 Jul 2008 07:12:53 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Thunderbird 1.5.0.10 (X11/20070409)
This patch introduces a new config file option called guestnodes.
Depending on the specified number (which can be 0 (the default) to
return to current behavior) a set of suitable nodes (which have enough
memory and are the least used ones) is selected and memory allocation is
split evenly across these host nodes. CPU affinity is set accordingly.

Reworked to apply against staging 18036.

Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx>

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
diff -r f40c310dca31 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c        Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/libxc/xc_hvm_build.c        Fri Jul 11 15:53:59 2008 +0200
@@ -18,6 +18,8 @@
 #include "xc_e820.h"
 
 #include <xen/libelf.h>
+
+#include <asm/bitops.h>
 
 #define SUPERPAGE_PFN_SHIFT  9
 #define SUPERPAGE_NR_PFNS    (1UL << SUPERPAGE_PFN_SHIFT)
@@ -155,8 +157,173 @@
     return rc;
 }
 
+/* Population count ("Hamming weight"): return the number of set bits
+ * in value.  Used to count the nodes selected in a node bitmask. */
+static int hweight_long (unsigned long value)
+{
+int ret=0;
+
+    while (value>0)
+    {
+        if (value&1) ++ret;
+        value>>=1;
+    }
+    return ret;
+}
+
+/* Build one CPU bitmask per host NUMA node: (*nodemasks)[n] gets a bit
+ * set for every physical CPU belonging to node n.  On success returns
+ * the number of physical CPUs and the caller must free(*nodemasks);
+ * returns -1 on failure.
+ * NOTE(review): the uint64_t masks limit this to 64 physical CPUs. */
+static int get_nodemasks (int xc_handle, uint64_t **nodemasks)
+{
+#define MAX_CPU_ID 255
+    xc_physinfo_t physinfo;
+    xc_cpu_to_node_t *cpumap;
+    int nrcpus, i;
+
+    cpumap = malloc(sizeof(xc_cpu_to_node_t) * MAX_CPU_ID);
+    if ( cpumap == NULL )
+        return -1;
+    set_xen_guest_handle(physinfo.cpu_to_node, cpumap);
+
+    if ( xc_physinfo (xc_handle, &physinfo) != 0 )
+    {
+        free (cpumap);
+        return -1;
+    }
+    nrcpus = physinfo.threads_per_core * physinfo.cores_per_socket *
+             physinfo.nr_nodes;
+
+    /* calloc zero-initialises the per-node masks. */
+    *nodemasks = calloc (physinfo.nr_nodes, sizeof(uint64_t));
+    if ( *nodemasks == NULL )
+    {
+        free (cpumap);
+        return -1;
+    }
+    for ( i = 0; i < nrcpus; i++ )
+    {
+        /* 1ULL: a plain "1 << i" is a 32-bit int shift and is undefined
+         * for CPU ids >= 31, silently corrupting the 64-bit mask. */
+        (*nodemasks)[cpumap[i]] |= 1ULL << i;
+    }
+    free (cpumap);
+    return nrcpus;
+}
+
+/* Distribute the VCPUs to the given NUMA nodes.
+ * Use xc_vcpu_setaffinity to pin physical CPUs to the VCPUs.
+ */
+/* Distribute the domain's VCPUs across the host nodes selected in
+ * nodemask and pin each VCPU (xc_vcpu_setaffinity) to the physical
+ * CPUs of its node.  VCPUs are split as evenly as possible; when the
+ * count does not divide, earlier nodes receive one extra VCPU.
+ * Returns 0 on success, -1 on failure.
+ */
+static int setup_numa_affinity (int xc_handle, uint32_t dom,
+                                unsigned long nodemask)
+{
+    uint64_t *nodemasks, usemask = 0;
+    xc_dominfo_t dominfo;
+    int i;
+    int nrnodes,curnode,vcpusleft;
+
+    nrnodes = hweight_long (nodemask);
+    if ( nrnodes == 0 )
+        return 0; /* no node selected: nothing to pin, avoid div-by-zero */
+
+    if ( get_nodemasks (xc_handle, &nodemasks) < 0 )
+        return -1;
+
+    if (xc_domain_getinfo (xc_handle, dom, 1, &dominfo) != 1)
+    {
+        ERROR("Unable to get platform info.");
+        free (nodemasks);   /* don't leak the per-node masks on error */
+        return -1;
+    }
+    curnode = -1;
+    vcpusleft = 0;
+    for ( i = 0; i <= dominfo.max_vcpu_id; i++ )
+    {
+        if ( vcpusleft == 0 )
+        {
+            /* Current node's share exhausted: advance to the next node
+             * in nodemask and recompute its share of the VCPUs. */
+            vcpusleft = ( dominfo.max_vcpu_id + 1 ) / nrnodes;
+            if ( ++curnode < ( ( dominfo.max_vcpu_id + 1 ) % nrnodes ) )
+                vcpusleft++;
+            usemask = nodemasks[__ffs(nodemask)];
+            nodemask &= ~(1ULL<<__ffs(nodemask));
+        }
+        xc_vcpu_setaffinity (xc_handle, dom, i, usemask);
+        vcpusleft--;
+    }
+
+    free (nodemasks);
+    return 0;
+}
+
+/* Populate nr_pages of guest memory, taking host pages from the node
+ * encoded in memflags (XENMEMF_node(..), or 0 for no preference).
+ * Allocation proceeds in batches of at most 8MB (2048 4kB pages) so
+ * dom0 stays responsive, attempting 2MB superpage extents on aligned
+ * boundaries and falling back to 4kB extents.  *cur_pages indexes into
+ * page_array and is advanced past every page allocated.
+ * Returns 0 on success, nonzero on the first failed 4kB populate. */
+static int populate_on_node ( int xc_handle, uint32_t dom,
+                              unsigned long *cur_pages,
+                              unsigned long nr_pages,
+                              int memflags, xen_pfn_t* page_array)
+{
+    int rc = 0;
+    unsigned long i;
+
+    while ( (rc == 0) && (nr_pages > 0 ) )
+    {
+        /* Clip count to maximum 8MB extent. */
+        unsigned long count = nr_pages;
+        if ( count > 2048 )
+            count = 2048;
+
+        /* Clip partial superpage extents to superpage boundaries. */
+        if ( ((*cur_pages & (SUPERPAGE_NR_PFNS-1)) != 0) &&
+             (count > (-*cur_pages & (SUPERPAGE_NR_PFNS-1))) )
+            count = -*cur_pages & (SUPERPAGE_NR_PFNS-1); /* clip s.p. tail */
+        else if ( ((count & (SUPERPAGE_NR_PFNS-1)) != 0) &&
+                  (count > SUPERPAGE_NR_PFNS) )
+            count &= ~(SUPERPAGE_NR_PFNS - 1); /* clip non-s.p. tail */
+
+        /* Attempt to allocate superpage extents. */
+        if ( ((count | *cur_pages) & (SUPERPAGE_NR_PFNS - 1)) == 0 )
+        {
+            long done;
+            xen_pfn_t sp_extents[2048 >> SUPERPAGE_PFN_SHIFT];
+            struct xen_memory_reservation sp_req = {
+                .nr_extents   = count >> SUPERPAGE_PFN_SHIFT,
+                .extent_order = SUPERPAGE_PFN_SHIFT,
+                .mem_flags    = memflags,
+                .domid        = dom
+            };
+            set_xen_guest_handle(sp_req.extent_start, sp_extents);
+            for ( i = 0; i < sp_req.nr_extents; i++ )
+                sp_extents[i] = page_array[*cur_pages+(i<<SUPERPAGE_PFN_SHIFT)];
+
+            done = xc_memory_op(xc_handle, XENMEM_populate_physmap, &sp_req);
+            if ( done > 0 )
+            {
+                done <<= SUPERPAGE_PFN_SHIFT;
+                *cur_pages += done;
+                count -= done;
+                nr_pages -= done;
+            }
+        }
+
+        /* Fall back to 4kB extents for whatever the superpage attempt
+         * did not cover. */
+        if ( count != 0 )
+        {
+            rc = xc_domain_memory_populate_physmap(
+                xc_handle, dom, count, 0, memflags,
+                &page_array[*cur_pages]);
+            *cur_pages += count;
+            nr_pages -= count;
+        }
+    }
+    return rc;
+}
+
+/* Split the guest's memory evenly across the host nodes selected in
+ * nodemask and populate each share on its node via populate_on_node().
+ * The first node's share is reduced by the pages already allocated
+ * (*cur_pages, the below-1MB region); the last node absorbs any
+ * rounding remainder.  Returns 0 on success or the first error code. */
+static int setup_numa_mem ( int xc_handle, uint32_t dom,
+                            unsigned long *cur_pages, unsigned long nr_pages,
+                            unsigned nodemask, xen_pfn_t *page_array)
+{
+    int i, rc;
+    unsigned long cur_node_pages;
+    unsigned long pages_per_node;
+    int numanodes;
+
+    numanodes = hweight_long (nodemask);
+    if ( numanodes == 0 )
+        return 0; /* empty mask: nothing to place, avoid div-by-zero */
+
+    /* Round up to a 256-page (1MB) boundary before dividing so each
+     * node's share stays 1MB-aligned — presumably for superpage-friendly
+     * splits; TODO confirm intent. */
+    pages_per_node = ((nr_pages+0xFF)&(~0xFFUL))/numanodes;
+
+    for ( i = 0 ; i < numanodes ; i++ )
+    {
+        if ( i == numanodes - 1 )
+            cur_node_pages = nr_pages - i * pages_per_node;
+        else
+            cur_node_pages = pages_per_node;
+        /* Pages below 1MB were populated by the caller already. */
+        if ( i == 0 )
+            cur_node_pages -= *cur_pages;
+
+        rc = populate_on_node (xc_handle, dom, cur_pages, cur_node_pages,
+                               XENMEMF_node (__ffs(nodemask)), page_array);
+        if ( rc != 0 )
+            return rc;
+
+        /* 1U: "1 << 31" on a plain int would be undefined behaviour. */
+        nodemask &= ~(1U<<__ffs(nodemask));
+    }
+    return 0;
+}
+
 static int setup_guest(int xc_handle,
-                       uint32_t dom, int memsize,
+                       uint32_t dom, int memsize, unsigned long nodemask,
                        char *image, unsigned long image_size)
 {
     xen_pfn_t *page_array = NULL;
@@ -169,6 +336,7 @@
     struct elf_binary elf;
     uint64_t v_start, v_end;
     int rc;
+    unsigned int memflags;
     xen_capabilities_info_t caps;
 
     /* An HVM guest must be initialised with at least 2MB memory. */
@@ -217,59 +385,30 @@
      * We allocate pages in batches of no more than 8MB to ensure that
      * we can be preempted and hence dom0 remains responsive.
      */
+
+    if ( nodemask == 0 ) memflags = 0;
+        else memflags = XENMEMF_node (__ffs (nodemask));
+
     rc = xc_domain_memory_populate_physmap(
-        xc_handle, dom, 0xa0, 0, 0, &page_array[0x00]);
+        xc_handle, dom, 0xa0, 0, memflags, &page_array[0x00]);
     cur_pages = 0xc0;
-    while ( (rc == 0) && (nr_pages > cur_pages) )
-    {
-        /* Clip count to maximum 8MB extent. */
-        unsigned long count = nr_pages - cur_pages;
-        if ( count > 2048 )
-            count = 2048;
 
-        /* Clip partial superpage extents to superpage boundaries. */
-        if ( ((cur_pages & (SUPERPAGE_NR_PFNS-1)) != 0) &&
-             (count > (-cur_pages & (SUPERPAGE_NR_PFNS-1))) )
-            count = -cur_pages & (SUPERPAGE_NR_PFNS-1); /* clip s.p. tail */
-        else if ( ((count & (SUPERPAGE_NR_PFNS-1)) != 0) &&
-                  (count > SUPERPAGE_NR_PFNS) )
-            count &= ~(SUPERPAGE_NR_PFNS - 1); /* clip non-s.p. tail */
-
-        /* Attempt to allocate superpage extents. */
-        if ( ((count | cur_pages) & (SUPERPAGE_NR_PFNS - 1)) == 0 )
-        {
-            long done;
-            xen_pfn_t sp_extents[2048 >> SUPERPAGE_PFN_SHIFT];
-            struct xen_memory_reservation sp_req = {
-                .nr_extents   = count >> SUPERPAGE_PFN_SHIFT,
-                .extent_order = SUPERPAGE_PFN_SHIFT,
-                .domid        = dom
-            };
-            set_xen_guest_handle(sp_req.extent_start, sp_extents);
-            for ( i = 0; i < sp_req.nr_extents; i++ )
-                sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_PFN_SHIFT)];
-            done = xc_memory_op(xc_handle, XENMEM_populate_physmap, &sp_req);
-            if ( done > 0 )
-            {
-                done <<= SUPERPAGE_PFN_SHIFT;
-                cur_pages += done;
-                count -= done;
-            }
-        }
-
-        /* Fall back to 4kB extents. */
-        if ( count != 0 )
-        {
-            rc = xc_domain_memory_populate_physmap(
-                xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
-            cur_pages += count;
-        }
-    }
+    if ( hweight_long (nodemask) > 1 )
+        rc = setup_numa_mem (xc_handle, dom, &cur_pages, nr_pages,
+                             nodemask, page_array);
+    else
+        rc = populate_on_node (xc_handle, dom, &cur_pages, nr_pages - cur_pages,
+                               memflags, page_array);
 
     if ( rc != 0 )
     {
         PERROR("Could not allocate memory for HVM guest.\n");
         goto error_out;
+    }
+
+    if ( hweight_long (nodemask) > 1 )
+    {
+        setup_numa_affinity (xc_handle, dom, nodemask);
     }
 
     if ( loadelfimage(&elf, xc_handle, dom, page_array) != 0 )
@@ -364,6 +503,7 @@
 static int xc_hvm_build_internal(int xc_handle,
                                  uint32_t domid,
                                  int memsize,
+                                 unsigned long nodemask,
                                  char *image,
                                  unsigned long image_size)
 {
@@ -373,7 +513,7 @@
         return -1;
     }
 
-    return setup_guest(xc_handle, domid, memsize, image, image_size);
+    return setup_guest(xc_handle, domid, memsize, nodemask, image, image_size);
 }
 
 static inline int is_loadable_phdr(Elf32_Phdr *phdr)
@@ -388,6 +528,7 @@
 int xc_hvm_build(int xc_handle,
                  uint32_t domid,
                  int memsize,
+                 unsigned long nodemask,
                  const char *image_name)
 {
     char *image;
@@ -398,7 +539,8 @@
          ((image = xc_read_image(image_name, &image_size)) == NULL) )
         return -1;
 
-    sts = xc_hvm_build_internal(xc_handle, domid, memsize, image, image_size);
+    sts = xc_hvm_build_internal(xc_handle, domid, memsize, nodemask,
+                                image, image_size);
 
     free(image);
 
@@ -411,6 +553,7 @@
 int xc_hvm_build_mem(int xc_handle,
                      uint32_t domid,
                      int memsize,
+                     unsigned long nodemask,
                      const char *image_buffer,
                      unsigned long image_size)
 {
@@ -433,7 +576,7 @@
         return -1;
     }
 
-    sts = xc_hvm_build_internal(xc_handle, domid, memsize,
+    sts = xc_hvm_build_internal(xc_handle, domid, memsize, nodemask,
                                 img, img_len);
 
     /* xc_inflate_buffer may return the original buffer pointer (for
diff -r f40c310dca31 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/libxc/xenguest.h    Fri Jul 11 15:53:59 2008 +0200
@@ -128,11 +128,13 @@
 int xc_hvm_build(int xc_handle,
                  uint32_t domid,
                  int memsize,
+                 unsigned long nodemask,
                  const char *image_name);
 
 int xc_hvm_build_mem(int xc_handle,
                      uint32_t domid,
                      int memsize,
+                     unsigned long nodemask,
                      const char *image_buffer,
                      unsigned long image_size);
 
diff -r f40c310dca31 tools/libxc/xg_private.c
--- a/tools/libxc/xg_private.c  Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/libxc/xg_private.c  Fri Jul 11 15:53:59 2008 +0200
@@ -177,6 +177,7 @@
     int xc_hvm_build(int xc_handle,
                      uint32_t domid,
                      int memsize,
+                     unsigned long nodemask,
                      const char *image_name)
 {
     errno = ENOSYS;
diff -r f40c310dca31 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/python/xen/lowlevel/xc/xc.c Fri Jul 11 15:53:59 2008 +0200
@@ -873,16 +873,17 @@
 #endif
     char *image;
     int memsize, vcpus = 1, acpi = 0, apic = 1;
+    unsigned long nodemask;
 
     static char *kwd_list[] = { "domid",
                                 "memsize", "image", "vcpus", "acpi",
-                                "apic", NULL };
-    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iis|iii", kwd_list,
-                                      &dom, &memsize,
-                                      &image, &vcpus, &acpi, &apic) )
+                                "apic", "nodemask", NULL };
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iis|iiil", kwd_list,
+                                      &dom, &memsize, &image,
+                                      &vcpus, &acpi, &apic, &nodemask) )
         return NULL;
 
-    if ( xc_hvm_build(self->xc_handle, dom, memsize, image) != 0 )
+    if ( xc_hvm_build(self->xc_handle, dom, memsize, nodemask, image) != 0 )
         return pyxc_error_to_exception();
 
 #if !defined(__ia64__)
diff -r f40c310dca31 tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/python/xen/xend/XendConfig.py       Fri Jul 11 15:53:59 2008 +0200
@@ -162,6 +162,7 @@
     'vhpt': int,
     'guest_os_type': str,
     'hap': int,
+    'guestnodes': int,
 }
 
 # Xen API console 'other_config' keys.
@@ -375,6 +376,7 @@
             'other_config': {},
             'platform': {},
             'target': 0,
+            'guestnodes': 0,
         }
         
         return defaults
@@ -570,7 +572,10 @@
             cfg["memory"] = int(sxp.child_value(sxp_cfg, "memory"))
         if sxp.child_value(sxp_cfg, "maxmem") != None:
             cfg["maxmem"] = int(sxp.child_value(sxp_cfg, "maxmem"))
-            
+
+        if sxp.child_value(sxp_cfg, "guestnodes") != None:
+            cfg["guestnodes"] = int(sxp.child_value(sxp_cfg, "guestnodes"))
+
         # Convert scheduling parameters to vcpus_params
         if 'vcpus_params' not in cfg:
             cfg['vcpus_params'] = {}
diff -r f40c310dca31 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/python/xen/xend/XendDomainInfo.py   Fri Jul 11 15:53:59 2008 +0200
@@ -2150,7 +2150,7 @@
                     if self.info['cpus'][v]:
                        xc.vcpu_setaffinity(self.domid, v, self.info['cpus'][v])
             else:
-                def find_relaxed_node(node_list):
+                def find_relaxed_node(node_list, numnodes):
                     import sys
                     nr_nodes = info['nr_nodes']
                     if node_list is None:
@@ -2175,21 +2175,36 @@
                            nodeload[i] = int(nodeload[i] * 16 / len(info['node_to_cpu'][i]))
                         else:
                             nodeload[i] = sys.maxint
-                    index = nodeload.index( min(nodeload) )    
-                    return index
+
+                    if numnodes == 0:
+                        return nodeload.index( min(nodeload) )
+                    else:
+                        nodemask = 0
+                        for i in range (0,numnodes):
+                            index = min((n, i) for i, n in enumerate(nodeload))[1]
+                            nodemask = nodemask | (1 << index)
+                            nodeload[index] = sys.maxint
+                        return nodemask
 
                 info = xc.physinfo()
+                nodemask = 0
                 if info['nr_nodes'] > 1:
                     node_memory_list = info['node_to_memory']
+                    needmem = self.image.getRequiredAvailableMemory(self.info['memory_dynamic_max']) / 1024
+                    if self.image.guestnodes > 1:
+                        needmem = needmem / self.image.guestnodes
                     candidate_node_list = []
                     for i in range(0, info['nr_nodes']):
                        if node_memory_list[i] >= needmem and len(info['node_to_cpu'][i]) > 0:
                             candidate_node_list.append(i)
-                    index = find_relaxed_node(candidate_node_list)
-                    cpumask = info['node_to_cpu'][index]
-                    for v in range(0, self.info['VCPUs_max']):
-                        xc.vcpu_setaffinity(self.domid, v, cpumask)
+                    nodemask = find_relaxed_node(candidate_node_list, 
+                                                 self.image.guestnodes)
+                    if self.image.guestnodes < 1:
+                        cpumask = info['node_to_cpu'][nodemask]
+                        for v in range(0, self.info['VCPUs_max']):
+                            xc.vcpu_setaffinity(self.domid, v, cpumask)
+                    else:
+                        self.image.nodemask = nodemask
 
             # Use architecture- and image-specific calculations to determine
             # the various headrooms necessary, given the raw configured
diff -r f40c310dca31 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/python/xen/xend/image.py    Fri Jul 11 15:53:59 2008 +0200
@@ -127,6 +127,9 @@
             self.cpuid = vmConfig['cpuid'];
         if 'cpuid_check' in vmConfig:
             self.cpuid_check = vmConfig['cpuid_check']
+
+        self.guestnodes = int(vmConfig['platform'].get('guestnodes',0))
+        self.nodemask   = 0
 
     def cleanupBootloading(self):
         if self.bootloader:
@@ -696,6 +699,7 @@
         self.apic = int(vmConfig['platform'].get('apic', 0))
         self.acpi = int(vmConfig['platform'].get('acpi', 0))
         self.guest_os_type = vmConfig['platform'].get('guest_os_type')
+        self.guestnodes = int(vmConfig['platform'].get('guestnodes', 0))
            
 
     # Return a list of cmd line args to the device models based on the
@@ -797,13 +801,16 @@
         log.debug("vcpus          = %d", self.vm.getVCpuCount())
         log.debug("acpi           = %d", self.acpi)
         log.debug("apic           = %d", self.apic)
+        log.debug("guestnodes     = %d", self.guestnodes)
+        log.debug("nodemask       = %d", self.nodemask)
 
         rc = xc.hvm_build(domid          = self.vm.getDomid(),
                           image          = self.loader,
                           memsize        = mem_mb,
                           vcpus          = self.vm.getVCpuCount(),
                           acpi           = self.acpi,
-                          apic           = self.apic)
+                          apic           = self.apic,
+                          nodemask       = self.nodemask)
         rc['notes'] = { 'SUSPEND_CANCEL': 1 }
 
         rc['store_mfn'] = xc.hvm_get_param(self.vm.getDomid(),
diff -r f40c310dca31 tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/python/xen/xm/create.py     Fri Jul 11 15:53:59 2008 +0200
@@ -567,6 +567,10 @@
           fn=set_int, default=None,
           use="""Maximum machine address size""")
 
+gopts.var('guestnodes', val="GUESTNODES",
+          fn=set_int, default=0,
+          use="""Number of NUMA nodes to appear in the guest.""")
+
 def err(msg):
     """Print an error to stderr and exit.
     """
@@ -845,7 +849,8 @@
              'vnc', 'vncdisplay', 'vncunused', 'vncconsole', 'vnclisten',
              'sdl', 'display', 'xauthority', 'rtc_timeoffset', 'monitor',
              'acpi', 'apic', 'usb', 'usbdevice', 'keymap', 'pci', 'hpet',
-             'guest_os_type', 'hap', 'opengl', 'cpuid', 'cpuid_check']
+             'guest_os_type', 'hap', 'opengl', 'cpuid', 'cpuid_check',
+             'guestnodes' ]
 
     for a in args:
         if a in vals.__dict__ and vals.__dict__[a] is not None:
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-devel] [PATCH 1/2]: hvm: NUMA guest: allocate memory and pin cpus according to guestnodes number (resend), Andre Przywara <=