[Xen-devel] [PATCH 4/6] xen: Add NUMA support to Xen

To: xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-devel] [PATCH 4/6] xen: Add NUMA support to Xen
From: Ryan Harper <ryanh@xxxxxxxxxx>
Date: Mon, 1 May 2006 16:59:09 -0500
Cc: Ryan Grimm <grimm@xxxxxxxxxx>
Delivery-date: Mon, 01 May 2006 15:00:37 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mutt/1.5.6+20040907i
This patch exports NUMA-specific information collected by the
hypervisor through the physinfo hypercall.  The additional information
is also integrated into the xm info command, which displays it
alongside the existing fields.
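
As a quick illustration (not part of the patch, and assuming the
standard xen.lowlevel.xc bindings from a built tools tree), the
dictionary returned by physinfo() carries the NUMA data alongside the
existing fields:

    # Hedged sketch -- key names follow the pyxc_physinfo() changes below.
    import xen.lowlevel.xc as xc

    info = xc.xc().physinfo()
    print info['nr_nodes']     # number of online NUMA nodes, e.g. 2
    print info['nr_chunks']    # number of memory chunks reported
    print info['mem_chunks']   # list of {'node','start_paddr','end_paddr'} dicts
    print info['node_to_cpu']  # list, indexed by node, of lists of cpu numbers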

nr_nodes is now calculated from num_online_nodes() rather than being
hard-coded to 1.

nr_nodes               : 2

We display the 64-bit start and end physical addresses of each memory
chunk and the node to which it belongs.

mem_chunks             : node0:0x0000000000000000-0x000000000009ffff
                         node0:0x0000000000100000-0x000000007fffffff
                         node1:0x0000000080000000-0x00000000dfffffff
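
The display above is produced by the format_memchunks() helper added
to XendNode.py below; here is a simplified, self-contained sketch of
the same formatting, with a made-up chunk list matching the example:

    def format_memchunks(pinfo):
        # One "nodeN:0xSTART-0xEND" line per chunk; continuation lines
        # are indented 25 columns to line up under the xm info value.
        lines = ['node%d:0x%016x-0x%016x' % (c['node'],
                                             c['start_paddr'],
                                             c['end_paddr'])
                 for c in pinfo['mem_chunks']]
        if not lines:
            return 'none'
        return ('\n' + ' ' * 25).join(lines)

    # Made-up sample mirroring the output above:
    pinfo = {'mem_chunks': [
        {'node': 0, 'start_paddr': 0x00000000, 'end_paddr': 0x0009ffff},
        {'node': 0, 'start_paddr': 0x00100000, 'end_paddr': 0x7fffffff},
        {'node': 1, 'start_paddr': 0x80000000, 'end_paddr': 0xdfffffff}]}
    print format_memchunks(pinfo)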


We provide a node-to-cpu mapping as well.  The cpu value is a
collapsed range, so, for example, on a two-node 32-way system the
node_to_cpu value might look like:

node_to_cpu            : node0:0-15
                         node1:16-31
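
The collapsing is implemented by the list_to_strrange() helper added
to XendNode.py; a compact sketch of the same idea, grouping a sorted
cpu list into contiguous runs, might look like:

    def collapse_range(cpus):
        # Turn a sorted list of cpu numbers into 'a-b,c,d-e' form,
        # as used for the node_to_cpu field above.
        if not cpus:
            return 'no cpus'
        pairs = []
        start = prev = cpus[0]
        for c in cpus[1:]:
            if c != prev + 1:          # gap: close the current run
                pairs.append((start, prev))
                start = c
            prev = c
        pairs.append((start, prev))
        out = []
        for p, q in pairs:
            if p == q:
                out.append('%d' % p)
            else:
                out.append('%d-%d' % (p, q))
        return ','.join(out)

    print collapse_range(range(0, 16))   # -> '0-15'
    print collapse_range([0, 1, 2, 5])   # -> '0-2,5'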

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@xxxxxxxxxx


diffstat output:
 b/xen/include/public/numa_structs.h                 |   26 ++++++
 tools/libxc/xc_misc.c                               |    3 
 tools/libxc/xenctrl.h                               |    3 
 tools/python/xen/lowlevel/xc/xc.c                   |   66 +++++++++++++++-
 tools/python/xen/xend/XendNode.py                   |   67 +++++++++++++++++
 tools/xm-test/tests/info/02_info_compiledata_pos.py |    4 -
 xen/arch/x86/dom0_ops.c                             |   78 +++++++++++++++++++-
 xen/include/public/arch-x86_32.h                    |    1 
 xen/include/public/arch-x86_64.h                    |    1 
 xen/include/public/dom0_ops.h                       |    4 +
 xen/include/xen/numa.h                              |    9 --
 11 files changed, 246 insertions(+), 16 deletions(-)

Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
Signed-off-by: Ryan Grimm <grimm@xxxxxxxxxx>
---
# HG changeset patch
# User Ryan Harper <ryanh@xxxxxxxxxx>
# Node ID 2b51b0ae911cbc31f48e347a587d5b07215977c7
# Parent  ab3f95176e39d87e8996776053f4261a55ee7d4b
This patch exports NUMA-specific information collected by the
hypervisor through the physinfo hypercall.  The additional information
is also integrated into the xm info command, which displays it
alongside the existing fields.

nr_nodes is now calculated from num_online_nodes() rather than being
hard-coded to 1.

nr_nodes               : 2

We display the 64-bit start and end physical addresses of each memory
chunk and the node to which it belongs.

mem_chunks             : node0:0x0000000000000000-0x000000000009ffff
                         node0:0x0000000000100000-0x000000007fffffff
                         node1:0x0000000080000000-0x00000000dfffffff


We provide a node-to-cpu mapping as well.  The cpu value is a
collapsed range, so, for example, on a two-node 32-way system the
node_to_cpu value might look like:

node_to_cpu            : node0:0-15
                         node1:16-31

Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
Signed-off-by: Ryan Grimm <grimm@xxxxxxxxxx>

diff -r ab3f95176e39 -r 2b51b0ae911c tools/libxc/xc_misc.c
--- a/tools/libxc/xc_misc.c     Mon May  1 15:11:32 2006
+++ b/tools/libxc/xc_misc.c     Mon May  1 20:47:43 2006
@@ -53,6 +53,9 @@
 
     op.cmd = DOM0_PHYSINFO;
     op.interface_version = DOM0_INTERFACE_VERSION;
+    /* set pointers to caller's so memcpy doesn't clobber them */
+    op.u.physinfo.memory_chunks = put_info->memory_chunks;
+    op.u.physinfo.node_to_cpu = put_info->node_to_cpu;
 
     if ( (ret = do_dom0_op(xc_handle, &op)) != 0 )
         return ret;
diff -r ab3f95176e39 -r 2b51b0ae911c tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Mon May  1 15:11:32 2006
+++ b/tools/libxc/xenctrl.h     Mon May  1 20:47:43 2006
@@ -19,6 +19,7 @@
 #include <xen/sched_ctl.h>
 #include <xen/memory.h>
 #include <xen/acm.h>
+#include <xen/numa_structs.h>
 
 #ifdef __ia64__
 #define XC_PAGE_SHIFT           14
@@ -390,6 +391,8 @@
                        int clear);
 
 typedef dom0_physinfo_t xc_physinfo_t;
+typedef node_memory_chunk_t xc_memory_chunk_t;
+typedef uint64_t xc_node_to_cpu_t;
 int xc_physinfo(int xc_handle,
                 xc_physinfo_t *info);
 
diff -r ab3f95176e39 -r 2b51b0ae911c tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Mon May  1 15:11:32 2006
+++ b/tools/python/xen/lowlevel/xc/xc.c Mon May  1 20:47:43 2006
@@ -603,8 +603,21 @@
 {
     xc_physinfo_t info;
     char cpu_cap[128], *p=cpu_cap, *q=cpu_cap;
-    int i;
-    
+    int i,j;
+    PyObject *ret_obj, *memchunk_obj, *node_to_cpu_obj;
+    xc_memory_chunk_t *chunks;
+    xc_node_to_cpu_t  *map;
+
+    /* make space for mem chunks */
+    chunks =  (xc_memory_chunk_t *)malloc( sizeof(xc_memory_chunk_t) * 
+                                     PUBLIC_MAXCHUNKS );
+    set_xen_guest_handle(info.memory_chunks, chunks);
+
+    /* make space for node_to_cpu mapping */
+    map = (xc_node_to_cpu_t *)malloc( sizeof(xc_node_to_cpu_t) *
+                                    PUBLIC_MAX_NUMNODES ); 
+    set_xen_guest_handle(info.node_to_cpu, map);
+
     if ( xc_physinfo(self->xc_handle, &info) != 0 )
         return PyErr_SetFromErrno(xc_error);
 
@@ -617,16 +630,59 @@
     }
     if(q>cpu_cap)
         *(q-1)=0;
-
-    return Py_BuildValue("{s:i,s:i,s:i,s:i,s:l,s:l,s:i,s:s}",
+    
+    ret_obj = Py_BuildValue("{s:i,s:i,s:i,s:l,s:l,s:i,s:s}",
                          "threads_per_core", info.threads_per_core,
                          "cores_per_socket", info.cores_per_socket,
                          "sockets_per_node", info.sockets_per_node,
-                         "nr_nodes",         info.nr_nodes,
                          "total_memory",     pages_to_mb(info.total_pages),
                          "free_memory",      pages_to_mb(info.free_pages),
                          "cpu_khz",          info.cpu_khz,
                          "hw_caps",          cpu_cap);
+     
+    /* memchunks */
+    memchunk_obj = PyList_New(0);
+ 
+    /* build list of each memchunk's attributes */
+    for ( i=0; i<info.nr_chunks; i++ ) 
+    {
+        PyList_Append(memchunk_obj, 
+                      Py_BuildValue("{s:i,s:K,s:K}",
+                      "node"       , chunks[i].nid,
+                      "start_paddr", chunks[i].start_paddr,
+                      "end_paddr"  , chunks[i].end_paddr));
+    }
+    /* add list of attributes and nr_chunks to physinfo dictionary */
+    PyDict_SetItemString(ret_obj, "mem_chunks", memchunk_obj);
+    PyDict_SetItemString(ret_obj, "nr_chunks", 
+             Py_BuildValue("i", info.nr_chunks));
+ 
+    /* node to cpu mappings */
+    node_to_cpu_obj = PyList_New(0);
+    /* build list of node to cpu mappings */
+    for ( i=0; i<info.nr_nodes; i++ )
+    {
+        cpumap_t cpumap = (cpumap_t)map[i];
+        PyObject *cpus = PyList_New(0);
+ 
+        for ( j=0; cpumap != 0; j++ ) 
+        {
+            if ( cpumap & 1 )
+                PyList_Append(cpus, PyInt_FromLong(j));
+            cpumap >>=1;
+        }
+        PyList_Append(node_to_cpu_obj, cpus); 
+    }
+    /* add list of node to cpu mappings and nr_nodes to physinfo dictionary */
+    PyDict_SetItemString(ret_obj, "node_to_cpu",  node_to_cpu_obj);
+    PyDict_SetItemString(ret_obj, "nr_nodes", 
+             Py_BuildValue("i", info.nr_nodes));
+
+    /* free malloc'd memory */
+    free(chunks);
+    free(map);
+ 
+    return ret_obj;
 }
 
 static PyObject *pyxc_xeninfo(XcObject *self)
diff -r ab3f95176e39 -r 2b51b0ae911c tools/python/xen/xend/XendNode.py
--- a/tools/python/xen/xend/XendNode.py Mon May  1 15:11:32 2006
+++ b/tools/python/xen/xend/XendNode.py Mon May  1 20:47:43 2006
@@ -56,6 +56,69 @@
                 ['version', ver],
                 ['machine', mch]]
 
+    def list_to_rangepairs(self,cmap):
+            cmap.sort()
+            pairs = []
+            x = y = 0
+            for i in range(0,len(cmap)):
+                try:
+                    if ((cmap[y+1] - cmap[i]) > 1):
+                        pairs.append((cmap[x],cmap[y]))
+                        x = y = i+1
+                    else:
+                        y = y + 1
+                # if we go off the end, then just add x to y
+                except IndexError:
+                    pairs.append((cmap[x],cmap[y]))
+
+            return pairs
+
+    def format_pairs(self,pairs):
+            if not pairs:
+                return "no cpus"
+            out = ""
+            for f,s in pairs:
+                if (f==s):
+                    out += '%d'%f
+                else:
+                    out += '%d-%d'%(f,s)
+                out += ','
+            # trim trailing ','
+            return out[:-1]
+
+    def list_to_strrange(self,list):
+        return self.format_pairs(self.list_to_rangepairs(list))
+
+    def format_memchunks(self, pinfo):
+        str=''
+        whitespace=''
+        try:
+            chunk=pinfo['mem_chunks']
+            for i in range(0, pinfo['nr_chunks']):
+                str+='%snode%d:0x%016x-0x%016x\n' % (whitespace,
+                                                    chunk[i]['node'],
+                                                    chunk[i]['start_paddr'], 
+                                                    chunk[i]['end_paddr']) 
+                whitespace='%25s' % ''
+        except:
+            str='none\n' 
+        return str[:-1]
+        
+    def format_node_to_cpu(self, pinfo):
+        str=''
+        whitespace=''
+        try:
+            node_to_cpu=pinfo['node_to_cpu']
+            for i in range(0, pinfo['nr_nodes']):
+                str+='%snode%d:%s\n' % (whitespace,
+                                        i, 
+                                      self.list_to_strrange(node_to_cpu[i]))
+                whitespace='%25s' % ''        
+        except:
+            str='none\n'
+        return str[:-1];
+
+
     def physinfo(self):
         info = self.xc.physinfo()
 
@@ -64,6 +127,8 @@
                            info['cores_per_socket'] *
                            info['threads_per_core'])
         info['cpu_mhz'] = info['cpu_khz'] / 1000
+        info['mem_chunks'] = self.format_memchunks(info)
+        info['node_to_cpu'] = self.format_node_to_cpu(info)
 
         ITEM_ORDER = ['nr_cpus',
                       'nr_nodes',
@@ -74,6 +139,8 @@
                       'hw_caps',
                       'total_memory',
                       'free_memory',
+                      'mem_chunks',
+                      'node_to_cpu'
                       ]
 
         return [[k, info[k]] for k in ITEM_ORDER]
diff -r ab3f95176e39 -r 2b51b0ae911c tools/xm-test/tests/info/02_info_compiledata_pos.py
--- a/tools/xm-test/tests/info/02_info_compiledata_pos.py       Mon May  1 15:11:32 2006
+++ b/tools/xm-test/tests/info/02_info_compiledata_pos.py       Mon May  1 20:47:43 2006
@@ -18,9 +18,7 @@
 for line in lines:
     pieces = line.split(" : ", 1)
 
-    if len(pieces) < 2:
-        FAIL("Found invalid line: [%s]" % line)
-    else:
+    if len(pieces) > 1:
         map[pieces[0]] = pieces[1]
 
 for field in ["cores_per_socket", "threads_per_core", "cpu_mhz",
diff -r ab3f95176e39 -r 2b51b0ae911c xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c   Mon May  1 15:11:32 2006
+++ b/xen/arch/x86/dom0_ops.c   Mon May  1 20:47:43 2006
@@ -25,6 +25,7 @@
 #include <asm/hvm/support.h>
 #include <asm/processor.h>
 #include <public/sched_ctl.h>
+#include <xen/numa.h>  /* needed for MAX_NUMNODES without numa=y */
 
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
@@ -183,6 +184,8 @@
     case DOM0_PHYSINFO:
     {
         dom0_physinfo_t *pi = &op->u.physinfo;
+        int i;
+        u64 node_to_cpu_64[MAX_NUMNODES];
 
         pi->threads_per_core =
             cpus_weight(cpu_sibling_map[0]);
@@ -191,12 +194,85 @@
         pi->sockets_per_node = 
             num_online_cpus() / cpus_weight(cpu_core_map[0]);
 
-        pi->nr_nodes         = 1;
         pi->total_pages      = total_pages;
         pi->free_pages       = avail_domheap_pages();
         pi->cpu_khz          = cpu_khz;
         memset(pi->hw_cap, 0, sizeof(pi->hw_cap));
         memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4);
+
+#ifdef CONFIG_NUMA
+        /* memory chunks */
+        pi->nr_chunks = num_memory_chunks;
+
+        DPRINTK("num_memory_chunks:%d\n", num_memory_chunks);
+        for ( i=0; i<num_memory_chunks; i++ ) {
+            DPRINTK("node%d:%"PRIx64"\n", node_memory_chunk[i].nid,
+                                          node_memory_chunk[i].start_paddr);
+            DPRINTK("node%d:%"PRIx64"\n", node_memory_chunk[i].nid,
+                                          node_memory_chunk[i].end_paddr);
+
+            /* copy memory chunk structs to guest */
+            ret = 0;
+            if ( copy_to_guest_offset(op->u.physinfo.memory_chunks, i, 
+                                      &(node_memory_chunk[i]), 1) ) {
+                ret = -EFAULT;
+                break;
+            }
+        }
+
+        /* node to cpu mask */
+        pi->nr_nodes = num_online_nodes();
+
+        /* copy cpu to node mapping to domU */
+        /* converting cpumask to u64 b/c userspace doesn't 
+         * know about cpumask_t and is accepting a u64 */
+        memset(node_to_cpu_64, 0, sizeof(node_to_cpu_64));
+        for ( i=0; i<pi->nr_nodes; i++) {
+            int j = 0;
+            for ( j=0; j<num_online_cpus(); j++)
+                if ( cpu_isset(j, node_to_cpumask[i]) )
+                    node_to_cpu_64[i] |= (u64)1 << j;
+
+            if ( copy_to_guest_offset(op->u.physinfo.node_to_cpu, 
+                                      i, &(node_to_cpu_64[i]), 1) ) {
+                ret = -EFAULT;
+                break;
+            }
+        }
+#else
+        {
+            node_memory_chunk_t chunk;
+
+            /* if no CONFIG_NUMA, construct a memory chunk of all memory
+             * in system and node to all online cpus map */
+            pi->nr_chunks = 1;
+
+            /* send over node_memory_chunk */
+            chunk.start_paddr = 0;
+            chunk.end_paddr = total_pages * PAGE_SIZE;
+            chunk.nid = 1;
+            chunk.pxm = 1;
+            ret = 0;
+            if ( copy_to_guest_offset(op->u.physinfo.memory_chunks, 
+                                      0, &chunk, 1) ) {
+                ret = -EFAULT;
+                break;
+            }
+
+            /* create node to cpu mapping of one node to all online cpus */
+            pi->nr_nodes = 1;
+            node_to_cpu_64[0] = 0;
+            for ( i=0; i<num_online_cpus(); i++)
+                node_to_cpu_64[0] |= (u64)1 << i;
+
+            if ( copy_to_guest_offset(op->u.physinfo.node_to_cpu, 
+                                      0, &(node_to_cpu_64[0]), 1) ) {
+                ret = -EFAULT;
+                break;
+            }
+        }
+#endif
+       
         ret = 0;
         if ( copy_to_guest(u_dom0_op, op, 1) )
             ret = -EFAULT;
diff -r ab3f95176e39 -r 2b51b0ae911c xen/include/public/arch-x86_32.h
--- a/xen/include/public/arch-x86_32.h  Mon May  1 15:11:32 2006
+++ b/xen/include/public/arch-x86_32.h  Mon May  1 20:47:43 2006
@@ -24,6 +24,7 @@
 __DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
 __DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
 __DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
+__DEFINE_XEN_GUEST_HANDLE(u64, uint64_t);
 DEFINE_XEN_GUEST_HANDLE(char);
 DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(long);
diff -r ab3f95176e39 -r 2b51b0ae911c xen/include/public/arch-x86_64.h
--- a/xen/include/public/arch-x86_64.h  Mon May  1 15:11:32 2006
+++ b/xen/include/public/arch-x86_64.h  Mon May  1 20:47:43 2006
@@ -24,6 +24,7 @@
 __DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
 __DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
 __DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
+__DEFINE_XEN_GUEST_HANDLE(u64, uint64_t);
 DEFINE_XEN_GUEST_HANDLE(char);
 DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(long);
diff -r ab3f95176e39 -r 2b51b0ae911c xen/include/public/dom0_ops.h
--- a/xen/include/public/dom0_ops.h     Mon May  1 15:11:32 2006
+++ b/xen/include/public/dom0_ops.h     Mon May  1 20:47:43 2006
@@ -13,6 +13,7 @@
 
 #include "xen.h"
 #include "sched_ctl.h"
+#include "numa_structs.h"
 
 /*
  * Make sure you increment the interface version whenever you modify this file!
@@ -219,6 +220,9 @@
     unsigned long total_pages;
     unsigned long free_pages;
     uint32_t hw_cap[8];
+    uint32_t nr_chunks;
+    XEN_GUEST_HANDLE(node_memory_chunk_t) memory_chunks;
+    XEN_GUEST_HANDLE(u64) node_to_cpu;
 } dom0_physinfo_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_physinfo_t);
 
diff -r ab3f95176e39 -r 2b51b0ae911c xen/include/xen/numa.h
--- a/xen/include/xen/numa.h    Mon May  1 15:11:32 2006
+++ b/xen/include/xen/numa.h    Mon May  1 20:47:43 2006
@@ -2,6 +2,7 @@
 #define _XEN_NUMA_H
 
 #include <xen/config.h>
+#include <public/numa_structs.h>
 
 #ifdef CONFIG_DISCONTIGMEM
 #include <asm/numnodes.h>
@@ -19,13 +20,7 @@
 #define MAX_CHUNKS_PER_NODE   4
 #define MAXCHUNKS    (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
 
-typedef struct node_memory_chunk {
-   uint64_t start_paddr; /* physical address of chunk start */
-   uint64_t end_paddr;   /* physical address of chunk end */
-   uint8_t pxm;          /* proximity domain of node */
-   uint8_t nid;          /* which cnode contains this chunk? */
-} node_memory_chunk_t;
-DEFINE_XEN_GUEST_HANDLE(node_memory_chunk_t);
+#include <xen/nodemask.h>
 
 extern node_memory_chunk_t node_memory_chunk[];
 extern int num_memory_chunks;
diff -r ab3f95176e39 -r 2b51b0ae911c xen/include/public/numa_structs.h
--- /dev/null   Mon May  1 15:11:32 2006
+++ b/xen/include/public/numa_structs.h Mon May  1 20:47:43 2006
@@ -0,0 +1,26 @@
+/*
+ * Ryan Grimm <grimm@xxxxxxxxxx>
+ * Copyright (c) 2006, International Business Machines Corporation.
+ *
+ */
+
+#ifndef __XEN_PUBLIC_NUMA_STRUCTS_H__
+
+#define __XEN_PUBLIC_NUMA_STRUCTS_H__
+
+#include "xen.h"
+
+/* define these for xc to use b/c MAX_NUMNODES and MAX_CHUNKS
+ * are not exposed in /public */
+#define PUBLIC_MAX_NUMNODES 16
+#define PUBLIC_MAXCHUNKS 32
+
+typedef struct node_memory_chunk {
+   uint64_t start_paddr; /* physical address of chunk start */
+   uint64_t end_paddr;   /* physical address of chunk end */
+   uint8_t pxm;          /* proximity domain of node */
+   uint8_t nid;          /* which cnode contains this chunk? */
+} node_memory_chunk_t;
+DEFINE_XEN_GUEST_HANDLE(node_memory_chunk_t);
+
+#endif

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
