[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] RE: [PATCH] x86: fix NUMA handling (c/s 20599)



I didn't expect node with so smcall hunk, as explained in the comments to 20599.

But for this specific SRAT, seems we can also improve constructing of 
node_memblk_range[], so that we can merge 0~a0000 with 100000-80000000, and  
80000000-d0000000 with 100000000-130000000. Depends on whether we need keep 
BIOS's memory affinity information, we can either create a new structure to 
hold the result, or we can simply compact node_memblk_range[]. The benefit is, 
we can reduce the sizeof memnodemap, but I'm not sure if it is needed still 
after this patch.

Thanks
--jyh


>-----Original Message-----
>From: Jan Beulich [mailto:JBeulich@xxxxxxxxxx]
>Sent: Wednesday, January 06, 2010 11:36 PM
>To: xen-devel@xxxxxxxxxxxxxxxxxxx
>Cc: Jiang, Yunhong
>Subject: [PATCH] x86: fix NUMA handling (c/s 20599)
>
>c/s 20599 caused the hash shift to become significantly smaller on
>systems with an SRAT like this
>
>(XEN) SRAT: Node 0 PXM 0 0-a0000
>(XEN) SRAT: Node 0 PXM 0 100000-80000000
>(XEN) SRAT: Node 1 PXM 1 80000000-d0000000
>(XEN) SRAT: Node 1 PXM 1 100000000-130000000
>
>Comined with the static size of the memnodemap[] array, NUMA got
>therefore disabled on such systems. The backport from Linux was really
>incomplete, as Linux much earlier had already introduced a dynamcially
>allocated memnodemap[].
>
>Further, doing to/from pdx translations on addresses just past a valid
>range is not correct, as it may strip/fail to insert non-zero bits in
>this case.
>
>Finally, using 63 as the cover-it-all shift value is invalid on 32bit,
>since pdx values are unsigned long.
>
>Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>
>
>--- 2010-01-06.orig/xen/arch/x86/numa.c        2010-01-06 15:16:18.000000000 
>+0100
>+++ 2010-01-06/xen/arch/x86/numa.c     2010-01-06 16:10:59.000000000 +0100
>@@ -30,7 +30,9 @@ struct node_data node_data[MAX_NUMNODES]
>
> /* Mapping from pdx to node id */
> int memnode_shift;
>-u8  memnodemap[NODEMAPSIZE];
>+static typeof(*memnodemap) _memnodemap[2];
>+unsigned long memnodemapsize;
>+u8 *memnodemap;
>
> unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
>       [0 ... NR_CPUS-1] = NUMA_NO_NODE
>@@ -62,13 +64,13 @@ static int __init populate_memnodemap(co
>       unsigned long spdx, epdx;
>       int i, res = -1;
>
>-      memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap));
>+      memset(memnodemap, NUMA_NO_NODE, memnodemapsize *
>sizeof(*memnodemap));
>       for (i = 0; i < numnodes; i++) {
>               spdx = paddr_to_pdx(nodes[i].start);
>-              epdx = paddr_to_pdx(nodes[i].end);
>+              epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
>               if (spdx >= epdx)
>                       continue;
>-              if ((epdx >> shift) >= NODEMAPSIZE)
>+              if ((epdx >> shift) >= memnodemapsize)
>                       return 0;
>               do {
>                       if (memnodemap[spdx >> shift] != NUMA_NO_NODE)
>@@ -86,6 +88,28 @@ static int __init populate_memnodemap(co
>       return res;
> }
>
>+static int __init allocate_cachealigned_memnodemap(void)
>+{
>+      unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
>+      unsigned long mfn = alloc_boot_pages(size, 1);
>+
>+      if (!mfn) {
>+              printk(KERN_ERR
>+                     "NUMA: Unable to allocate Memory to Node hash map\n");
>+              memnodemapsize = 0;
>+              return -1;
>+      }
>+
>+      memnodemap = mfn_to_virt(mfn);
>+      mfn <<= PAGE_SHIFT;
>+      size <<= PAGE_SHIFT;
>+      printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
>+             mfn, mfn + size);
>+      memnodemapsize = size / sizeof(*memnodemap);
>+
>+      return 0;
>+}
>+
> /*
>  * The LSB of all start and end addresses in the node map is the value of the
>  * maximum possible shift.
>@@ -99,7 +123,7 @@ static int __init extract_lsb_from_nodes
>
>       for (i = 0; i < numnodes; i++) {
>               spdx = paddr_to_pdx(nodes[i].start);
>-              epdx = paddr_to_pdx(nodes[i].end);
>+              epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
>               if (spdx >= epdx)
>                       continue;
>               bitfield |= spdx;
>@@ -108,9 +132,10 @@ static int __init extract_lsb_from_nodes
>                       memtop = epdx;
>       }
>       if (nodes_used <= 1)
>-              i = 63;
>+              i = BITS_PER_LONG - 1;
>       else
>               i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
>+      memnodemapsize = (memtop >> i) + 1;
>       return i;
> }
>
>@@ -120,6 +145,10 @@ int __init compute_hash_shift(struct nod
>       int shift;
>
>       shift = extract_lsb_from_nodes(nodes, numnodes);
>+      if (memnodemapsize <= ARRAY_SIZE(_memnodemap))
>+              memnodemap = _memnodemap;
>+      else if (allocate_cachealigned_memnodemap())
>+              return -1;
>       printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
>               shift);
>
>@@ -233,8 +262,8 @@ void __init numa_initmem_init(unsigned l
>              (u64)start_pfn << PAGE_SHIFT,
>              (u64)end_pfn << PAGE_SHIFT);
>       /* setup dummy node covering all memory */
>-      memnode_shift = 63;
>-      memnodemap[0] = 0;
>+      memnode_shift = BITS_PER_LONG - 1;
>+      memnodemap = _memnodemap;
>       nodes_clear(node_online_map);
>       node_set_online(0);
>       for (i = 0; i < NR_CPUS; i++)
>--- 2010-01-06.orig/xen/include/asm-x86/numa.h 2010-01-06 15:16:18.000000000
>+0100
>+++ 2010-01-06/xen/include/asm-x86/numa.h      2010-01-06 15:15:41.000000000
>+0100
>@@ -25,7 +25,6 @@ extern int pxm_to_node(int nid);
>
> #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
> #define VIRTUAL_BUG_ON(x)
>-#define NODEMAPSIZE 0xfff
>
> extern void numa_add_cpu(int cpu);
> extern void numa_init_array(void);
>@@ -51,7 +50,8 @@ static inline void clear_node_cpumask(in
>
> /* Simple perfect hash to map pdx to node numbers */
> extern int memnode_shift;
>-extern u8  memnodemap[NODEMAPSIZE];
>+extern unsigned long memnodemapsize;
>+extern u8 *memnodemap;
>
> struct node_data {
>     unsigned long node_start_pfn;
>@@ -64,7 +64,7 @@ extern struct node_data node_data[];
> static inline __attribute__((pure)) int phys_to_nid(paddr_t addr)
> {
>       unsigned nid;
>-      VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >=
>NODEMAPSIZE);
>+      VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >=
>memnodemapsize);
>       nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift];
>       VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
>       return nid;
>


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.