c/s 20599 caused the hash shift to become significantly smaller on systems with an SRAT like this (XEN) SRAT: Node 0 PXM 0 0-a0000 (XEN) SRAT: Node 0 PXM 0 100000-80000000 (XEN) SRAT: Node 1 PXM 1 80000000-d0000000 (XEN) SRAT: Node 1 PXM 1 100000000-130000000 Comined with the static size of the memnodemap[] array, NUMA got therefore disabled on such systems. The backport from Linux was really incomplete, as Linux much earlier had already introduced a dynamcially allocated memnodemap[]. Further, doing to/from pdx translations on addresses just past a valid range is not correct, as it may strip/fail to insert non-zero bits in this case. Finally, using 63 as the cover-it-all shift value is invalid on 32bit, since pdx values are unsigned long. Signed-off-by: Jan Beulich --- 2010-01-06.orig/xen/arch/x86/numa.c 2010-01-06 15:16:18.000000000 +0100 +++ 2010-01-06/xen/arch/x86/numa.c 2010-01-06 16:10:59.000000000 +0100 @@ -30,7 +30,9 @@ struct node_data node_data[MAX_NUMNODES] /* Mapping from pdx to node id */ int memnode_shift; -u8 memnodemap[NODEMAPSIZE]; +static typeof(*memnodemap) _memnodemap[2]; +unsigned long memnodemapsize; +u8 *memnodemap; unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = NUMA_NO_NODE @@ -62,13 +64,13 @@ static int __init populate_memnodemap(co unsigned long spdx, epdx; int i, res = -1; - memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap)); + memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap)); for (i = 0; i < numnodes; i++) { spdx = paddr_to_pdx(nodes[i].start); - epdx = paddr_to_pdx(nodes[i].end); + epdx = paddr_to_pdx(nodes[i].end - 1) + 1; if (spdx >= epdx) continue; - if ((epdx >> shift) >= NODEMAPSIZE) + if ((epdx >> shift) >= memnodemapsize) return 0; do { if (memnodemap[spdx >> shift] != NUMA_NO_NODE) @@ -86,6 +88,28 @@ static int __init populate_memnodemap(co return res; } +static int __init allocate_cachealigned_memnodemap(void) +{ + unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap)); + unsigned long mfn = alloc_boot_pages(size, 1); + + if (!mfn) { + printk(KERN_ERR + "NUMA: Unable to allocate Memory to Node hash map\n"); + memnodemapsize = 0; + return -1; + } + + memnodemap = mfn_to_virt(mfn); + mfn <<= PAGE_SHIFT; + size <<= PAGE_SHIFT; + printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", + mfn, mfn + size); + memnodemapsize = size / sizeof(*memnodemap); + + return 0; +} + /* * The LSB of all start and end addresses in the node map is the value of the * maximum possible shift. @@ -99,7 +123,7 @@ static int __init extract_lsb_from_nodes for (i = 0; i < numnodes; i++) { spdx = paddr_to_pdx(nodes[i].start); - epdx = paddr_to_pdx(nodes[i].end); + epdx = paddr_to_pdx(nodes[i].end - 1) + 1; if (spdx >= epdx) continue; bitfield |= spdx; @@ -108,9 +132,10 @@ static int __init extract_lsb_from_nodes memtop = epdx; } if (nodes_used <= 1) - i = 63; + i = BITS_PER_LONG - 1; else i = find_first_bit(&bitfield, sizeof(unsigned long)*8); + memnodemapsize = (memtop >> i) + 1; return i; } @@ -120,6 +145,10 @@ int __init compute_hash_shift(struct nod int shift; shift = extract_lsb_from_nodes(nodes, numnodes); + if (memnodemapsize <= ARRAY_SIZE(_memnodemap)) + memnodemap = _memnodemap; + else if (allocate_cachealigned_memnodemap()) + return -1; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); @@ -233,8 +262,8 @@ void __init numa_initmem_init(unsigned l (u64)start_pfn << PAGE_SHIFT, (u64)end_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ - memnode_shift = 63; - memnodemap[0] = 0; + memnode_shift = BITS_PER_LONG - 1; + memnodemap = _memnodemap; nodes_clear(node_online_map); node_set_online(0); for (i = 0; i < NR_CPUS; i++) --- 2010-01-06.orig/xen/include/asm-x86/numa.h 2010-01-06 15:16:18.000000000 +0100 +++ 2010-01-06/xen/include/asm-x86/numa.h 2010-01-06 15:15:41.000000000 +0100 @@ -25,7 +25,6 @@ extern int pxm_to_node(int nid); #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) #define VIRTUAL_BUG_ON(x) -#define NODEMAPSIZE 0xfff extern void numa_add_cpu(int cpu); extern void numa_init_array(void); @@ -51,7 +50,8 @@ static inline void clear_node_cpumask(in /* Simple perfect hash to map pdx to node numbers */ extern int memnode_shift; -extern u8 memnodemap[NODEMAPSIZE]; +extern unsigned long memnodemapsize; +extern u8 *memnodemap; struct node_data { unsigned long node_start_pfn; @@ -64,7 +64,7 @@ extern struct node_data node_data[]; static inline __attribute__((pure)) int phys_to_nid(paddr_t addr) { unsigned nid; - VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= NODEMAPSIZE); + VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize); nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); return nid;