# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1260355373 0
# Node ID e5a757ce7845a92b2b2315873984eeb97ac6ca32
# Parent 1f5f1674e53f478b1ca57ab629d96f18d0bf4711
SRAT memory hotplug 2/2: Support overlapped and sparse node memory arrangement.
Currently the Xen hypervisor uses the nodes array to keep the start/end
address of each node. It assumes that memory among nodes has no overlap; this
is not always true, especially if we have memory hotplug support in the system.
This patch backports the Linux kernel's memblks to support overlapping
among nodes. The memblks are used both for checking conflicts and for
calculating memnode_shift.
Also, currently if no memory is populated in a node when the system
boots, the node is unparsed later, and the corresponding CPU's
NUMA information is removed as well. This patch keeps the CPU
information.
One thing to note is that we currently calculate memnode_shift with all
memory, including un-populated ranges. This should work as long as the smallest
chunk is not too small. Another option would be flags in the page_info
structure, etc.
The memnodemap is changed to be indexed by pdx instead of paddr, both to save
space and because most accesses currently come from a pfn.
A flag, mem_hotplug, is added; it is set if there is a hotplug memory range.
Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>
---
xen/arch/x86/mm.c | 2
xen/arch/x86/numa.c | 96 +++++++++++++++++++++++++++++----------------
xen/arch/x86/srat.c | 82 +++++++++++++++++++++++++++-----------
xen/include/asm-x86/mm.h | 1
xen/include/asm-x86/numa.h | 14 +++---
xen/include/asm-x86/page.h | 1
6 files changed, 132 insertions(+), 64 deletions(-)
diff -r 1f5f1674e53f -r e5a757ce7845 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Wed Dec 09 10:41:37 2009 +0000
+++ b/xen/arch/x86/mm.c Wed Dec 09 10:42:53 2009 +0000
@@ -135,6 +135,8 @@ l1_pgentry_t __attribute__ ((__section__
#define PTE_UPDATE_WITH_CMPXCHG
#endif
+int mem_hotplug = 0;
+
/* Private domain structs for DOMID_XEN and DOMID_IO. */
struct domain *dom_xen, *dom_io;
diff -r 1f5f1674e53f -r e5a757ce7845 xen/arch/x86/numa.c
--- a/xen/arch/x86/numa.c Wed Dec 09 10:41:37 2009 +0000
+++ b/xen/arch/x86/numa.c Wed Dec 09 10:42:53 2009 +0000
@@ -28,6 +28,7 @@ custom_param("numa", numa_setup);
struct node_data node_data[MAX_NUMNODES];
+/* Mapping from pdx to node id */
int memnode_shift;
u8 memnodemap[NODEMAPSIZE];
@@ -52,54 +53,81 @@ int acpi_numa __devinitdata;
* 0 if memnodmap[] too small (of shift too small)
* -1 if node overlap or lost ram (shift too big)
*/
-static int __devinit
-populate_memnodemap(const struct node *nodes, int numnodes, int shift)
-{
- int i;
- int res = -1;
- paddr_t addr, end;
-
- if (shift >= 64)
- return -1;
- memset(memnodemap, 0xff, sizeof(memnodemap));
+static int __init populate_memnodemap(const struct node *nodes,
+ int numnodes, int shift, int *nodeids)
+{
+ unsigned long spdx, epdx;
+ int i, res = -1;
+
+ memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap));
for (i = 0; i < numnodes; i++) {
- addr = nodes[i].start;
- end = nodes[i].end;
- if (addr >= end)
- continue;
- if ((end >> shift) >= NODEMAPSIZE)
+ spdx = paddr_to_pdx(nodes[i].start);
+ epdx = paddr_to_pdx(nodes[i].end);
+ if (spdx >= epdx)
+ continue;
+ if ((epdx >> shift) >= NODEMAPSIZE)
return 0;
do {
- if (memnodemap[addr >> shift] != 0xff)
+ if (memnodemap[spdx >> shift] != NUMA_NO_NODE)
return -1;
- memnodemap[addr >> shift] = i;
- addr += (1ULL << shift);
- } while (addr < end);
+
+ if (!nodeids)
+ memnodemap[spdx >> shift] = i;
+ else
+ memnodemap[spdx >> shift] = nodeids[i];
+
+ spdx += (1UL << shift);
+ } while (spdx < epdx);
res = 1;
- }
+ }
return res;
}
-int __init compute_hash_shift(struct node *nodes, int numnodes)
-{
- int shift = 20;
-
- while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
- shift++;
-
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init extract_lsb_from_nodes(const struct node *nodes,
+ int numnodes)
+{
+ int i, nodes_used = 0;
+ unsigned long spdx, epdx;
+ unsigned long bitfield = 0, memtop = 0;
+
+ for (i = 0; i < numnodes; i++) {
+ spdx = paddr_to_pdx(nodes[i].start);
+ epdx = paddr_to_pdx(nodes[i].end);
+ if (spdx >= epdx)
+ continue;
+ bitfield |= spdx;
+ nodes_used++;
+ if (epdx > memtop)
+ memtop = epdx;
+ }
+ if (nodes_used <= 1)
+ i = 63;
+ else
+ i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+ return i;
+}
+
+int __init compute_hash_shift(struct node *nodes, int numnodes,
+ int *nodeids)
+{
+ int shift;
+
+ shift = extract_lsb_from_nodes(nodes, numnodes);
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);
- if (populate_memnodemap(nodes, numnodes, shift) != 1) {
- printk(KERN_INFO
- "Your memory is not aligned you need to rebuild your kernel "
- "with a bigger NODEMAPSIZE shift=%d\n",
- shift);
+ if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
+ printk(KERN_INFO "Your memory is not aligned you need to "
+ "rebuild your kernel with a bigger NODEMAPSIZE "
+ "shift=%d\n", shift);
return -1;
}
return shift;
}
-
/* initialize NODE_DATA given nodeid and start/end */
void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
{
@@ -167,7 +195,7 @@ static int numa_emulation(u64 start_pfn,
(nodes[i].end - nodes[i].start) >> 20);
node_set_online(i);
}
- memnode_shift = compute_hash_shift(nodes, numa_fake);
+ memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
if (memnode_shift < 0) {
memnode_shift = 0;
printk(KERN_ERR "No NUMA hash function found. Emulation
disabled.\n");
diff -r 1f5f1674e53f -r e5a757ce7845 xen/arch/x86/srat.c
--- a/xen/arch/x86/srat.c Wed Dec 09 10:41:37 2009 +0000
+++ b/xen/arch/x86/srat.c Wed Dec 09 10:42:53 2009 +0000
@@ -27,6 +27,11 @@ static struct node nodes[MAX_NUMNODES] _
static struct node nodes[MAX_NUMNODES] __initdata;
static u8 __read_mostly pxm2node[256] = { [0 ... 255] = 0xff };
+
+static int num_node_memblks;
+static struct node node_memblk_range[NR_NODE_MEMBLKS];
+static int memblk_nodeid[NR_NODE_MEMBLKS];
+
/* Too small nodes confuse the VM badly. Usually they result
from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)
@@ -54,17 +59,33 @@ __devinit int setup_node(int pxm)
return pxm2node[pxm];
}
-static __init int conflicting_nodes(u64 start, u64 end)
-{
- int i;
- for_each_node_mask(i, nodes_parsed) {
- struct node *nd = &nodes[i];
+int valid_numa_range(unsigned long start, unsigned long end, int node)
+{
+ int i;
+
+ for (i = 0; i < num_node_memblks; i++) {
+ struct node *nd = &node_memblk_range[i];
+
+ if (nd->start <= start && nd->end > end &&
+ memblk_nodeid[i] == node )
+ return 1;
+ }
+
+ return 0;
+}
+
+static __init int conflicting_memblks(unsigned long start, unsigned long end)
+{
+ int i;
+
+ for (i = 0; i < num_node_memblks; i++) {
+ struct node *nd = &node_memblk_range[i];
if (nd->start == nd->end)
continue;
if (nd->end > start && nd->start < end)
- return i;
+ return memblk_nodeid[i];
if (nd->end == end && nd->start == start)
- return i;
+ return memblk_nodeid[i];
}
return -1;
}
@@ -174,6 +195,15 @@ acpi_numa_memory_affinity_init(struct ac
}
if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
return;
+
+ if (num_node_memblks >= NR_NODE_MEMBLKS)
+ {
+ dprintk(XENLOG_WARNING,
+ "Too many numa entry, try bigger NR_NODE_MEMBLKS \n");
+ bad_srat();
+ return;
+ }
+
start = ma->base_address;
end = start + ma->length;
pxm = ma->proximity_domain;
@@ -187,9 +217,15 @@ acpi_numa_memory_affinity_init(struct ac
}
/* It is fine to add this area to the nodes data it will be used later*/
if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
+ {
printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" -
%"PRIx64" \n",
start, end);
- i = conflicting_nodes(start, end);
+#ifdef CONFIG_X86_64
+ mem_hotplug = 1;
+#endif
+ }
+
+ i = conflicting_memblks(start, end);
if (i == node) {
printk(KERN_WARNING
"SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with
itself (%"
@@ -213,7 +249,12 @@ acpi_numa_memory_affinity_init(struct ac
nd->end = end;
}
printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node,
pxm,
- nd->start, nd->end);
+ start, end);
+
+ node_memblk_range[num_node_memblks].start = start;
+ node_memblk_range[num_node_memblks].end = end;
+ memblk_nodeid[num_node_memblks] = node;
+ num_node_memblks++;
}
/* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -258,16 +299,6 @@ static int nodes_cover_memory(void)
return 1;
}
-static void unparse_node(int node)
-{
- int i;
- node_clear(node, nodes_parsed);
- for (i = 0; i < MAX_LOCAL_APIC; i++) {
- if (apicid_to_node[i] == node)
- apicid_to_node[i] = NUMA_NO_NODE;
- }
-}
-
void __init acpi_numa_arch_fixup(void) {}
#ifdef __x86_64__
@@ -340,11 +371,8 @@ int __init acpi_scan_nodes(u64 start, u6
int i;
/* First clean up the node list */
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < MAX_NUMNODES; i++)
cutoff_node(i, start, end);
- if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
- unparse_node(i);
- }
if (acpi_numa <= 0)
return -1;
@@ -354,7 +382,9 @@ int __init acpi_scan_nodes(u64 start, u6
return -1;
}
- memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
+ memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
+ memblk_nodeid);
+
if (memnode_shift < 0) {
printk(KERN_ERR
"SRAT: No NUMA node hash function found. Contact
maintainer\n");
@@ -364,7 +394,11 @@ int __init acpi_scan_nodes(u64 start, u6
/* Finally register nodes */
for_each_node_mask(i, nodes_parsed)
+ {
+ if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
+ continue;
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ }
for (i = 0; i < NR_CPUS; i++) {
if (cpu_to_node[i] == NUMA_NO_NODE)
continue;
diff -r 1f5f1674e53f -r e5a757ce7845 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Wed Dec 09 10:41:37 2009 +0000
+++ b/xen/include/asm-x86/mm.h Wed Dec 09 10:42:53 2009 +0000
@@ -368,6 +368,7 @@ int check_descriptor(const struct domain
int check_descriptor(const struct domain *, struct desc_struct *d);
extern int opt_allow_hugepage;
+extern int mem_hotplug;
/******************************************************************************
* With shadow pagetables, the different kinds of address start
diff -r 1f5f1674e53f -r e5a757ce7845 xen/include/asm-x86/numa.h
--- a/xen/include/asm-x86/numa.h Wed Dec 09 10:41:37 2009 +0000
+++ b/xen/include/asm-x86/numa.h Wed Dec 09 10:42:53 2009 +0000
@@ -19,7 +19,8 @@ struct node {
u64 start,end;
};
-extern int compute_hash_shift(struct node *nodes, int numnodes);
+extern int __init compute_hash_shift(struct node *nodes, int numnodes,
+ int *nodeids);
extern int pxm_to_node(int nid);
#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
@@ -48,7 +49,7 @@ static inline void clear_node_cpumask(in
cpu_clear(cpu, node_to_cpumask[cpu_to_node(cpu)]);
}
-/* Simple perfect hash to map physical addresses to node numbers */
+/* Simple perfect hash to map pdx to node numbers */
extern int memnode_shift;
extern u8 memnodemap[NODEMAPSIZE];
@@ -62,9 +63,9 @@ extern struct node_data node_data[];
static inline __attribute__((pure)) int phys_to_nid(paddr_t addr)
{
- unsigned nid;
- VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
- nid = memnodemap[addr >> memnode_shift];
+ unsigned nid;
+ VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= NODEMAPSIZE);
+ nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift];
VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
return nid;
}
@@ -75,10 +76,11 @@ static inline __attribute__((pure)) int
#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
NODE_DATA(nid)->node_spanned_pages)
-
+extern int valid_numa_range(unsigned long start, unsigned long end, int node);
#else
#define init_cpu_to_node() do {} while (0)
#define clear_node_cpumask(cpu) do {} while (0)
+#define valid_numa_range(start, end, node) {return 1;}
#endif
void srat_parse_regions(u64 addr);
diff -r 1f5f1674e53f -r e5a757ce7845 xen/include/asm-x86/page.h
--- a/xen/include/asm-x86/page.h Wed Dec 09 10:41:37 2009 +0000
+++ b/xen/include/asm-x86/page.h Wed Dec 09 10:42:53 2009 +0000
@@ -257,6 +257,7 @@ void copy_page_sse2(void *, const void *
#define page_to_virt(pg) __page_to_virt(pg)
#define pfn_to_paddr(pfn) __pfn_to_paddr(pfn)
#define paddr_to_pfn(pa) __paddr_to_pfn(pa)
+#define paddr_to_pdx(pa) pfn_to_pdx(paddr_to_pfn(pa))
#endif /* !defined(__ASSEMBLY__) */
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|