WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 2/2] Add support to hotplug entry in SRAT table

To: Keir Fraser <keir.fraser@xxxxxxxxxxxxx>, "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH 2/2] Add support to hotplug entry in SRAT table
From: "Jiang, Yunhong" <yunhong.jiang@xxxxxxxxx>
Date: Tue, 8 Dec 2009 18:03:54 +0800
Accept-language: en-US
Acceptlanguage: en-US
Cc:
Delivery-date: Tue, 08 Dec 2009 02:05:36 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: Acp37b/4XWLf2PdPSAO7LMCOx0g2Kg==
Thread-topic: [PATCH 2/2] Add support to hotplug entry in SRAT table
Add support for overlapping and sparse node memory arrangements.

Currently the Xen hypervisor uses nodes to keep the start/end address of each 
node. It assumes that memory among nodes has no overlap; this is not always 
true, especially if we have memory hotplug support in the system.
This patch backports the Linux kernel's memblks to support overlap among nodes. 
The memblks are used both for checking conflicts and for calculating 
memnode_shift.

Also, currently, if no memory is populated in a node when the system boots, 
the node is unparsed later, and the corresponding CPU's NUMA information 
is removed as well. This patch keeps the CPU information.

One thing to note is that we currently calculate memnode_shift using all memory, 
including un-populated ranges. This should work as long as the smallest chunk 
is not too small. Another option would be flags in the page_info structure, etc.

The memnodemap is changed from paddr to pdx, both to save space and 
because most accesses currently come from a pfn.

A flag, mem_hotplug, is added; it is set if there is a hotplug memory range.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>

 arch/x86/mm.c          |    2 +
 arch/x86/numa.c        |   96 +++++++++++++++++++++++++++++++------------------
 arch/x86/srat.c        |   82 +++++++++++++++++++++++++++++------------
 include/asm-x86/mm.h   |    1
 include/asm-x86/numa.h |   14 ++++---
 include/asm-x86/page.h |    1
 6 files changed, 132 insertions(+), 64 deletions(-)

diff -r 74823a302da5 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Tue Dec 08 01:23:33 2009 +0800
+++ b/xen/arch/x86/mm.c Tue Dec 08 01:23:34 2009 +0800
@@ -135,6 +135,8 @@ l1_pgentry_t __attribute__ ((__section__
 #define PTE_UPDATE_WITH_CMPXCHG
 #endif
 
+int mem_hotplug = 0;
+
 /* Private domain structs for DOMID_XEN and DOMID_IO. */
 struct domain *dom_xen, *dom_io;
 
diff -r 74823a302da5 xen/arch/x86/numa.c
--- a/xen/arch/x86/numa.c       Tue Dec 08 01:23:33 2009 +0800
+++ b/xen/arch/x86/numa.c       Tue Dec 08 01:29:22 2009 +0800
@@ -28,6 +28,7 @@ custom_param("numa", numa_setup);
 
 struct node_data node_data[MAX_NUMNODES];
 
+/* Mapping from pdx to node id */
 int memnode_shift;
 u8  memnodemap[NODEMAPSIZE];
 
@@ -52,54 +53,81 @@ int acpi_numa __devinitdata;
  * 0 if memnodmap[] too small (of shift too small)
  * -1 if node overlap or lost ram (shift too big)
  */
-static int __devinit
-populate_memnodemap(const struct node *nodes, int numnodes, int shift)
-{
-       int i; 
-       int res = -1;
-       paddr_t addr, end;
-
-       if (shift >= 64)
-               return -1;
-       memset(memnodemap, 0xff, sizeof(memnodemap));
+static int __init populate_memnodemap(const struct node *nodes,
+                                      int numnodes, int shift, int *nodeids)
+{
+       unsigned long spdx, epdx;
+       int i, res = -1;
+
+       memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap));
        for (i = 0; i < numnodes; i++) {
-               addr = nodes[i].start;
-               end = nodes[i].end;
-               if (addr >= end)
-                       continue;
-               if ((end >> shift) >= NODEMAPSIZE)
+               spdx = paddr_to_pdx(nodes[i].start);
+               epdx = paddr_to_pdx(nodes[i].end);
+               if (spdx >= epdx)
+                       continue;
+               if ((epdx >> shift) >= NODEMAPSIZE)
                        return 0;
                do {
-                       if (memnodemap[addr >> shift] != 0xff)
+                       if (memnodemap[spdx >> shift] != NUMA_NO_NODE)
                                return -1;
-                       memnodemap[addr >> shift] = i;
-                       addr += (1ULL << shift);
-               } while (addr < end);
+
+                       if (!nodeids)
+                               memnodemap[spdx >> shift] = i;
+                       else
+                               memnodemap[spdx >> shift] = nodeids[i];
+
+                       spdx += (1UL << shift);
+               } while (spdx < epdx);
                res = 1;
-       } 
+       }
        return res;
 }
 
-int __init compute_hash_shift(struct node *nodes, int numnodes)
-{
-       int shift = 20;
-
-       while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
-               shift++;
-
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init extract_lsb_from_nodes(const struct node *nodes,
+                                        int numnodes)
+{
+       int i, nodes_used = 0;
+       unsigned long spdx, epdx;
+       unsigned long bitfield = 0, memtop = 0;
+
+       for (i = 0; i < numnodes; i++) {
+               spdx = paddr_to_pdx(nodes[i].start);
+               epdx = paddr_to_pdx(nodes[i].end);
+               if (spdx >= epdx)
+                       continue;
+               bitfield |= spdx;
+               nodes_used++;
+               if (epdx > memtop)
+                       memtop = epdx;
+       }
+       if (nodes_used <= 1)
+               i = 63;
+       else
+               i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+       return i;
+}
+
+int __init compute_hash_shift(struct node *nodes, int numnodes,
+                             int *nodeids)
+{
+       int shift;
+
+       shift = extract_lsb_from_nodes(nodes, numnodes);
        printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
                shift);
 
-       if (populate_memnodemap(nodes, numnodes, shift) != 1) {
-               printk(KERN_INFO
-       "Your memory is not aligned you need to rebuild your kernel "
-       "with a bigger NODEMAPSIZE shift=%d\n",
-                       shift);
+       if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
+               printk(KERN_INFO "Your memory is not aligned you need to "
+                      "rebuild your kernel with a bigger NODEMAPSIZE "
+                      "shift=%d\n", shift);
                return -1;
        }
        return shift;
 }
-
 /* initialize NODE_DATA given nodeid and start/end */
 void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
 { 
@@ -167,7 +195,7 @@ static int numa_emulation(u64 start_pfn,
                       (nodes[i].end - nodes[i].start) >> 20);
                node_set_online(i);
        }
-       memnode_shift = compute_hash_shift(nodes, numa_fake);
+       memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
        if (memnode_shift < 0) {
                memnode_shift = 0;
                printk(KERN_ERR "No NUMA hash function found. Emulation 
disabled.\n");
diff -r 74823a302da5 xen/arch/x86/srat.c
--- a/xen/arch/x86/srat.c       Tue Dec 08 01:23:33 2009 +0800
+++ b/xen/arch/x86/srat.c       Tue Dec 08 01:31:45 2009 +0800
@@ -27,6 +27,11 @@ static struct node nodes[MAX_NUMNODES] _
 static struct node nodes[MAX_NUMNODES] __initdata;
 static u8 __read_mostly pxm2node[256] = { [0 ... 255] = 0xff };
 
+
+static int num_node_memblks;
+static struct node node_memblk_range[NR_NODE_MEMBLKS];
+static int memblk_nodeid[NR_NODE_MEMBLKS];
+
 /* Too small nodes confuse the VM badly. Usually they result
    from BIOS bugs. */
 #define NODE_MIN_SIZE (4*1024*1024)
@@ -54,17 +59,33 @@ __devinit int setup_node(int pxm)
        return pxm2node[pxm];
 }
 
-static __init int conflicting_nodes(u64 start, u64 end)
-{
-       int i;
-       for_each_node_mask(i, nodes_parsed) {
-               struct node *nd = &nodes[i];
+int valid_numa_range(unsigned long start, unsigned long end, int node)
+{
+       int i;
+
+       for (i = 0; i < num_node_memblks; i++) {
+               struct node *nd = &node_memblk_range[i];
+
+               if (nd->start <= start && nd->end > end &&
+                       memblk_nodeid[i] == node )
+                       return 1;
+       }
+
+       return 0;
+}
+
+static __init int conflicting_memblks(unsigned long start, unsigned long end)
+{
+       int i;
+
+       for (i = 0; i < num_node_memblks; i++) {
+               struct node *nd = &node_memblk_range[i];
                if (nd->start == nd->end)
                        continue;
                if (nd->end > start && nd->start < end)
-                       return i;
+                       return memblk_nodeid[i];
                if (nd->end == end && nd->start == start)
-                       return i;
+                       return memblk_nodeid[i];
        }
        return -1;
 }
@@ -174,6 +195,15 @@ acpi_numa_memory_affinity_init(struct ac
        }
        if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
                return;
+
+       if (num_node_memblks >= NR_NODE_MEMBLKS)
+       {
+               dprintk(XENLOG_WARNING,
+                "Too many numa entry, try bigger NR_NODE_MEMBLKS \n");
+               bad_srat();
+               return;
+       }
+
        start = ma->base_address;
        end = start + ma->length;
        pxm = ma->proximity_domain;
@@ -187,9 +217,15 @@ acpi_numa_memory_affinity_init(struct ac
        }
        /* It is fine to add this area to the nodes data it will be used later*/
        if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
+       {
                printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" - 
%"PRIx64" \n",
                                start, end);
-       i = conflicting_nodes(start, end);
+#ifdef CONFIG_X86_64
+               mem_hotplug = 1;
+#endif
+       }
+
+       i = conflicting_memblks(start, end);
        if (i == node) {
                printk(KERN_WARNING
                "SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with 
itself (%"
@@ -213,7 +249,12 @@ acpi_numa_memory_affinity_init(struct ac
                        nd->end = end;
        }
        printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node, 
pxm,
-              nd->start, nd->end);
+              start, end);
+
+       node_memblk_range[num_node_memblks].start = start;
+       node_memblk_range[num_node_memblks].end = end;
+       memblk_nodeid[num_node_memblks] = node;
+       num_node_memblks++;
 }
 
 /* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -258,16 +299,6 @@ static int nodes_cover_memory(void)
        return 1;
 }
 
-static void unparse_node(int node)
-{
-       int i;
-       node_clear(node, nodes_parsed);
-       for (i = 0; i < MAX_LOCAL_APIC; i++) {
-               if (apicid_to_node[i] == node)
-                       apicid_to_node[i] = NUMA_NO_NODE;
-       }
-}
-
 void __init acpi_numa_arch_fixup(void) {}
 
 #ifdef __x86_64__
@@ -340,11 +371,8 @@ int __init acpi_scan_nodes(u64 start, u6
        int i;
 
        /* First clean up the node list */
-       for (i = 0; i < MAX_NUMNODES; i++) {
+       for (i = 0; i < MAX_NUMNODES; i++)
                cutoff_node(i, start, end);
-               if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
-                       unparse_node(i);
-       }
 
        if (acpi_numa <= 0)
                return -1;
@@ -354,7 +382,9 @@ int __init acpi_scan_nodes(u64 start, u6
                return -1;
        }
 
-       memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
+       memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
+                               memblk_nodeid);
+
        if (memnode_shift < 0) {
                printk(KERN_ERR
                     "SRAT: No NUMA node hash function found. Contact 
maintainer\n");
@@ -364,7 +394,11 @@ int __init acpi_scan_nodes(u64 start, u6
 
        /* Finally register nodes */
        for_each_node_mask(i, nodes_parsed)
+       {
+               if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
+                       continue;
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+       }
        for (i = 0; i < NR_CPUS; i++) { 
                if (cpu_to_node[i] == NUMA_NO_NODE)
                        continue;
diff -r 74823a302da5 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Tue Dec 08 01:23:33 2009 +0800
+++ b/xen/include/asm-x86/mm.h  Tue Dec 08 01:23:34 2009 +0800
@@ -368,6 +368,7 @@ int check_descriptor(const struct domain
 int check_descriptor(const struct domain *, struct desc_struct *d);
 
 extern int opt_allow_hugepage;
+extern int mem_hotplug;
 
 /******************************************************************************
  * With shadow pagetables, the different kinds of address start 
diff -r 74823a302da5 xen/include/asm-x86/numa.h
--- a/xen/include/asm-x86/numa.h        Tue Dec 08 01:23:33 2009 +0800
+++ b/xen/include/asm-x86/numa.h        Tue Dec 08 01:23:34 2009 +0800
@@ -19,7 +19,8 @@ struct node {
        u64 start,end; 
 };
 
-extern int compute_hash_shift(struct node *nodes, int numnodes);
+extern int __init compute_hash_shift(struct node *nodes, int numnodes,
+                             int *nodeids);
 extern int pxm_to_node(int nid);
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
@@ -48,7 +49,7 @@ static inline void clear_node_cpumask(in
        cpu_clear(cpu, node_to_cpumask[cpu_to_node(cpu)]);
 }
 
-/* Simple perfect hash to map physical addresses to node numbers */
+/* Simple perfect hash to map pdx to node numbers */
 extern int memnode_shift; 
 extern u8  memnodemap[NODEMAPSIZE]; 
 
@@ -62,9 +63,9 @@ extern struct node_data node_data[];
 
 static inline __attribute__((pure)) int phys_to_nid(paddr_t addr) 
 { 
-       unsigned nid; 
-       VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
-       nid = memnodemap[addr >> memnode_shift]; 
+       unsigned nid;
+       VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= NODEMAPSIZE);
+       nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; 
        VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 
        return nid; 
 } 
@@ -75,10 +76,11 @@ static inline __attribute__((pure)) int 
 #define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
                                 NODE_DATA(nid)->node_spanned_pages)
 
-
+extern int valid_numa_range(unsigned long start, unsigned long end, int node);
 #else
 #define init_cpu_to_node() do {} while (0)
 #define clear_node_cpumask(cpu) do {} while (0)
+#define valid_numa_range(start, end, node) {return 1;}
 #endif
 
 void srat_parse_regions(u64 addr);
diff -r 74823a302da5 xen/include/asm-x86/page.h
--- a/xen/include/asm-x86/page.h        Tue Dec 08 01:23:33 2009 +0800
+++ b/xen/include/asm-x86/page.h        Tue Dec 08 01:23:34 2009 +0800
@@ -257,6 +257,7 @@ void copy_page_sse2(void *, const void *
 #define page_to_virt(pg)    __page_to_virt(pg)
 #define pfn_to_paddr(pfn)   __pfn_to_paddr(pfn)
 #define paddr_to_pfn(pa)    __paddr_to_pfn(pa)
+#define paddr_to_pdx(pa)    pfn_to_pdx(paddr_to_pfn(pa))
 
 #endif /* !defined(__ASSEMBLY__) */
 

Attachment: numa_node.patch
Description: numa_node.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-devel] [PATCH 2/2] Add support to hotplug entry in SRAT table, Jiang, Yunhong <=