diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 9539998..c47b9fa 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -32,6 +32,7 @@
 } while (0)
 #elif defined(__x86_64__)
 #define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
+#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0)
 #endif
 #else
 #if defined(__i386__)
@@ -43,6 +44,7 @@
 } while (0)
 #elif defined(__x86_64__)
 #define set_xen_guest_handle(hnd, val) do { (hnd) = val; } while (0)
+#define get_xen_guest_handle(val, hnd) do { val = (hnd); } while (0)
 #endif
 #endif
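
get_xen_guest_handle() is the read-side twin of set_xen_guest_handle(), and it has to mirror whichever flavour is active: in the strict-ABI flavour the handle is a wrapper struct with a .p member, in the other flavour the handle is the bare pointer itself, so there is no .p to read. A round-trip sketch, assuming the xenmem_numa_op types introduced later in this patch are visible here; the checking function itself is hypothetical and only illustrates the symmetry:

    #include <linux/kernel.h>
    #include <xen/interface/memory.h>

    static void __init xen_handle_roundtrip_check(void)    /* hypothetical */
    {
        struct xenmem_numa_op memop;
        struct xen_domain_numa_layout *layout = &HYPERVISOR_pv_numa_layout;
        struct xen_domain_numa_layout *check = NULL;

        /* store the kernel pointer in the handle (wrapped or bare)... */
        set_xen_guest_handle(memop.u.dinfo.buf, layout);
        /* ...and read it back with the new accessor */
        get_xen_guest_handle(check, memop.u.dinfo.buf);
        BUG_ON(check != layout);
    }
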
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index df20f15..3e3f546 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -942,7 +942,8 @@ void __init setup_arch(char **cmdline_p)
 	/*
 	 * Parse SRAT to discover nodes.
 	 */
-	acpi_numa_init();
+	if (acpi_numa >= 0)
+		acpi_numa_init();
 #endif
 
 	initmem_init(0, max_pfn);
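
The guard keys off acpi_numa because that is the switch the SRAT code itself consults: it defaults to 0 and only goes negative when SRAT handling is explicitly disabled (numa=off, a bad table, or the Xen path added below). Abridged from arch/x86/mm/srat_64.c of this kernel generation, quoted from memory and therefore approximate:

    int acpi_numa __initdata;           /* 0 = undecided, <0 = SRAT disabled */

    static __init int srat_disabled(void)
    {
        return acpi_numa < 0;
    }
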
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913b..3a856dc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -11,7 +11,11 @@
 #include <linux/ctype.h>
 #include <linux/module.h>
 #include <linux/nodemask.h>
+#include <linux/cpumask.h>
 #include <linux/sched.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <xen/features.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
@@ -19,6 +23,8 @@
 #include <asm/numa.h>
 #include <asm/acpi.h>
 #include <asm/k8.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -428,7 +434,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
 	 */
 	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 		long n = simple_strtol(cmdline, NULL, 0);
-
 		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
 		if (num_nodes < 0)
 			return num_nodes;
@@ -522,6 +527,246 @@ out:
 	numa_init_array();
 	return 0;
 }
+
+/************************************************************************/
+#ifdef CONFIG_XEN_NUMA_GUEST
+/* XEN PV GUEST NUMA */
+struct xen_domain_numa_layout HYPERVISOR_pv_numa_layout;
+
+static inline void __init
+bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits)
+{
+	/* We may need to pad the final longword with zeroes. */
+	if (nbits & (BITS_PER_LONG-1))
+		lp[BITS_TO_LONGS(nbits)-1] = 0;
+	memcpy(lp, bp, (nbits+7)/8);
+}
+
+static void __init
+xenctl_cpumask_to_cpumask(cpumask_t *cpumask, struct xenctl_cpumask *xcpumask)
+{
+	uint8_t *bytemap = xcpumask->bits;
+	unsigned int nr_cpus = min_t(unsigned int, XENCTL_NR_CPUS, NR_CPUS);
+
+	cpumask_clear(cpumask);
+	bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, nr_cpus);
+}
+
+static void __init
+xen_dump_numa_layout(struct xen_domain_numa_layout *layout)
+{
+	unsigned int i, j;
+	char vcpumaskstr[128];
+
+	printk(KERN_INFO "NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
+		layout->max_vcpus, layout->max_vnodes);
+	switch (layout->type) {
+	case XEN_DOM_NUMA_CONFINED:
+		printk("type(CONFINED)\n");
+		break;
+	case XEN_DOM_NUMA_SPLIT:
+		printk("type(SPLIT)\n");
+		break;
+	case XEN_DOM_NUMA_STRIPED:
+		printk("type(STRIPED)\n");
+		break;
+	default:
+		printk("type(UNDEFINED)\n");
+	}
+
+	for (i = 0; i < layout->max_vnodes; i++) {
+		cpumask_t vcpu_mask;
+		struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+		struct xenctl_cpumask *xvcpumask = &vnode_data->vcpu_mask;
+
+		xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpumask);
+		cpumask_scnprintf(vcpumaskstr, sizeof(vcpumaskstr), &vcpu_mask);
+		printk(KERN_INFO "vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",
+			vnode_data->vnode_id, vnode_data->mnode_id,
+			(unsigned long)vnode_data->nr_pages, vcpumaskstr);
+	}
+
+	printk(KERN_INFO "vnode distances :\n");
+	for (i = 0; i < layout->max_vnodes; i++)
+		printk("\tvnode[%u]", i);
+	printk("\n");
+	for (i = 0; i < layout->max_vnodes; i++) {
+		printk("vnode[%u]", i);
+		for (j = 0; j < layout->max_vnodes; j++)
+			printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
+		printk("\n");
+	}
+}
+
+static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
+{
+	/* TODO: construct a SLIT table from layout->vnode_distance and
+	 * copy it to acpi_slit, so that node_distance() reflects the
+	 * virtual topology (a possible shape is sketched after the
+	 * Kconfig hunk below). */
+}
+
+/* Distribute the vcpus over the vnodes according to their affinity */
+static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
+{
+	int vcpu, vnode;
+
+	printk(KERN_INFO "xen_init_numa_array - cpu_to_node initialization\n");
+
+	for (vnode = 0; vnode < layout->max_vnodes; vnode++) {
+		struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
+		struct xenctl_cpumask *xvcpu_mask = &vnode_data->vcpu_mask;
+		cpumask_t vcpu_mask;
+
+		xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpu_mask);
+
+		for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++) {
+			if (!cpu_isset(vcpu, vcpu_mask))
+				continue;
+			if (early_cpu_to_node(vcpu) != NUMA_NO_NODE) {
+				printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
+					vcpu, early_cpu_to_node(vcpu));
+				continue;
+			}
+			printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
+			numa_set_node(vcpu, vnode);
+		}
+	}
+}
+
+static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
+				unsigned long start_pfn, unsigned long last_pfn)
+{
+	int num_vnodes, i;
+	u64 node_start_addr, node_end_addr, max_addr;
+
+	printk(KERN_INFO "xen_numa_emulation : max_vnodes(%u), max_vcpus(%u)\n",
+		layout->max_vnodes, layout->max_vcpus);
+
+	if (layout->type != XEN_DOM_NUMA_SPLIT) {
+		printk(KERN_ERR "xen_numa_emulation : invalid layout type\n");
+		return -1;
+	}
+
+	memset(&nodes, 0, sizeof(nodes));
+
+	num_vnodes = layout->max_vnodes;
+	BUG_ON(num_vnodes > MAX_NUMNODES);
+
+	max_addr = last_pfn << PAGE_SHIFT;
+	node_start_addr = start_pfn << PAGE_SHIFT;
+	for (i = 0; i < num_vnodes; i++) {
+		struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+		u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
+
+		node_size &= FAKE_NODE_MIN_HASH_MASK;	/* 64MB aligned */
+
+		if (i == (num_vnodes-1))
+			node_end_addr = max_addr;
+		else
+			node_end_addr = node_start_addr + node_size;
+		/* node_start_addr is updated inside setup_node_range() */
+		if (setup_node_range(i, nodes, &node_start_addr,
+				(node_end_addr-node_start_addr), max_addr+1))
+			goto failed;
+	}
+
+	printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
+
+	memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
+	if (memnode_shift < 0) {
+		printk(KERN_ERR "No NUMA hash function found.\n");
+		goto failed;
+	}
+	/* XXX: shouldn't be needed, since we disabled acpi_numa very early. */
+	/*
+	 * We need to vacate all active ranges that may have been registered by
+	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
+	 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
+ */ + remove_all_active_ranges(); + + BUG_ON(acpi_numa >= 0); + for_each_node_mask(i, node_possible_map) { + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + nodes[i].end >> PAGE_SHIFT); + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } + xen_init_slit_table(layout); + xen_init_numa_array(layout); + return 0; +failed: + return -1; +} + +static int __init +xen_get_domain_numa_layout(struct xen_domain_numa_layout *pv_layout) +{ + int rc; + struct xenmem_numa_op memop; + memop.cmd = XENMEM_get_domain_numa_layout; + memop.u.dinfo.domid = DOMID_SELF; + memop.u.dinfo.version = XEN_DOM_NUMA_INTERFACE_VERSION; + memop.u.dinfo.bufsize = sizeof(*pv_layout); + set_xen_guest_handle(memop.u.dinfo.buf, pv_layout); + + if ((rc = HYPERVISOR_memory_op(XENMEM_numa_op, &memop))) + { + printk(KERN_INFO "XEN NUMA GUEST:xen_get_domain_numa_layout failed\n"); + xen_start_info->flags &= ~SIF_NUMA_DOMAIN; + goto done; + } + + if (pv_layout->version != XEN_DOM_NUMA_INTERFACE_VERSION) + { + printk(KERN_INFO "XEN NUMA GUEST: version mismatch (disabling)\n"); + xen_start_info->flags &= ~SIF_NUMA_DOMAIN; + rc = -1; + } + xen_dump_numa_layout(pv_layout); +done: + return rc; +} + +static int __init +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn) +{ + int rc = 0; + if (!xen_pv_domain() || !(xen_start_info->flags & SIF_NUMA_DOMAIN)) + { + rc = -1; + printk(KERN_INFO "xen numa emulation disabled\n"); + goto done; + } + if ((rc = xen_get_domain_numa_layout(&HYPERVISOR_pv_numa_layout))) + goto done; + rc = xen_numa_emulation(&HYPERVISOR_pv_numa_layout, start_pfn, last_pfn); +done: + return rc; +} +#else +static inline int __init +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn) +{ + return -1; +} +#endif /* CONFIG_XEN_NUMA_GUEST */ +/************************************************************************/ #endif /* CONFIG_NUMA_EMU */ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) @@ -532,6 +777,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU + if (!xen_pv_numa(start_pfn, last_pfn)) + return; + if (cmdline && !numa_emulation(start_pfn, last_pfn)) return; nodes_clear(node_possible_map); diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 7675f9b..7cf6a4f 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -73,3 +73,12 @@ config XEN_PCI_PASSTHROUGH help Enable support for passing PCI devices through to unprivileged domains. (COMPLETELY UNTESTED) + +config XEN_NUMA_GUEST + bool "Enable support NUMA aware Xen domains" + depends on XEN && X86_64 && NUMA && NUMA_EMU + help + Enable support for NUMA aware Xen domains. With this + option, if the memory for the domain is allocated + from different memory nodes, the domain is made aware + of this through a Virtual NUMA enlightenment. 
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index df3e84c..2ee9f0b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -285,6 +285,8 @@ void __init xen_arch_setup(void)
 		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
 		disable_acpi();
 	}
+	acpi_numa = -1;
+	numa_off = 1;
 #endif
 
 	/*
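
Clearing both knobs matters because initmem_init() has two native discovery fallbacks that run after the emulation paths, and numa_off is the switch those checks consult. Abridged from arch/x86/mm/numa_64.c of this generation (quoted from memory, so approximate):

    #ifdef CONFIG_ACPI_NUMA
        /* skipped when numa_off is set, so a PV guest never scans SRAT */
        if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                          last_pfn << PAGE_SHIFT))
            return;
    #endif
    #ifdef CONFIG_K8_NUMA
        if (!numa_off && !k8_scan_nodes(start_pfn << PAGE_SHIFT,
                        last_pfn << PAGE_SHIFT))
            return;
    #endif
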
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index 32ab005..7f5b85e 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -240,6 +240,102 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
  */
 #define XENMEM_machine_memory_map	10
 
+/* xen guest numa operations */
+#define XENMEM_numa_op		15
+
+#define XEN_DOM_NUMA_INTERFACE_VERSION	0x00000001
+#define XENCTL_NR_CPUS		64
+#define XENCTL_BITS_PER_BYTE	8
+#define XENCTL_BITS_TO_BYTES(bits) \
+	(((bits)+XENCTL_BITS_PER_BYTE-1)/XENCTL_BITS_PER_BYTE)
+
+#define XENCTL_DECLARE_BITMAP(name, bits) \
+	uint8_t name[XENCTL_BITS_TO_BYTES(bits)]
+struct xenctl_cpumask { XENCTL_DECLARE_BITMAP(bits, XENCTL_NR_CPUS); };
+
+/* Not used in the guest */
+#define XENMEM_machine_numa_layout	0x01
+struct xenmem_node_data {
+	uint32_t node_id;
+	uint64_t node_memsize;
+	uint64_t node_memfree;
+	struct xenctl_cpumask cpu_mask;	/* node_to_cpumask */
+};
+
+/* NUMA layout of the machine.
+ * The structure has to fit within a page. */
+struct xenmem_machine_numa_layout {
+	uint32_t max_nodes;
+	/* Only (max_nodes*max_nodes) entries are filled */
+	GUEST_HANDLE(uint32_t) node_distance;
+	/* max_nodes entries of xenmem_node_data type */
+	GUEST_HANDLE(void) node_data;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_numa_layout);
+
+#define XENMEM_machine_nodemap	0x02
+struct xenmem_machine_nodemap {
+	/* On call, the size of the available buffer */
+	uint32_t bufsize;
+
+	/* memnode map parameters */
+	int32_t shift;
+	uint32_t mapsize;
+	GUEST_HANDLE(void) map;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_nodemap);
+
+/* NUMA layout of the domain at the time of startup.
+ * The structure has to fit within a page. */
+#define XENMEM_set_domain_numa_layout	0x03
+#define XENMEM_get_domain_numa_layout	0x04
+
+#define XEN_MAX_VNODES	8
+
+struct xen_vnode_data {
+	uint32_t vnode_id;
+	uint32_t mnode_id;
+	uint64_t nr_pages;
+	struct xenctl_cpumask vcpu_mask;	/* vnode_to_vcpumask */
+};
+
+#define XEN_DOM_NUMA_CONFINED	0x01
+#define XEN_DOM_NUMA_SPLIT	0x02
+#define XEN_DOM_NUMA_STRIPED	0x03
+struct xen_domain_numa_layout {
+	uint32_t version;
+	uint32_t type;
+
+	uint32_t max_vcpus;
+	uint32_t max_vnodes;
+
+	/* Only (max_vnodes*max_vnodes) entries are filled */
+	uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
+	/* Only (max_vnodes) entries are filled */
+	struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
+};
+
+struct xenmem_domain_numa_layout {
+	domid_t domid;
+	uint32_t version;
+
+	uint32_t bufsize;
+	GUEST_HANDLE(void) buf;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_domain_numa_layout);
+
+struct xenmem_numa_op {
+	uint32_t cmd;
+	union {
+		struct xenmem_machine_numa_layout minfo;
+		struct xenmem_machine_nodemap mnodemap;
+		struct xenmem_domain_numa_layout dinfo;
+	} u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_numa_op);
+
 /*
  * Prevent the balloon driver from changing the memory reservation
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 9ffaee0..17cb17d 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -494,6 +494,8 @@ struct dom0_vga_console_info {
 /* These flags are passed in the 'flags' field of start_info_t. */
 #define SIF_PRIVILEGED		(1<<0)	/* Is the domain privileged? */
 #define SIF_INITDOMAIN		(1<<1)	/* Is this the initial control domain? */
+#define SIF_MULTIBOOT_MOD	(1<<2)	/* Is mod_start a multiboot module? */
+#define SIF_NUMA_DOMAIN		(1<<3)	/* Is the domain NUMA-aware? */
 #define SIF_PM_MASK		(0xFF<<8)	/* reserve 1 byte for xen-pm options */
 
 typedef uint64_t cpumap_t;
@@ -504,6 +506,7 @@ typedef uint8_t xen_domain_handle_t[16];
 #define __mk_unsigned_long(x) x ## UL
 #define mk_unsigned_long(x) __mk_unsigned_long(x)
 
+DEFINE_GUEST_HANDLE(uint32_t);
 DEFINE_GUEST_HANDLE(uint64_t);
 
 #else /* __ASSEMBLY__ */
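
Both layout structures carry the comment that they have to fit within a page, but nothing in the interface enforces it. A guest-side compile-time check would make the constraint self-verifying; the function name here is hypothetical, while BUILD_BUG_ON is the standard kernel mechanism for this:

    #include <linux/kernel.h>           /* BUILD_BUG_ON */
    #include <asm/page.h>               /* PAGE_SIZE */
    #include <xen/interface/memory.h>

    static inline void xen_numa_layout_build_checks(void)
    {
        /* "has to fit within a page", made a build-time rule */
        BUILD_BUG_ON(sizeof(struct xen_domain_numa_layout) > PAGE_SIZE);
        BUILD_BUG_ON(sizeof(struct xenmem_machine_numa_layout) > PAGE_SIZE);
    }

With XEN_MAX_VNODES at 8, the domain layout is only a few hundred bytes (a 64-entry uint32_t distance matrix plus eight 24-byte vnode records), so both checks pass today; they would only bite if the constants grow.
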