Xen project Mailing List

Re: [Xen-devel] [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest

To: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>

From: Elena Ufimtseva <ufimtseva@xxxxxxxxx>

Date: Wed, 28 Aug 2013 09:41:27 -0400

Cc: "xen-devel@xxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxx>

Delivery-date: Wed, 28 Aug 2013 13:41:55 +0000

List-id: Xen developer discussion <xen-devel.lists.xen.org>

Hi Konrad Thank you for comments, I will resend the fix in the new patch. Regarding memblocks contrustion they are done taking into account e820 map for this domain. I will place some comments in code as you mentioned. Elena On Tue, Aug 27, 2013 at 10:33 AM, Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx> wrote: > On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote: >> Uses subop hypercall to request XEN about vnuma topology. >> Sets the memory blocks (aligned by XEN), cpus, distance table >> on boot. NUMA support should be compiled in kernel. > > Needs a bit more details (which I am sure you will add in the > next postings). Mostly: > - How E820 and NUMA information mesh > - What the hypercall provides > - What changeset in the Xen provides this hypercall > - >> >> NUMA support should be compiled in kernel. >> >> Memory blocks are aligned by xen. Xen is aware of guest initial >> RAM layout. >> If the memory blocks are incorrect, call for default linux numa >> dummy init. > > Unfortunatly that won't happen if you boot this under dom0. It will > find on some AMD machines the AMD Northbridge and try to scan that. > And blow up. > > If you look at the git commit that did the 'numa = 0' you will see > the backstory. > > I think part of enablement of ' numa = 1' is to wrap it with > > if (xen_initial_domain() && xen_supports_this_hypercall()) > numa = 1; > > in the #2 patch. > >> >> Requires XEN with vNUMA support. >> >> Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git >> latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376 >> >> TODO: >> Change subop from XENMEM to SYSCTL. >> >> Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx> >> --- >> arch/x86/include/asm/xen/vnuma-xen.h | 36 +++++++++++ >> arch/x86/mm/numa.c | 8 +++ >> arch/x86/xen/enlighten.c | 115 >> ++++++++++++++++++++++++++++++++++ >> include/xen/interface/memory.h | 1 + >> 4 files changed, 160 insertions(+) >> create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h >> >> diff --git a/arch/x86/include/asm/xen/vnuma-xen.h >> b/arch/x86/include/asm/xen/vnuma-xen.h >> new file mode 100644 >> index 0000000..73c1bde >> --- /dev/null >> +++ b/arch/x86/include/asm/xen/vnuma-xen.h >> @@ -0,0 +1,36 @@ >> +#ifndef _ASM_X86_VNUMA_XEN_H >> +#define _ASM_X86_VNUMA_XEN_H >> + >> +#ifdef CONFIG_XEN >> +int __initdata xen_numa_init(void); >> + >> +struct vnuma_memblk { >> + uint64_t start, end; >> +}; >> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk); >> + >> +struct vnuma_topology_info { >> + domid_t domid; >> + uint16_t nr_nodes; >> + GUEST_HANDLE(vnuma_memblk) vmemblk; >> + GUEST_HANDLE(int) vdistance; >> + GUEST_HANDLE(int) cpu_to_node; >> + GUEST_HANDLE(int) vnode_to_pnode; > > A comment explaining what those values are meant to have would be good. > Perhaps with a simple example. > >> +}; > > The structure is not very 32-bit friendly. Would it be possible > to add some padding so that the size of this structure is the > same on 32-bit and 64-bit please? > > Perhaps: > domid_t domid; > uint16_t nr_nodes; > uint32_t _pad; > GUEST_HANDLE(vnuma_memblk) vmemblk; > GUEST_HANDLE(int) vdistance; > GUEST_HANDLE(int) cpu_to_node; > GUEST_HANDLE(int) vnode_to_pnode; > > That should make the offsets be the same on both 32 and 64-bit > I think. > >> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info); >> + >> +struct xen_domctl { >> + uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */ >> + domid_t domain; >> + union { >> + struct vnuma_topology_info vnuma; >> + } u; > > Move that 'u' one tab back please. > >> +}; >> +typedef struct xen_domctl xen_domctl_t; > > Ewww. No typdefs in the Linux code please. > >> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t); >> + >> +#else >> +int __init xen_numa_init(void) {} >> +#endif >> + >> +#endif /* _ASM_X86_VNUMA_XEN_H */ >> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c >> index 8bf93ba..3ec7855 100644 >> --- a/arch/x86/mm/numa.c >> +++ b/arch/x86/mm/numa.c >> @@ -20,6 +20,10 @@ >> >> #include "numa_internal.h" >> >> +#ifdef CONFIG_XEN >> +#include "asm/xen/vnuma-xen.h" >> +#endif >> + >> int __initdata numa_off; >> nodemask_t numa_nodes_parsed __initdata; >> >> @@ -621,6 +625,10 @@ static int __init dummy_numa_init(void) >> void __init x86_numa_init(void) >> { >> if (!numa_off) { >> +#ifdef CONFIG_XEN >> + if (xen_pv_domain() && !numa_init(xen_numa_init)) > > I would remove the xen_pv_domain() check and add that in enlighten.c code. > >> + return; >> +#endif >> #ifdef CONFIG_X86_NUMAQ >> if (!numa_init(numaq_numa_init)) >> return; >> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c >> index 193097e..4658e9d 100644 >> --- a/arch/x86/xen/enlighten.c >> +++ b/arch/x86/xen/enlighten.c >> @@ -33,6 +33,7 @@ >> #include <linux/memblock.h> >> #include <linux/edd.h> >> >> +#include <asm/numa.h> >> #include <xen/xen.h> >> #include <xen/events.h> >> #include <xen/interface/xen.h> >> @@ -69,6 +70,7 @@ >> #include <asm/mwait.h> >> #include <asm/pci_x86.h> >> #include <asm/pat.h> >> +#include <asm/xen/vnuma-xen.h> >> >> #ifdef CONFIG_ACPI >> #include <linux/acpi.h> >> @@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm >> __refconst = { >> .x2apic_available = xen_x2apic_para_available, >> }; >> EXPORT_SYMBOL(x86_hyper_xen_hvm); >> + >> +/* Xen PV NUMA topology initialization */ >> +int __initdata xen_numa_init(void) >> +{ >> + struct vnuma_memblk *vblk; >> + struct vnuma_topology_info numa_topo; >> + uint64_t size, start, end; >> + int i, j, cpu, rc; >> + u64 phys, physd, physc; >> + u64 new_end_pfn; >> + size_t mem_size; >> + >> + int *vdistance, *cpu_to_node; >> + unsigned long dist_size, cpu_to_node_size; > > You need an extra line here; > >> + numa_topo.domid = DOMID_SELF; >> + rc = -EINVAL; > > Both of those can be part of the decleration of variables above. Like so: > int rc = -EINVAL; > struct vnuma_topology_info numa_topo = { > .domid = DOMID_SELF; > }; > >> + /* block mem reservation for memblks */ >> + mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk); >> + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, >> PAGE_SIZE); >> + if (!phys) { >> + pr_info("vNUMA: can't allocate memory for membloks size >> %zu\n", mem_size); >> + rc = -ENOMEM; >> + goto errout; >> + } >> + if (memblock_reserve(phys, mem_size) < 0) { >> + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu >> \n", mem_size); >> + rc = -ENOMEM; >> + goto errout; >> + } >> + vblk = __va(phys); >> + set_xen_guest_handle(numa_topo.vmemblk, vblk); >> + >> + dist_size = num_possible_cpus() * num_possible_cpus() * >> sizeof(*numa_topo.vdistance); >> + physd = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), dist_size, >> PAGE_SIZE); >> + if (!physd) { >> + pr_info("vNUMA: can't allocate memory for distance size >> %zu\n", dist_size); >> + rc = -ENOMEM; >> + goto errout; >> + } >> + if (memblock_reserve(physd, dist_size) < 0) { >> + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu >> \n", dist_size); >> + rc = -ENOMEM; >> + goto errout; >> + } >> + vdistance = __va(physd); >> + set_xen_guest_handle(numa_topo.vdistance, (int *)vdistance); >> + >> + /* allocate memblock for vcpu to node mask of max size */ >> + cpu_to_node_size = num_possible_cpus() * >> sizeof(*numa_topo.cpu_to_node); >> + physc = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), >> cpu_to_node_size, PAGE_SIZE); >> + if (!physc) { >> + pr_info("vNUMA: can't allocate memory for distance size >> %zu\n", cpu_to_node_size); >> + rc = -ENOMEM; >> + goto errout; >> + } >> + if (memblock_reserve(physc, cpu_to_node_size) < 0) { >> + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu\n", >> cpu_to_node_size); >> + goto errout; >> + } >> + cpu_to_node = __va(physc); >> + set_xen_guest_handle(numa_topo.cpu_to_node, (int *)cpu_to_node); >> + if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0) > > I think you need to do rc = HYPERVISOR... > > and then if you get rc = -ENOSYS (so not implemented) change the rc to zero. > >> + goto errout; >> + if (numa_topo.nr_nodes == 0) >> + /* Pass to DUMMY numa */ > > Also change rc to zero. > >> + goto errout; >> + if (numa_topo.nr_nodes > num_possible_cpus()) { >> + pr_info("vNUMA: Node without cpu is not supported, nodes = >> %d\n", numa_topo.nr_nodes); >> + goto errout; >> + } >> + new_end_pfn = 0; >> + /* We have sizes in pages received from hypervisor, no holes and >> ordered */ > > From toolstack you mean. It is the one setting it up. And I hope it sets it up > based on the E820 it has constructed as well? Otherwise it would be a bit > awkward if those two were different. > >> + for (i = 0; i < numa_topo.nr_nodes; i++) { >> + start = vblk[i].start; >> + end = vblk[i].end; >> + size = end - start; >> + pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = >> %#010Lx\n", >> + i, size, start, end); > > pr_debug perhaps? > >> + if (numa_add_memblk(i, start, end)) { >> + pr_info("vNUMA: Could not add memblk[%d]\n", i); >> + rc = -EAGAIN; > > Can you unroll the NUMA topology if this fails? Or is that not a problem? > >> + goto errout; >> + } >> + node_set(i, numa_nodes_parsed); >> + } >> + setup_nr_node_ids(); >> + /* Setting the cpu, apicid to node */ >> + for_each_cpu(cpu, cpu_possible_mask) { >> + pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, >> cpu_to_node[cpu]); > > pr_debug >> + set_apicid_to_node(cpu, cpu_to_node[cpu]); >> + numa_set_node(cpu, cpu_to_node[cpu]); >> + __apicid_to_node[cpu] = cpu_to_node[cpu]; >> + cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]); >> + } >> + setup_nr_node_ids(); > > How come this is being called twice? It should have a comment > explaining this extra call. > >> + rc = 0; >> + for (i = 0; i < numa_topo.nr_nodes; i++) >> + for (j = 0; j < numa_topo.nr_nodes; j++) { >> + numa_set_distance(i, j, *(vdistance + >> j*numa_topo.nr_nodes + i)); > > You could simply this by just doing: > > int idx = (j * numa_topo.nr_nodes) + i; > > ... *(vdistance + idx)); > >> + pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance >> + j*numa_topo.nr_nodes + i)); > > pr_debug > >> + } >> +errout: >> + if (phys) >> + memblock_free(__pa(phys), mem_size); >> + if (physd) >> + memblock_free(__pa(physd), dist_size); >> + if (physc) >> + memblock_free(__pa(physc), cpu_to_node_size); >> + if (rc) >> + pr_debug("XEN: hypercall failed to get vNUMA topology.\n"); > > And include the 'rc' value in the output please. >> + return rc; >> +} >> + >> #endif >> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h >> index 2ecfe4f..16b8d87 100644 >> --- a/include/xen/interface/memory.h >> +++ b/include/xen/interface/memory.h >> @@ -263,4 +263,5 @@ struct xen_remove_from_physmap { >> }; >> DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap); >> >> +#define XENMEM_get_vnuma_info 25 >> #endif /* __XEN_PUBLIC_MEMORY_H__ */ >> -- >> 1.7.10.4 >> -- Elena _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.