[PATCH v3 1/7] xen/numa: Add per_node() variables paralleling per_cpu() variables
During the review of the 3rd commit of the NUMA claims v1 series, it was
found to be concerning (performance-wise) to add another array like this
that is randomly written from all nodes:

+/* Per-node counts of free pages */
+static unsigned long pernode_avail_pages[MAX_NUMNODES];

As a solution, it was suggested to introduce per_node(), paralleling
per_cpu(), or (less desirably) to make sure one particular cache line
would only ever be written from a single node.

It was mentioned that node_need_scrub[] could/should use it, and I assume
others may benefit too.

per_cpu() is a simple, standard blueprint that is easy to copy, so add
per_node(), paralleling per_cpu(), as the preferred suggestion. It is
entirely derived from per_cpu(), with a few differences:

- No add/remove callback: Nodes are onlined on boot and never offlined.

- As per_node(avail_pages) and per_node(outstanding_claims) are used by the
  buddy allocator itself, while the buddy allocator is used to allocate the
  per_node() memory from the local NUMA node, there is a catch:
  per_node() must already be working to have a working buddy allocator.

  - Init per_node() before the buddy allocator is ready, as it needs to be
    set up before its first use, e.g. to init per_node(avail_pages):
    Use an early static __initdata array during early boot and migrate it
    to the NUMA-node-local xenheap before we enable the secondary CPUs.

Cc: Jan Beulich <jbeulich@xxxxxxxx>
Signed-off-by: Bernhard Kaindl <bernhard.kaindl@xxxxxxxxx>
---
Changes:
- This patch is new in v3 to resolve the suggestion from the review.
- The previous patch #2 is removed from the series as not required, which
  is best visualized by how claims are used:
  - Claim needed memory
  - Allocate all domain memory
  - Cancel a possible leftover claim
  - Finish building the domain and unpause it.
  As it makes no sense to repeat "Claim needed memory" at any point, the
  change it made had no practical significance. It can be applied later as
  a tiny, unimportant cleanup, e.g. with multi-node claims.

Implementation note on this patch (not needed for the commit message):
Instead of the __initdata array, I tried to allocate bootmem, but that
caused paging_init() to panic with not enough memory for the p2m on a very
large 4-socket, 480-pCPU, 4 TiB RAM host (or it caused boot to hang after
the microcode updates of the 480 pCPUs). The static __initdata array is
freed after init and does not affect bootmem allocation.

PPS: Yes, node_need_scrub[] should use it too; do it after this series
(a rough conversion sketch follows below the diff).
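Illustration only (not part of this patch): with the macros added below, a
hypothetical per-node counter could be declared and used roughly as follows;
the variable and helper names here are made up for the sketch:

    /* In a header: make a per-node variable visible to other files */
    DECLARE_PER_NODE(unsigned long, free_count);

    /* In one .c file: the definition is placed in .bss.pernode */
    DEFINE_PER_NODE(unsigned long, free_count);

    /* Each node only ever writes its own NUMA-node-local copy ... */
    static void add_free_pages(nodeid_t node, unsigned long pages)
    {
        per_node(free_count, node) += pages;
    }

    /* ... while a reader may still sum up the copies of all nodes. */
    static unsigned long total_free_pages(void)
    {
        unsigned long total = 0;
        unsigned int node;

        for_each_online_node(node)
            total += per_node(free_count, node);

        return total;
    }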
---
 xen/arch/arm/xen.lds.S    |  1 +
 xen/arch/ppc/xen.lds.S    |  1 +
 xen/arch/riscv/xen.lds.S  |  1 +
 xen/arch/x86/xen.lds.S    |  1 +
 xen/common/numa.c         | 53 ++++++++++++++++++++++++++++++++++++++-
 xen/include/xen/numa.h    | 15 +++++++++++
 xen/include/xen/xen.lds.h |  8 ++++++
 7 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/xen/arch/arm/xen.lds.S b/xen/arch/arm/xen.lds.S
index db17ff1efa..d296a95dd3 100644
--- a/xen/arch/arm/xen.lds.S
+++ b/xen/arch/arm/xen.lds.S
@@ -176,6 +176,7 @@ SECTIONS
         *(.bss.stack_aligned)
         *(.bss.page_aligned)
         PERCPU_BSS
+        PERNODE_BSS
         *(.bss .bss.*)
         . = ALIGN(POINTER_ALIGN);
         __bss_end = .;
diff --git a/xen/arch/ppc/xen.lds.S b/xen/arch/ppc/xen.lds.S
index 1de0b77fc6..29d1b5da58 100644
--- a/xen/arch/ppc/xen.lds.S
+++ b/xen/arch/ppc/xen.lds.S
@@ -151,6 +151,7 @@ SECTIONS
         *(.bss.stack_aligned)
         *(.bss.page_aligned)
         PERCPU_BSS
+        PERNODE_BSS
         *(.bss .bss.*)
         . = ALIGN(POINTER_ALIGN);
         __bss_end = .;
diff --git a/xen/arch/riscv/xen.lds.S b/xen/arch/riscv/xen.lds.S
index edcadff90b..e154427353 100644
--- a/xen/arch/riscv/xen.lds.S
+++ b/xen/arch/riscv/xen.lds.S
@@ -146,6 +146,7 @@ SECTIONS
         *(.bss.stack_aligned)
         *(.bss.page_aligned)
         PERCPU_BSS
+        PERNODE_BSS
         *(.sbss .sbss.* .bss .bss.*)
         . = ALIGN(POINTER_ALIGN);
         __bss_end = .;
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index 966e514f20..95040cd516 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -327,6 +327,7 @@ SECTIONS
        __bss_start = .;
        *(.bss.page_aligned*)
        PERCPU_BSS
+       PERNODE_BSS
        *(.bss .bss.*)
        . = ALIGN(POINTER_ALIGN);
        __bss_end = .;
diff --git a/xen/common/numa.c b/xen/common/numa.c
index ad75955a16..5e66471159 100644
--- a/xen/common/numa.c
+++ b/xen/common/numa.c
@@ -320,6 +320,51 @@ static bool __init nodes_cover_memory(void)
     return true;
 }
 
+/* Defined on the BSS in xen.lds.S, used for area sizes and relative offsets */
+extern const char __pernode_start[];
+extern const char __pernode_end[];
+
+unsigned long __read_mostly __pernode_offset[MAX_NUMNODES];
+
+#define EARLY_PERNODE_AREA_SIZE (SMP_CACHE_BYTES)
+
+static char early_pernode_area[MAX_NUMNODES][EARLY_PERNODE_AREA_SIZE]
+    __initdata __cacheline_aligned;
+
+/* per_node() needs to be ready before the first alloc call using the heap */
+static void __init early_init_pernode_areas(void)
+{
+    unsigned int node;
+
+    if (__pernode_end - __pernode_start > EARLY_PERNODE_AREA_SIZE)
+        panic("per_node() area too small, increase EARLY_PERNODE_AREA_SIZE");
+
+    for_each_online_node(node)
+        __pernode_offset[node] = early_pernode_area[node] - __pernode_start;
+}
+
+/* Before going SMP, migrate the per_node memory areas to their NUMA nodes */
+static int __init init_pernode_areas(void)
+{
+    const int pernode_size = __pernode_end - __pernode_start; /* size in BSS */
+    unsigned int node;
+
+    for_each_online_node(node)
+    {
+        char *p = alloc_xenheap_pages(get_order_from_bytes(pernode_size),
+                                      MEMF_node(node));
+
+        if ( !p )
+            return -ENOMEM;
+        /* migrate the pernode data from the bootmem area to the xenheap */
+        memcpy(p, early_pernode_area[node], SMP_CACHE_BYTES);
+        __pernode_offset[node] = p - __pernode_start;
+    }
+    return 0;
+}
+
+presmp_initcall(init_pernode_areas);
+
 /* Use discovered information to actually set up the nodes. */
 static bool __init numa_process_nodes(paddr_t start, paddr_t end)
 {
@@ -617,7 +662,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 }
 #endif
 
-void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+static void __init init_nodes(unsigned long start_pfn, unsigned long end_pfn)
 {
     unsigned int i;
     paddr_t start = pfn_to_paddr(start_pfn);
@@ -656,6 +701,12 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
     setup_node_bootmem(0, start, end);
 }
 
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+    init_nodes(start_pfn, end_pfn);
+    early_init_pernode_areas(); /* With all nodes registered, init per_node() */
+}
+
 void numa_add_cpu(unsigned int cpu)
 {
     cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
index f6c1f27ca1..729c400d64 100644
--- a/xen/include/xen/numa.h
+++ b/xen/include/xen/numa.h
@@ -152,4 +152,19 @@ static inline nodeid_t mfn_to_nid(mfn_t mfn)
 
 #define page_to_nid(pg) mfn_to_nid(page_to_mfn(pg))
 
+/* Per NUMA node data area handling based on per-cpu data area handling. */
+extern unsigned long __pernode_offset[];
+
+#define DECLARE_PER_NODE(type, name) \
+    extern __typeof__(type) pernode__ ## name
+
+#define __DEFINE_PER_NODE(attr, type, name) \
+    attr __typeof__(type) pernode_ ## name
+
+#define DEFINE_PER_NODE(type, name) \
+    __DEFINE_PER_NODE(__section(".bss.pernode"), type, _ ## name)
+
+#define per_node(var, node) \
+    (*RELOC_HIDE(&pernode__##var, __pernode_offset[node]))
+
 #endif /* _XEN_NUMA_H */
diff --git a/xen/include/xen/xen.lds.h b/xen/include/xen/xen.lds.h
index b126dfe887..a32423dcec 100644
--- a/xen/include/xen/xen.lds.h
+++ b/xen/include/xen/xen.lds.h
@@ -174,6 +174,14 @@
 #define LOCK_PROFILE_DATA
 #endif
 
+/* Per-node BSS for declaring per_node vars, based on per_cpu, but simpler */
+#define PERNODE_BSS                       \
+    . = ALIGN(PAGE_SIZE);                 \
+    __pernode_start = .;                  \
+    *(.bss.pernode)                       \
+    . = ALIGN(SMP_CACHE_BYTES);           \
+    __pernode_end = .;                    \
+
 #define PERCPU_BSS                        \
     . = ALIGN(PAGE_SIZE);                 \
     __per_cpu_start = .;                  \
-- 
2.43.0
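Not part of the patch, only a rough sketch of the node_need_scrub[]
conversion mentioned in the PPS above, assuming the accounting sites in
page_alloc.c keep their current unsigned long page counts (the exact shape
is left to a follow-up):

    /* Per-node scrub backlog, each counter in its node's per_node() area */
    static DEFINE_PER_NODE(unsigned long, need_scrub);

    static void node_scrub_account(nodeid_t node, unsigned long pages)
    {
        /* Would replace node_need_scrub[node] updates: node-local writes only */
        per_node(need_scrub, node) += pages;
    }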