Re: [Xen-devel] [PATCH] x86/NUMA: make init_node_heap() respect Xen heap limit
CC George (x86 MM maintainer)

On Thu, Aug 27, 2015 at 02:37:17AM -0600, Jan Beulich wrote:
> On NUMA systems, where we try to use node local memory for the basic
> control structures of the buddy allocator, this special case needs to
> take into consideration a possible address width limit placed on the
> Xen heap. In turn this (but also other, more abstract considerations)
> requires that xenheap_max_mfn() not be called more than once (at most
> we might permit it to be called a second time with a larger value than
> was passed the first time), and be called only before calling
> end_boot_allocator().
>
> While inspecting all the involved code, a couple of off-by-one issues
> were found (and are being corrected here at once):
> - arch_init_memory() cleared one too many page table slots
> - the highmem_start based invocation of xenheap_max_mfn() passed too
>   big a value
> - xenheap_max_mfn() calculated the wrong bit count in edge cases
>
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
>
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -369,7 +369,7 @@ void __init arch_init_memory(void)
>
>              for ( i = 0; i < l3_table_offset(split_va); ++i )
>                  l3tab[i] = l3idle[i];
> -            for ( ; i <= L3_PAGETABLE_ENTRIES; ++i )
> +            for ( ; i < L3_PAGETABLE_ENTRIES; ++i )
>                  l3tab[i] = l3e_empty();
>              split_l4e = l4e_from_pfn(virt_to_mfn(l3tab),
>                                       __PAGE_HYPERVISOR);
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -1002,7 +1002,7 @@ void __init noreturn __start_xen(unsigne
>
>      setup_max_pdx(raw_max_page);
>      if ( highmem_start )
> -        xenheap_max_mfn(PFN_DOWN(highmem_start));
> +        xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
>
>      /*
>       * Walk every RAM region and map it in its entirety (on x86/64, at least)
> @@ -1183,9 +1183,6 @@ void __init noreturn __start_xen(unsigne
>
>      numa_initmem_init(0, raw_max_page);
>
> -    end_boot_allocator();
> -    system_state = SYS_STATE_boot;
> -
>      if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
>      {
>          unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
> @@ -1194,6 +1191,8 @@ void __init noreturn __start_xen(unsigne
>          if ( !highmem_start )
>              xenheap_max_mfn(limit);
>
> +        end_boot_allocator();
> +
>          /* Pass the remaining memory to the allocator. */
>          for ( i = 0; i < boot_e820.nr_map; i++ )
>          {
> @@ -1217,6 +1216,10 @@ void __init noreturn __start_xen(unsigne
>              opt_tmem = 0;
>          }
>      }
> +    else
> +        end_boot_allocator();
> +
> +    system_state = SYS_STATE_boot;
>
>      vm_init();
>      console_init_ring();
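The setup.c change above is easiest to see with concrete numbers: highmem_start
is an exclusive end address, so the highest MFN the Xen heap may use is
PFN_DOWN(highmem_start - 1); the old call handed over the first MFN beyond the
limit. A minimal standalone sketch, not part of the patch (PAGE_SHIFT and
PFN_DOWN redefined locally, hypothetical 4GiB limit, assumes 64-bit
unsigned long):

    #include <stdio.h>

    /* Local stand-ins for the Xen definitions, for illustration only. */
    #define PAGE_SHIFT 12
    #define PFN_DOWN(x) ((unsigned long)(x) >> PAGE_SHIFT)

    int main(void)
    {
        /* Hypothetical exclusive upper bound on Xen heap addresses. */
        unsigned long highmem_start = 1UL << 32;  /* 4GiB, page aligned */

        /* Old argument: the first MFN *outside* the allowed range. */
        printf("PFN_DOWN(highmem_start)     = %#lx\n", PFN_DOWN(highmem_start));
        /* New argument: the last MFN *inside* the allowed range. */
        printf("PFN_DOWN(highmem_start - 1) = %#lx\n", PFN_DOWN(highmem_start - 1));
        /* Valid MFNs are 0 ... 0xfffff; the old code passed 0x100000,
         * one page beyond the limit. */
        return 0;
    }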
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -404,13 +404,19 @@ void get_outstanding_claims(uint64_t *fr
>      spin_unlock(&heap_lock);
>  }
>
> +static bool_t __read_mostly first_node_initialised;
> +#ifndef CONFIG_SEPARATE_XENHEAP
> +static unsigned int __read_mostly xenheap_bits;
> +#else
> +#define xenheap_bits 0
> +#endif
> +
>  static unsigned long init_node_heap(int node, unsigned long mfn,
>                                      unsigned long nr, bool_t *use_tail)
>  {
>      /* First node to be discovered has its heap metadata statically alloced. */
>      static heap_by_zone_and_order_t _heap_static;
>      static unsigned long avail_static[NR_ZONES];
> -    static int first_node_initialised;
>      unsigned long needed = (sizeof(**_heap) +
>                              sizeof(**avail) * NR_ZONES +
>                              PAGE_SIZE - 1) >> PAGE_SHIFT;
> @@ -428,14 +434,18 @@ static unsigned long init_node_heap(int
>      }
>  #ifdef DIRECTMAP_VIRT_END
>      else if ( *use_tail && nr >= needed &&
> -              (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) )
> +              (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) &&
> +              (!xenheap_bits ||
> +               !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
>      {
>          _heap[node] = mfn_to_virt(mfn + nr - needed);
>          avail[node] = mfn_to_virt(mfn + nr - 1) +
>                        PAGE_SIZE - sizeof(**avail) * NR_ZONES;
>      }
>      else if ( nr >= needed &&
> -              (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) )
> +              (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) &&
> +              (!xenheap_bits ||
> +               !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
>      {
>          _heap[node] = mfn_to_virt(mfn);
>          avail[node] = mfn_to_virt(mfn + needed - 1) +
> @@ -1543,11 +1553,13 @@ void free_xenheap_pages(void *v, unsigne
>
>  #else
>
> -static unsigned int __read_mostly xenheap_bits;
> -
>  void __init xenheap_max_mfn(unsigned long mfn)
>  {
> -    xenheap_bits = flsl(mfn) + PAGE_SHIFT;
> +    ASSERT(!first_node_initialised);
> +    ASSERT(!xenheap_bits);
> +    BUILD_BUG_ON(PADDR_BITS >= BITS_PER_LONG);
> +    xenheap_bits = min(flsl(mfn + 1) - 1 + PAGE_SHIFT, PADDR_BITS);
> +    printk(XENLOG_INFO "Xen heap: %u bits\n", xenheap_bits);
>  }
>
>  void init_xenheap_pages(paddr_t ps, paddr_t pe)
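The reworked bit-count formula can also be checked standalone. The sketch
below, not part of the patch, uses a local flsl() stand-in (1-based index of
the most significant set bit, matching Xen's semantics) and hypothetical MFN
values: at a power-of-two boundary the old formula, fed the exclusive bound as
before, over-counts by one bit, while the new formula, fed the inclusive
bound, is exact; for non-power-of-two bounds it rounds down, the safe
direction for an upper limit.

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* Stand-in for Xen's flsl(): 1-based index of the most significant
     * set bit, 0 for an all-zero input. */
    static unsigned int flsl(unsigned long x)
    {
        return x ? (unsigned int)(8 * sizeof(x) - __builtin_clzl(x)) : 0;
    }

    int main(void)
    {
        /* Inclusive max MFN for a 4GiB limit: MFNs 0 ... 0xfffff usable. */
        unsigned long max_mfn = 0xfffff;

        /* Old code, called with the exclusive bound (max_mfn + 1):
         * 21 + 12 = 33 bits -- admits addresses up to 8GiB. */
        printf("old: %u bits\n", flsl(max_mfn + 1) + PAGE_SHIFT);

        /* New code, called with the inclusive bound: 20 + 12 = 32 bits. */
        printf("new: %u bits\n", flsl(max_mfn + 1) - 1 + PAGE_SHIFT);

        /* Non-power-of-two bound: rounds down to 28 bits, conservative. */
        printf("new (max_mfn=0x12345): %u bits\n",
               flsl(0x12345UL + 1) - 1 + PAGE_SHIFT);
        return 0;
    }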
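Likewise, the guard added to init_node_heap() reads as a small predicate: a
candidate range [mfn, mfn + nr) fits under the limit iff its last MFN, shifted
right by (xenheap_bits - PAGE_SHIFT), is zero, i.e. iff its last byte lies
below 1 << xenheap_bits. A minimal sketch with local names and hypothetical
values, again not part of the patch:

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* Mirrors the patch's check: xenheap_bits == 0 means no limit;
     * otherwise every page of [mfn, mfn + nr) must lie below
     * 1UL << xenheap_bits. */
    static bool range_within_xenheap(unsigned long mfn, unsigned long nr,
                                     unsigned int xenheap_bits)
    {
        return !xenheap_bits ||
               !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT));
    }

    int main(void)
    {
        unsigned int bits = 32;  /* hypothetical 4GiB Xen heap limit */
        unsigned long limit = 1UL << (bits - PAGE_SHIFT); /* first MFN outside */

        printf("%d\n", range_within_xenheap(limit - 8, 8, bits)); /* 1: fits */
        printf("%d\n", range_within_xenheap(limit - 8, 9, bits)); /* 0: one too far */
        printf("%d\n", range_within_xenheap(limit, 8, 0));        /* 1: no limit */
        return 0;
    }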
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel