Re: [Xen-devel] [PATCH] x86/NUMA: make init_node_heap() respect Xen heap limit



CC George (x86 MM maintainer)

On Thu, Aug 27, 2015 at 02:37:17AM -0600, Jan Beulich wrote:
> On NUMA systems, where we try to use node local memory for the basic
> control structures of the buddy allocator, this special case needs to
> take into consideration a possible address width limit placed on the
> Xen heap. In turn this (but also other, more abstract considerations)
> requires that xenheap_max_mfn() not be called more than once (at most
> we might permit it to be called a second time with a larger value than
> was passed the first time), and be called only before calling
> end_boot_allocator().
> 
> While inspecting all the involved code, a couple of off-by-one issues
> were found (and are being corrected here at once):
> - arch_init_memory() cleared one too many page table slots
> - the highmem_start based invocation of xenheap_max_mfn() passed too
>   big a value
> - xenheap_max_mfn() calculated the wrong bit count in edge cases
> 
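[ Side note for readers following along: the ordering constraint above
  matters because init_node_heap() consults the limit recorded by
  xenheap_max_mfn() when deciding where to place each node's heap
  metadata, and end_boot_allocator() is what triggers those placements.
  Roughly (a sketch of the contract the new ASSERTs enforce, not actual
  code):

      xenheap_max_mfn(last_mfn);  /* at most once, and ...         */
      end_boot_allocator();       /* ... strictly before this call */
]
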
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> 
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -369,7 +369,7 @@ void __init arch_init_memory(void)
>  
>                      for ( i = 0; i < l3_table_offset(split_va); ++i )
>                          l3tab[i] = l3idle[i];
> -                    for ( ; i <= L3_PAGETABLE_ENTRIES; ++i )
> +                    for ( ; i < L3_PAGETABLE_ENTRIES; ++i )
>                          l3tab[i] = l3e_empty();
>                      split_l4e = l4e_from_pfn(virt_to_mfn(l3tab),
>                                               __PAGE_HYPERVISOR);
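
This first fix is the classic inclusive-bound slip: an L3 table has
L3_PAGETABLE_ENTRIES (512) slots, so the valid indices are 0 .. 511 and
the loop must stop at i < L3_PAGETABLE_ENTRIES; with <= the old code
wrote one l3e_empty() past the end of l3tab. A minimal standalone
illustration (hypothetical array, not Xen code):

    #define NSLOTS 512
    unsigned long tab[NSLOTS];
    unsigned int i;

    for ( i = 0; i < NSLOTS; ++i )  /* touches tab[0] .. tab[511] */
        tab[i] = 0;
    /* "i <= NSLOTS" would additionally write tab[512], one slot
     * past the end of the array. */
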
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -1002,7 +1002,7 @@ void __init noreturn __start_xen(unsigne
>  
>      setup_max_pdx(raw_max_page);
>      if ( highmem_start )
> -        xenheap_max_mfn(PFN_DOWN(highmem_start));
> +        xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
>  
>      /*
>       * Walk every RAM region and map it in its entirety (on x86/64, at least)
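
Worked example for the hunk above, on the assumption (made explicit by
the page_alloc.c changes below) that xenheap_max_mfn() takes an
inclusive maximum MFN: highmem_start is an exclusive byte bound, so
with highmem_start = 4GiB = 0x100000000 and PAGE_SHIFT = 12,

    PFN_DOWN(0x100000000)     == 0x100000  /* first MFN beyond the limit */
    PFN_DOWN(0x100000000 - 1) == 0x0fffff  /* last MFN inside the limit  */

i.e. the old call passed the first out-of-range MFN rather than the
last in-range one.
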
> @@ -1183,9 +1183,6 @@ void __init noreturn __start_xen(unsigne
>  
>      numa_initmem_init(0, raw_max_page);
>  
> -    end_boot_allocator();
> -    system_state = SYS_STATE_boot;
> -
>      if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
>      {
>          unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
> @@ -1194,6 +1191,8 @@ void __init noreturn __start_xen(unsigne
>          if ( !highmem_start )
>              xenheap_max_mfn(limit);
>  
> +        end_boot_allocator();
> +
>          /* Pass the remaining memory to the allocator. */
>          for ( i = 0; i < boot_e820.nr_map; i++ )
>          {
> @@ -1217,6 +1216,10 @@ void __init noreturn __start_xen(unsigne
>             opt_tmem = 0;
>          }
>      }
> +    else
> +        end_boot_allocator();
> +
> +    system_state = SYS_STATE_boot;
>  
>      vm_init();
>      console_init_ring();
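
The reshuffling above ensures that both call sites of xenheap_max_mfn()
(the highmem_start based one earlier in __start_xen() and the
HYPERVISOR_VIRT_END derived one here) run before end_boot_allocator(),
which hands the remaining boot memory to the heap and hence, for each
newly discovered node, to init_node_heap(). Net boot-time ordering, in
sketch form:

    xenheap_max_mfn(...);          /* sets xenheap_bits (at most once)  */
    end_boot_allocator();          /* init_node_heap() now sees the cap */
    system_state = SYS_STATE_boot;
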
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -404,13 +404,19 @@ void get_outstanding_claims(uint64_t *fr
>      spin_unlock(&heap_lock);
>  }
>  
> +static bool_t __read_mostly first_node_initialised;
> +#ifndef CONFIG_SEPARATE_XENHEAP
> +static unsigned int __read_mostly xenheap_bits;
> +#else
> +#define xenheap_bits 0
> +#endif
> +
>  static unsigned long init_node_heap(int node, unsigned long mfn,
>                                      unsigned long nr, bool_t *use_tail)
>  {
>      /* First node to be discovered has its heap metadata statically alloced. */
>      static heap_by_zone_and_order_t _heap_static;
>      static unsigned long avail_static[NR_ZONES];
> -    static int first_node_initialised;
>      unsigned long needed = (sizeof(**_heap) +
>                              sizeof(**avail) * NR_ZONES +
>                              PAGE_SIZE - 1) >> PAGE_SHIFT;
> @@ -428,14 +434,18 @@ static unsigned long init_node_heap(int 
>      }
>  #ifdef DIRECTMAP_VIRT_END
>      else if ( *use_tail && nr >= needed &&
> -              (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) )
> +              (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) &&
> +              (!xenheap_bits ||
> +               !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
>      {
>          _heap[node] = mfn_to_virt(mfn + nr - needed);
>          avail[node] = mfn_to_virt(mfn + nr - 1) +
>                        PAGE_SIZE - sizeof(**avail) * NR_ZONES;
>      }
>      else if ( nr >= needed &&
> -              (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) )
> +              (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) &&
> +              (!xenheap_bits ||
> +               !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
>      {
>          _heap[node] = mfn_to_virt(mfn);
>          avail[node] = mfn_to_virt(mfn + needed - 1) +
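
Both new clauses encode the same test: the last page of the candidate
metadata range must lie below 2^xenheap_bits bytes. (And with
CONFIG_SEPARATE_XENHEAP, the "#define xenheap_bits 0" above makes them
compile away entirely.) Pulled out into a standalone helper purely for
illustration (hypothetical name, not part of the patch):

    /* True iff [mfn, mfn + nr) lies entirely below the Xen heap
     * address-width limit, or no limit is in force. */
    static bool_t mfn_range_below_xenheap_limit(unsigned long mfn,
                                                unsigned long nr)
    {
        return !xenheap_bits ||
               !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT));
    }

Shifting the last MFN right by (xenheap_bits - PAGE_SHIFT) yields zero
exactly when that frame number fits in the permitted number of bits.
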
> @@ -1543,11 +1553,13 @@ void free_xenheap_pages(void *v, unsigne
>  
>  #else
>  
> -static unsigned int __read_mostly xenheap_bits;
> -
>  void __init xenheap_max_mfn(unsigned long mfn)
>  {
> -    xenheap_bits = flsl(mfn) + PAGE_SHIFT;
> +    ASSERT(!first_node_initialised);
> +    ASSERT(!xenheap_bits);
> +    BUILD_BUG_ON(PADDR_BITS >= BITS_PER_LONG);
> +    xenheap_bits = min(flsl(mfn + 1) - 1 + PAGE_SHIFT, PADDR_BITS);
> +    printk(XENLOG_INFO "Xen heap: %u bits\n", xenheap_bits);
>  }
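
The flsl() change is easiest to see with numbers (flsl() being the
1-based find-last-set, i.e. flsl(1) == 1). Reusing the 4GiB example,
where the inclusive maximum MFN is 0xfffff:

    /* old, when fed the exclusive bound 0x100000: */
    flsl(0x100000) + PAGE_SHIFT        /* == 21 + 12 == 33, one too many */
    /* new, fed the inclusive bound 0xfffff: */
    flsl(0xfffff + 1) - 1 + PAGE_SHIFT /* == 20 + 12 == 32, 4GiB == 2^32 */

flsl(mfn + 1) - 1 is floor(log2(mfn + 1)), so xenheap_bits ends up as
the width of the largest power-of-two address range still fitting below
the limit; the min() merely clamps that to the architectural
PADDR_BITS, which the BUILD_BUG_ON guarantees is itself below the word
size.
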
>  
>  void init_xenheap_pages(paddr_t ps, paddr_t pe)
> 
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel