[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v5 1/7] x86/xpti: avoid copying L4 page table contents when possible



On 06/04/18 08:52, Juergen Gross wrote:
> For mitigation of Meltdown the current L4 page table is copied to the
> cpu local root page table each time a 64 bit pv guest is entered.
>
> Copying can be avoided in cases where the guest L4 page table hasn't
> been modified while running the hypervisor, e.g. when handling
> interrupts or any hypercall not modifying the L4 page table or %cr3.
>
> So add a per-cpu flag whether the copying should be performed and set

"flag indicating whether"

> that flag only when loading a new %cr3 or modifying the L4 page table.
> This includes synchronization of the cpu local root page table with
> other cpus, so add a special synchronization flag for that case.
>
> A simple performance check (compiling the hypervisor via "make -j 4")
> in dom0 with 4 vcpus shows a significant improvement:
>
> - real time drops from 113 seconds to 109 seconds
> - system time drops from 165 seconds to 155 seconds
>
> Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
> Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx>
> ---
> V4:
> - move setting of root_pgt_changed flag in flush_area_local() out of
>   irq disabled section (Jan Beulich)
> - move setting of root_pgt_changed in make_cr3() to _toggle_guest_pt()
>   (Jan Beulich)
> - remove most conditionals in write_ptbase() (Jan Beulich)
> - don't set root_pgt_changed in do_mmu_update() for modification of
>   the user page table (Jan Beulich)
>
> V3:
> - set flag locally only if affected L4 is active (Jan Beulich)
> - add setting flag to flush_area_mask() (Jan Beulich)
> - set flag in make_cr3() only if called for current active vcpu
>
> To be applied on top of Jan's "Meltdown band-aid overhead reduction"
> series
> ---
>  xen/arch/x86/flushtlb.c           |  3 +++
>  xen/arch/x86/mm.c                 | 38 +++++++++++++++++++++++++-------------
>  xen/arch/x86/pv/domain.c          |  2 ++
>  xen/arch/x86/smp.c                |  2 +-
>  xen/arch/x86/x86_64/asm-offsets.c |  1 +
>  xen/arch/x86/x86_64/entry.S       |  8 ++++++--
>  xen/include/asm-x86/current.h     |  8 ++++++++
>  xen/include/asm-x86/flushtlb.h    |  2 ++
>  8 files changed, 48 insertions(+), 16 deletions(-)
>
> diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
> index 8a7a76b8ff..38cedf3b22 100644
> --- a/xen/arch/x86/flushtlb.c
> +++ b/xen/arch/x86/flushtlb.c
> @@ -160,5 +160,8 @@ unsigned int flush_area_local(const void *va, unsigned 
> int flags)
>  
>      local_irq_restore(irqfl);
>  
> +    if ( flags & FLUSH_ROOT_PGTBL )
> +        get_cpu_info()->root_pgt_changed = true;
> +
>      return flags;
>  }
> diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
> index 6d39d2c8ab..fd89685486 100644
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -504,6 +504,8 @@ void make_cr3(struct vcpu *v, mfn_t mfn)
>  
>  void write_ptbase(struct vcpu *v)
>  {
> +    if ( this_cpu(root_pgt) )

I'm not sure this conditional is wise.  A call hitting here has changed
the root pgt, irrespective of whether the exit path cares.

> +        get_cpu_info()->root_pgt_changed = true;
>      write_cr3(v->arch.cr3);
>  }
>  
> @@ -3701,18 +3703,28 @@ long do_mmu_update(
>                          break;
>                      rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
>                                        cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
> -                    /*
> -                     * No need to sync if all uses of the page can be 
> accounted
> -                     * to the page lock we hold, its pinned status, and uses 
> on
> -                     * this (v)CPU.
> -                     */
> -                    if ( !rc && !cpu_has_no_xpti &&
> -                         ((page->u.inuse.type_info & PGT_count_mask) >
> -                          (1 + !!(page->u.inuse.type_info & PGT_pinned) +
> -                           (pagetable_get_pfn(curr->arch.guest_table) == 
> mfn) +
> -                           (pagetable_get_pfn(curr->arch.guest_table_user) ==
> -                            mfn))) )
> -                        sync_guest = true;
> +                    if ( !rc && !cpu_has_no_xpti )
> +                    {
> +                        bool local_in_use = false;
> +
> +                        if ( pagetable_get_pfn(curr->arch.guest_table) == 
> mfn )
> +                        {
> +                            local_in_use = true;
> +                            get_cpu_info()->root_pgt_changed = true;
> +                        }
> +
> +                        /*
> +                         * No need to sync if all uses of the page can be
> +                         * accounted to the page lock we hold, its pinned
> +                         * status, and uses on this (v)CPU.
> +                         */
> +                        if ( (page->u.inuse.type_info & PGT_count_mask) >
> +                             (1 + !!(page->u.inuse.type_info & PGT_pinned) +
> +                              
> (pagetable_get_pfn(curr->arch.guest_table_user) ==
> +                               mfn) +
> +                              local_in_use) )

These two lines can be folded together.

> +                            sync_guest = true;
> +                    }
>                      break;
>  
>                  case PGT_writable_page:
> @@ -3827,7 +3839,7 @@ long do_mmu_update(
>  
>          cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
>          if ( !cpumask_empty(mask) )
> -            flush_mask(mask, FLUSH_TLB_GLOBAL);
> +            flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);
>      }
>  
>      perfc_add(num_page_updates, i);
> diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
> index 01c62e2d45..42522a2db3 100644
> --- a/xen/arch/x86/pv/domain.c
> +++ b/xen/arch/x86/pv/domain.c
> @@ -223,6 +223,8 @@ static void _toggle_guest_pt(struct vcpu *v)
>  {
>      v->arch.flags ^= TF_kernel_mode;
>      update_cr3(v);
> +    get_cpu_info()->root_pgt_changed = true;
> +
>      /* Don't flush user global mappings from the TLB. Don't tick TLB clock. 
> */
>      asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
>  
> diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
> index 033dd05958..63e819ca38 100644
> --- a/xen/arch/x86/smp.c
> +++ b/xen/arch/x86/smp.c
> @@ -208,7 +208,7 @@ void invalidate_interrupt(struct cpu_user_regs *regs)
>      ack_APIC_irq();
>      perfc_incr(ipis);
>      if ( (flags & FLUSH_VCPU_STATE) && __sync_local_execstate() )
> -        flags &= ~(FLUSH_TLB | FLUSH_TLB_GLOBAL);
> +        flags &= ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);
>      if ( flags & ~(FLUSH_VCPU_STATE | FLUSH_ORDER_MASK) )
>          flush_area_local(flush_va, flags);
>      cpumask_clear_cpu(smp_processor_id(), &flush_cpumask);
> diff --git a/xen/arch/x86/x86_64/asm-offsets.c 
> b/xen/arch/x86/x86_64/asm-offsets.c
> index a2fea94f4c..9e2aefb00f 100644
> --- a/xen/arch/x86/x86_64/asm-offsets.c
> +++ b/xen/arch/x86/x86_64/asm-offsets.c
> @@ -143,6 +143,7 @@ void __dummy__(void)
>      OFFSET(CPUINFO_shadow_spec_ctrl, struct cpu_info, shadow_spec_ctrl);
>      OFFSET(CPUINFO_use_shadow_spec_ctrl, struct cpu_info, 
> use_shadow_spec_ctrl);
>      OFFSET(CPUINFO_bti_ist_info, struct cpu_info, bti_ist_info);
> +    OFFSET(CPUINFO_root_pgt_changed, struct cpu_info, root_pgt_changed);
>      DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info));
>      BLANK();
>  
> diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
> index 45d9842d09..30c9da5446 100644
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -128,10 +128,13 @@ restore_all_guest:
>          /* Copy guest mappings and switch to per-CPU root page table. */
>          mov   VCPU_cr3(%rbx), %r9
>          GET_STACK_END(dx)
> -        mov   STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
> +        mov   STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rax

I think this assembly block is correct (based on the register
requirements at .Lrag_copy_done).  However (as a check of whether I've
understood it correctly), I think it would be more obviously correct as:

mov   STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi   (no delta)
mov   %rdi, %rax  (move from lower down)
cmpb  $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx)

and (IIRC) it provides slightly better static instruction scheduling for
in-order processors.

~Andrew

> +        cmpb  $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx)
> +        je    .Lrag_copy_done
> +        movb  $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx)
>          movabs $PADDR_MASK & PAGE_MASK, %rsi
>          movabs $DIRECTMAP_VIRT_START, %rcx
> -        mov   %rdi, %rax
> +        mov   %rax, %rdi
>          and   %rsi, %rdi
>          jz    .Lrag_keep_cr3
>          and   %r9, %rsi
> @@ -148,6 +151,7 @@ restore_all_guest:
>          sub   $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \
>                  ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi
>          rep movsq
> +.Lrag_copy_done:
>          mov   STACK_CPUINFO_FIELD(cr4)(%rdx), %rdi
>          mov   %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx)
>          mov   %rdi, %rsi
> diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
> index 3a0e1eef36..f2491b4423 100644
> --- a/xen/include/asm-x86/current.h
> +++ b/xen/include/asm-x86/current.h
> @@ -59,6 +59,14 @@ struct cpu_info {
>      bool         use_shadow_spec_ctrl;
>      uint8_t      bti_ist_info;
>  
> +    /*
> +     * The following field controls copying of the L4 page table of 64-bit
> +     * PV guests to the per-cpu root page table on entering the guest 
> context.
> +     * If set the L4 page table is being copied to the root page table and
> +     * the field will be reset.
> +     */
> +    bool         root_pgt_changed;
> +
>      unsigned long __pad;
>      /* get_stack_bottom() must be 16-byte aligned */
>  };
> diff --git a/xen/include/asm-x86/flushtlb.h b/xen/include/asm-x86/flushtlb.h
> index 2cade9cbfb..052f0fa403 100644
> --- a/xen/include/asm-x86/flushtlb.h
> +++ b/xen/include/asm-x86/flushtlb.h
> @@ -103,6 +103,8 @@ void write_cr3(unsigned long cr3);
>  #define FLUSH_VA_VALID   0x800
>   /* Flush CPU state */
>  #define FLUSH_VCPU_STATE 0x1000
> + /* Flush the per-cpu root page table */
> +#define FLUSH_ROOT_PGTBL 0x2000
>  
>  /* Flush local TLBs/caches. */
>  unsigned int flush_area_local(const void *va, unsigned int flags);


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.