
Re: [Xen-devel] [PATCH v3] x86: use VMLOAD for PV context switch


  • To: Jan Beulich <JBeulich@xxxxxxxx>, xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxxx>
  • From: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
  • Date: Tue, 25 Sep 2018 17:14:54 +0100
  • Cc: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>, Brian Woods <brian.woods@xxxxxxx>, Suravee Suthikulpanit <suravee.suthikulpanit@xxxxxxx>
  • Delivery-date: Tue, 25 Sep 2018 16:55:02 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On 18/09/18 13:28, Jan Beulich wrote:
> Besides the mentioned oddity with measured performance, I've also
> noticed a significant difference (of at least 150 clocks) between
> measuring immediately around the calls to svm_load_segs() and measuring
> immediately inside the function.

This is a little concerning.  It either means that there is a bug in
your timing, or (along the same lines as why I think the prefetch makes
a difference) the mapping of svm_load_segs() is reliably TLB-cold in
the context switch path.
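
For reference, a minimal sketch (not from the patch; plain unserialised
rdtsc() assumed) of the "around the calls" measurement being compared:

    uint64_t t0, t1;

    t0 = rdtsc();
    fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
                               uregs->fs, n->arch.pv.fs_base,
                               uregs->gs, n->arch.pv.gs_base_kernel,
                               n->arch.pv.gs_base_user);
    t1 = rdtsc();

versus taking the same two readings as the first and last statements
inside svm_load_segs() itself.  An iTLB miss on the function's mapping
would only show up in the outer pair.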

> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -52,6 +52,7 @@
>  #include <asm/hvm/hvm.h>
>  #include <asm/hvm/nestedhvm.h>
>  #include <asm/hvm/support.h>
> +#include <asm/hvm/svm/svm.h>
>  #include <asm/hvm/viridian.h>
>  #include <asm/debugreg.h>
>  #include <asm/msr.h>
> @@ -1281,11 +1282,35 @@ static void load_segments(struct vcpu *n
>      struct cpu_user_regs *uregs = &n->arch.user_regs;
>      int all_segs_okay = 1;
>      unsigned int dirty_segment_mask, cpu = smp_processor_id();
> +    bool fs_gs_done = false;
>  
>      /* Load and clear the dirty segment mask. */
>      dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
>      per_cpu(dirty_segment_mask, cpu) = 0;
>  
> +#ifdef CONFIG_HVM
> +    if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
> +         !((uregs->fs | uregs->gs) & ~3) &&
> +         /*
> +          * The remaining part is just for optimization: If only shadow GS
> +          * needs loading, there's nothing to be gained here.

VMLOAD also loads the LDT, and LLDT is fully serialising, so it's an
even heavier perf hit than a WRMSR.
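
(The cheap path in that case is presumably just the single shadow GS
write further down, roughly

    wrmsrl(MSR_SHADOW_GS_BASE, n->arch.pv.gs_base_user);

modulo which of the two GS bases is the shadow one at this point, and a
VMLOAD can't beat a lone WRMSR.)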

> +          */
> +         (n->arch.pv.fs_base | n->arch.pv.gs_base_user) )
> +    {
> +        fs_gs_done = n->arch.flags & TF_kernel_mode
> +            ? svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
> +                            uregs->fs, n->arch.pv.fs_base,
> +                            uregs->gs, n->arch.pv.gs_base_kernel,
> +                            n->arch.pv.gs_base_user)
> +            : svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
> +                            uregs->fs, n->arch.pv.fs_base,
> +                            uregs->gs, n->arch.pv.gs_base_user,
> +                            n->arch.pv.gs_base_kernel);

This looks like a confusing way of writing:

    {
        unsigned long gsb = n->arch.flags & TF_kernel_mode
            ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user;
        unsigned long gss = n->arch.flags & TF_kernel_mode
            ? n->arch.pv.gs_base_user : n->arch.pv.gs_base_kernel;

        fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
                                   uregs->fs, n->arch.pv.fs_base,
                                   uregs->gs, gsb, gss);
    }
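
(i.e. gsb is whichever GS base is active for the current mode, and gss
the one destined for the shadow MSR.)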


AFAICT?

> +    }
> +#endif
> +    if ( !fs_gs_done )
> +        load_LDT(n);
> +
>      /* Either selector != 0 ==> reload. */
>      if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) )
>      {
> @@ -1301,7 +1326,7 @@ static void load_segments(struct vcpu *n
>      }
>  
>      /* Either selector != 0 ==> reload. */
> -    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) )
> +    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) && !fs_gs_done )
>      {
>          all_segs_okay &= loadsegment(fs, uregs->fs);
>          /* non-nul selector updates fs_base */
> @@ -1310,7 +1335,7 @@ static void load_segments(struct vcpu *n
>      }
>  
>      /* Either selector != 0 ==> reload. */
> -    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) )
> +    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) && !fs_gs_done  )

One too many spaces.

>      {
>          all_segs_okay &= loadsegment(gs, uregs->gs);
>          /* non-nul selector updates gs_base_user */
> @@ -1318,7 +1343,7 @@ static void load_segments(struct vcpu *n
>              dirty_segment_mask &= ~DIRTY_GS_BASE;
>      }
>  
> -    if ( !is_pv_32bit_vcpu(n) )
> +    if ( !fs_gs_done && !is_pv_32bit_vcpu(n) )
>      {
>          /* This can only be non-zero if selector is NULL. */
>          if ( n->arch.pv.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
> @@ -1653,6 +1678,12 @@ static void __context_switch(void)
>  
>      write_ptbase(n);
>  
> +#if defined(CONFIG_PV) && defined(CONFIG_HVM)

From a comments-in-the-code point of view, this is the most useful
location for something along the lines of:

/* Prefetch the VMCB if we expect to use it later in the context switch */

> +    if ( is_pv_domain(nd) && !is_pv_32bit_domain(nd) && !is_idle_domain(nd) &&
> +         !cpu_has_fsgsbase && cpu_has_svm )
> +        svm_load_segs(0, 0, 0, 0, 0, 0, 0);
> +#endif
> +
>      if ( need_full_gdt(nd) &&
>           ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
>      {
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -1630,6 +1646,66 @@ static void svm_init_erratum_383(const s
>      }
>  }
>  
> +#ifdef CONFIG_PV
> +bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
> +                   unsigned int fs_sel, unsigned long fs_base,
> +                   unsigned int gs_sel, unsigned long gs_base,
> +                   unsigned long gs_shadow)
> +{
> +    unsigned int cpu = smp_processor_id();
> +    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
> +
> +    if ( unlikely(!vmcb) )
> +        return false;

When can this error path ever be taken?

> +
> +    if ( !ldt_base )
> +    {
> +        /*
> +         * The actual structure field used here was arbitrarily chosen.
> +         * Empirically it doesn't seem to matter much which element is used,
> +         * and a clear explanation of the otherwise poor performance has not
> +         * been found/provided so far.
> +         */
> +        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );

prefetchw(), which already exists and is used.
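
i.e., presumably just:

    prefetchw(&vmcb->ldtr);

which also matches the stores into the VMCB which follow on the
non-prefetch path.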

~Andrew

> +        return true;
> +    }
> +
> +    if ( likely(!ldt_ents) )
> +        memset(&vmcb->ldtr, 0, sizeof(vmcb->ldtr));
> +    else
> +    {
> +        /* Keep GDT in sync. */
> +        struct desc_struct *desc = this_cpu(gdt_table) + LDT_ENTRY -
> +                                   FIRST_RESERVED_GDT_ENTRY;
> +
> +        _set_tssldt_desc(desc, ldt_base, ldt_ents * 8 - 1, SYS_DESC_ldt);
> +
> +        vmcb->ldtr.sel = LDT_ENTRY << 3;
> +        vmcb->ldtr.attr = SYS_DESC_ldt | (_SEGMENT_P >> 8);
> +        vmcb->ldtr.limit = ldt_ents * 8 - 1;
> +        vmcb->ldtr.base = ldt_base;
> +    }
> +
> +    ASSERT(!(fs_sel & ~3));
> +    vmcb->fs.sel = fs_sel;
> +    vmcb->fs.attr = 0;
> +    vmcb->fs.limit = 0;
> +    vmcb->fs.base = fs_base;
> +
> +    ASSERT(!(gs_sel & ~3));
> +    vmcb->gs.sel = gs_sel;
> +    vmcb->gs.attr = 0;
> +    vmcb->gs.limit = 0;
> +    vmcb->gs.base = gs_base;
> +
> +    vmcb->kerngsbase = gs_shadow;
> +
> +    svm_vmload_pa(per_cpu(host_vmcb, cpu));
> +
> +    return true;
> +}
> +#endif
> +
>  static int _svm_cpu_up(bool bsp)
>  {
>      uint64_t msr_content;

