
[Xen-devel] [PATCH v5 7/7] xen/x86: use PCID feature



Avoid flushing the complete TLB when switching %cr3 for the Meltdown
mitigation by using the PCID feature if it is available.
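
With PCID in use the %cr3 values built by the patch additionally carry
the no-flush bit (bit 63), so reloading %cr3 does not throw away the TLB
entries of the new PCID. As a rough orientation, a minimal sketch of the
resulting %cr3 layout, using the constants introduced by this patch (the
helper name example_cr3() is made up):

    /*
     * Sketch only, not part of the patch: %cr3 as assembled by make_cr3()
     * with PCID in use.
     *
     *   bit 63       X86_CR3_NOFLUSH    - keep TLB entries of the new PCID
     *   bits 51..12  page table address - covered by X86_CR3_ADDR_MASK
     *   bits 11..0   PCID               - covered by X86_CR3_PCID_MASK
     */
    static unsigned long example_cr3(unsigned long mfn, unsigned long pcid)
    {
        return X86_CR3_NOFLUSH | (mfn << PAGE_SHIFT) | pcid;
    }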

We use 4 PCID values for a 64-bit pv domain subject to XPTI and 2 values
for the non-XPTI case (a short sketch of the resulting values follows the
list):

- guest active and in kernel mode
- guest active and in user mode
- hypervisor active and guest in user mode (XPTI only)
- hypervisor active and guest in kernel mode (XPTI only)
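
The mapping to concrete PCID values mirrors get_pcid_bits() and the
PCID_PV_* constants added to asm-x86/pv/domain.h below; the helper here
is only an illustration (pcid_example() is a made-up name):

    /* Sketch: composing the four PCIDs from the patch's constants. */
    static unsigned long pcid_example(bool kernel_mode, bool xpti_pagetables)
    {
        unsigned long pcid = kernel_mode ? PCID_PV_PRIV : PCID_PV_USER;

        if ( xpti_pagetables )       /* %cr3 points at the per-cpu root_pgt */
            pcid |= PCID_PV_XPTI;    /* 2 or 3 instead of 0 or 1 */

        return pcid;                 /* ends up in the low 12 bits of %cr3 */
    }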

We use PCID only if both PCID _and_ INVPCID are supported. With PCID in
use we disable global pages in cr4. A command line parameter controls in
which cases PCID is used.

As the non-XPTI case has been shown not to perform better with PCID, at
least on some machines, the default is to use PCID only for domains
subject to XPTI.
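
For illustration, with the command line syntax documented below the
behaviour can be chosen at boot (and, the parameter being a runtime one,
changed later on); these lines are examples only:

    pcid=xpti       # default: PCID only for domains subject to XPTI
    pcid=noxpti     # PCID only for domains not subject to XPTI
    pcid=true       # PCID for all 64-bit pv domains
    pcid=false      # never use PCID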

With PCID enabled we always disable global pages. This avoids having to
either flush the complete TLB or cycle through all PCID values when
invalidating a single global page.
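
The price shows up in the flush_area_local() hunk below: with PCID
enabled a single-page flush has to be issued for every PCID the address
may be cached under. A condensed sketch of that logic (the helper name
flush_one_vaddr() is made up, the calls are the ones used by the patch):

    static void flush_one_vaddr(unsigned long addr)
    {
        if ( read_cr4() & X86_CR4_PCIDE )
        {
            /* No global pages, but entries may exist under any PCID. */
            invpcid_flush_one(PCID_PV_PRIV, addr);
            invpcid_flush_one(PCID_PV_USER, addr);
            if ( !cpu_has_no_xpti )
            {
                invpcid_flush_one(PCID_PV_PRIV | PCID_PV_XPTI, addr);
                invpcid_flush_one(PCID_PV_USER | PCID_PV_XPTI, addr);
            }
        }
        else
            asm volatile ( "invlpg %0"
                           : : "m" (*(const char *)addr) : "memory" );
    }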

pv_guest_cr4_to_real_cr4() is switched from a macro to a real function,
as it has become more complex.

Performance for the XPTI case improves a lot: a parallel make of the Xen
hypervisor on my machine reduced elapsed time from 107 to 93 seconds and
system time from 140 to 103 seconds.

Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
---
V5:
- use X86_CR3_ADDR_MASK instead of ~X86_CR3_PCID_MASK (Jan Beulich)
- add some const qualifiers (Jan Beulich)
- mask X86_CR3_ADDR_MASK with PADDR_MASK (Jan Beulich)
- add flushing the TLB from old PCID related entries in write_cr3_cr4()
  (Jan Beulich)

V4:
- add cr3 mask for page table address and use that in dbg_pv_va2mfn()
  (Jan Beulich)
- use invpcid_flush_all_nonglobals() instead of invpcid_flush_all()
  (Jan Beulich)
- use PCIDs 0/1 when running in Xen or without XPTI, 2/3 with XPTI in
  guest (Jan Beulich)
- ASSERT cr4.pge and cr4.pcide are never active at the same time
  (Jan Beulich)
- make pv_guest_cr4_to_real_cr4() a real function

V3:
- support PCID for non-XPTI case, too
- add command line parameter for controlling usage of PCID
- check PCID active by using cr4.pcide (Jan Beulich)
---
 docs/misc/xen-command-line.markdown | 12 ++++++
 xen/arch/x86/debug.c                |  2 +-
 xen/arch/x86/domain_page.c          |  2 +-
 xen/arch/x86/domctl.c               |  4 ++
 xen/arch/x86/flushtlb.c             | 54 ++++++++++++++++++++++++--
 xen/arch/x86/mm.c                   | 24 +++++++++++-
 xen/arch/x86/pv/dom0_build.c        |  1 +
 xen/arch/x86/pv/domain.c            | 77 ++++++++++++++++++++++++++++++++++++-
 xen/include/asm-x86/domain.h        | 15 +++-----
 xen/include/asm-x86/processor.h     |  3 ++
 xen/include/asm-x86/pv/domain.h     | 20 ++++++++++
 xen/include/asm-x86/x86-defns.h     |  4 +-
 12 files changed, 199 insertions(+), 19 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 5f6ae654ad..db87fd326d 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1452,6 +1452,18 @@ All numbers specified must be hexadecimal ones.
 
 This option can be specified more than once (up to 8 times at present).
 
+### pcid (x86)
+> `= <boolean> | xpti | noxpti`
+
+> Default: `xpti`
+
+> Can be modified at runtime
+
+Control usage of the processor's PCID feature, if available, for
+64-bit pv-domains. PCID can be used either for no domain at all (`false`),
+for all of them (`true`), only for those subject to XPTI (`xpti`), or for
+those not subject to XPTI (`noxpti`).
+
 ### ple\_gap
 > `= <integer>`
 
diff --git a/xen/arch/x86/debug.c b/xen/arch/x86/debug.c
index 9159f32db4..0d46f2f45a 100644
--- a/xen/arch/x86/debug.c
+++ b/xen/arch/x86/debug.c
@@ -97,7 +97,7 @@ dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
     l3_pgentry_t l3e, *l3t;
     l2_pgentry_t l2e, *l2t;
     l1_pgentry_t l1e, *l1t;
-    unsigned long cr3 = (pgd3val ? pgd3val : dp->vcpu[0]->arch.cr3);
+    unsigned long cr3 = pgd3val ?: (dp->vcpu[0]->arch.cr3 & X86_CR3_ADDR_MASK);
     mfn_t mfn = maddr_to_mfn(cr3);
 
     DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id, 
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index b5780f201f..b5af0e639a 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -51,7 +51,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
         if ( (v = idle_vcpu[smp_processor_id()]) == current )
             sync_local_execstate();
         /* We must now be running on the idle page table. */
-        ASSERT(read_cr3() == __pa(idle_pg_table));
+        ASSERT((read_cr3() & X86_CR3_ADDR_MASK) == __pa(idle_pg_table));
     }
 
     return v;
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 0704f398c7..a7c8772fa6 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -613,7 +613,11 @@ long arch_do_domctl(
             ret = -EINVAL;
 
         if ( ret == 0 )
+        {
             xpti_domain_init(d);
+            pcid_domain_init(d);
+        }
+
         break;
 
     case XEN_DOMCTL_get_address_size:
diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index 5dcd9a2bf6..6c7d57b7aa 100644
--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -12,6 +12,7 @@
 #include <asm/flushtlb.h>
 #include <asm/invpcid.h>
 #include <asm/page.h>
+#include <asm/pv/domain.h>
 
 /* Debug builds: Wrap frequently to stress-test the wrap logic. */
 #ifdef NDEBUG
@@ -93,6 +94,7 @@ void write_cr3_cr4(unsigned long cr3, unsigned long cr4)
 {
     unsigned long flags;
     u32 t;
+    unsigned long old_pcid = read_cr3() & X86_CR3_PCID_MASK;
 
     /* This non-reentrant function is sometimes called in interrupt context. */
     local_irq_save(flags);
@@ -100,12 +102,35 @@ void write_cr3_cr4(unsigned long cr3, unsigned long cr4)
     t = pre_flush();
 
     if ( read_cr4() & X86_CR4_PGE )
+        /*
+         * X86_CR4_PGE set means PCID is inactive.
+         * We have to purge the TLB by flipping cr4.pge.
+         */
         write_cr4(cr4 & ~X86_CR4_PGE);
+    else if ( use_invpcid )
+        /*
+         * If we are using PCID, purge the TLB via INVPCID, as loading cr3
+         * will affect the new PCID only.
+         * If INVPCID is not supported we don't use PCIDs, so loading cr3
+         * will purge the TLB (we are in the "global pages off" branch).
+         * invpcid_flush_all_nonglobals() seems to be faster than
+         * invpcid_flush_all().
+         */
+        invpcid_flush_all_nonglobals();
 
     asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
 
     if ( read_cr4() != cr4 )
         write_cr4(cr4);
+    else if ( old_pcid != (cr3 & X86_CR3_PCID_MASK) )
+        /*
+         * Make sure no TLB entries related to the old PCID, created between
+         * flushing the TLB and writing the new %cr3 value, remain in the TLB.
+         * Writing %cr3 is documented to be a speculation barrier, OTOH the
+         * performance impact of the additional flush is next to invisible.
+         * So better be safe than sorry.
+         */
+        invpcid_flush_single_context(old_pcid);
 
     post_flush(t);
 
@@ -132,11 +157,32 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
             /*
              * We don't INVLPG multi-page regions because the 2M/4M/1G
              * region may not have been mapped with a superpage. Also there
-             * are various errata surrounding INVLPG usage on superpages, and
-             * a full flush is in any case not *that* expensive.
+             * are various errata surrounding INVLPG usage on superpages,
+             * and a full flush is in any case not *that* expensive.
              */
-            asm volatile ( "invlpg %0"
-                           : : "m" (*(const char *)(va)) : "memory" );
+            if ( read_cr4() & X86_CR4_PCIDE )
+            {
+                unsigned long addr = (unsigned long)va;
+
+                /*
+                 * Flush the addresses for all potential address spaces.
+                 * We can't check the current domain for being subject to
+                 * XPTI as current might be the idle vcpu while we still have
+                 * some XPTI domain TLB entries.
+                 * Using invpcid is okay here, as with PCID enabled we always
+                 * have global pages disabled.
+                 */
+                invpcid_flush_one(PCID_PV_PRIV, addr);
+                invpcid_flush_one(PCID_PV_USER, addr);
+                if ( !cpu_has_no_xpti )
+                {
+                    invpcid_flush_one(PCID_PV_PRIV | PCID_PV_XPTI, addr);
+                    invpcid_flush_one(PCID_PV_USER | PCID_PV_XPTI, addr);
+                }
+            }
+            else
+                asm volatile ( "invlpg %0"
+                               : : "m" (*(const char *)(va)) : "memory" );
         }
         else
             do_tlb_flush();
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 03aa44be76..d31ada0dc9 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -127,6 +127,7 @@
 #include <asm/processor.h>
 
 #include <asm/hvm/grant_table.h>
+#include <asm/pv/domain.h>
 #include <asm/pv/grant_table.h>
 #include <asm/pv/mm.h>
 
@@ -500,7 +501,26 @@ void free_shared_domheap_page(struct page_info *page)
 
 void make_cr3(struct vcpu *v, mfn_t mfn)
 {
+    struct domain *d = v->domain;
+
     v->arch.cr3 = mfn_x(mfn) << PAGE_SHIFT;
+    if ( is_pv_domain(d) && d->arch.pv_domain.pcid )
+        v->arch.cr3 |= get_pcid_bits(v, false);
+}
+
+unsigned long pv_guest_cr4_to_real_cr4(const struct vcpu *v)
+{
+    const struct domain *d = v->domain;
+    unsigned long cr4;
+
+    cr4 = v->arch.pv_vcpu.ctrlreg[4] & ~X86_CR4_DE;
+    cr4 |= mmu_cr4_features & (X86_CR4_PSE | X86_CR4_SMEP | X86_CR4_SMAP |
+                               X86_CR4_OSXSAVE | X86_CR4_FSGSBASE);
+    cr4 |= (d->arch.pv_domain.xpti || d->arch.pv_domain.pcid) ? 0 : X86_CR4_PGE;
+    cr4 |= d->arch.pv_domain.pcid ? X86_CR4_PCIDE : 0;
+    cr4 |= d->arch.vtsc ? X86_CR4_TSD : 0;
+
+    return cr4;
 }
 
 void write_ptbase(struct vcpu *v)
@@ -510,12 +530,14 @@ void write_ptbase(struct vcpu *v)
 
     new_cr4 = (is_pv_vcpu(v) && !is_idle_vcpu(v))
               ? pv_guest_cr4_to_real_cr4(v)
-              : ((read_cr4() & ~X86_CR4_TSD) | X86_CR4_PGE);
+              : ((read_cr4() & ~(X86_CR4_PCIDE | X86_CR4_TSD)) | X86_CR4_PGE);
 
     if ( is_pv_vcpu(v) && v->domain->arch.pv_domain.xpti )
     {
         cpu_info->root_pgt_changed = true;
         cpu_info->pv_cr3 = __pa(this_cpu(root_pgt));
+        if ( new_cr4 & X86_CR4_PCIDE )
+            cpu_info->pv_cr3 |= get_pcid_bits(v, true);
         write_cr3_cr4(v->arch.cr3, new_cr4);
     }
     else
diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
index 77186c19bd..2af0094e95 100644
--- a/xen/arch/x86/pv/dom0_build.c
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -709,6 +709,7 @@ int __init dom0_construct_pv(struct domain *d,
     }
 
     xpti_domain_init(d);
+    pcid_domain_init(d);
 
     d->arch.paging.mode = 0;
 
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index 2bef9c48bc..bfaab414e1 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -10,6 +10,7 @@
 #include <xen/sched.h>
 
 #include <asm/cpufeature.h>
+#include <asm/invpcid.h>
 #include <asm/msr-index.h>
 #include <asm/pv/domain.h>
 
@@ -94,6 +95,70 @@ void xpti_domain_init(struct domain *d)
     }
 }
 
+static __read_mostly enum {
+    PCID_OFF,
+    PCID_ALL,
+    PCID_XPTI,
+    PCID_NOXPTI
+} opt_pcid = PCID_XPTI;
+
+static __init int parse_pcid(const char *s)
+{
+    int rc = 0;
+
+    switch ( parse_bool(s, NULL) )
+    {
+    case 0:
+        opt_pcid = PCID_OFF;
+        break;
+    case 1:
+        opt_pcid = PCID_ALL;
+        break;
+    default:
+        switch ( parse_boolean("xpti", s, NULL) )
+        {
+        case 0:
+            opt_pcid = PCID_NOXPTI;
+            break;
+        case 1:
+            opt_pcid = PCID_XPTI;
+            break;
+        default:
+            rc = -EINVAL;
+            break;
+        }
+        break;
+    }
+
+    return rc;
+}
+custom_runtime_param("pcid", parse_pcid);
+
+void pcid_domain_init(struct domain *d)
+{
+    if ( !is_pv_domain(d) || is_pv_32bit_domain(d) ||
+         !use_invpcid || !cpu_has_pcid )
+        return;
+
+    switch ( opt_pcid )
+    {
+    case PCID_OFF:
+        break;
+    case PCID_ALL:
+        d->arch.pv_domain.pcid = true;
+        break;
+    case PCID_XPTI:
+        d->arch.pv_domain.pcid = d->arch.pv_domain.xpti;
+        break;
+    case PCID_NOXPTI:
+        d->arch.pv_domain.pcid = !d->arch.pv_domain.xpti;
+        break;
+    default:
+        ASSERT_UNREACHABLE();
+        break;
+    }
+}
+
 static void noreturn continue_nonidle_domain(struct vcpu *v)
 {
     check_wakeup_from_wait();
@@ -298,9 +363,19 @@ int pv_domain_initialise(struct domain *d)
 
 static void _toggle_guest_pt(struct vcpu *v)
 {
+    const struct domain *d = v->domain;
+
     v->arch.flags ^= TF_kernel_mode;
     update_cr3(v);
-    get_cpu_info()->root_pgt_changed = true;
+    if ( d->arch.pv_domain.xpti )
+    {
+        struct cpu_info *cpu_info = get_cpu_info();
+
+        cpu_info->root_pgt_changed = true;
+        cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
+                           (d->arch.pv_domain.pcid
+                            ? get_pcid_bits(v, true) : 0);
+    }
 
     /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
     asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index b7894dc8c8..8b66096e7f 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -255,6 +255,8 @@ struct pv_domain
 
     /* XPTI active? */
     bool xpti;
+    /* Use PCID feature? */
+    bool pcid;
 
     /* map_domain_page() mapping cache. */
     struct mapcache_domain mapcache;
@@ -615,19 +617,12 @@ void vcpu_show_registers(const struct vcpu *);
 unsigned long pv_guest_cr4_fixup(const struct vcpu *, unsigned long guest_cr4);
 
 /* Convert between guest-visible and real CR4 values. */
-#define pv_guest_cr4_to_real_cr4(v)                         \
-    (((v)->arch.pv_vcpu.ctrlreg[4]                          \
-      | (mmu_cr4_features                                   \
-         & (X86_CR4_PSE | X86_CR4_SMEP |                    \
-            X86_CR4_SMAP | X86_CR4_OSXSAVE |                \
-            X86_CR4_FSGSBASE))                              \
-      | ((v)->domain->arch.pv_domain.xpti ? 0 : X86_CR4_PGE) \
-      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0))         \
-     & ~X86_CR4_DE)
+unsigned long pv_guest_cr4_to_real_cr4(const struct vcpu *v);
+
 #define real_cr4_to_pv_guest_cr4(c)                         \
     ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD |      \
              X86_CR4_OSXSAVE | X86_CR4_SMEP |               \
-             X86_CR4_FSGSBASE | X86_CR4_SMAP))
+             X86_CR4_FSGSBASE | X86_CR4_SMAP | X86_CR4_PCIDE))
 
 #define domain_max_vcpus(d) (is_hvm_domain(d) ? HVM_MAX_VCPUS : MAX_VIRT_CPUS)
 
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index db9988ab33..3067a8c58f 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -290,6 +290,9 @@ static inline unsigned long read_cr4(void)
 
 static inline void write_cr4(unsigned long val)
 {
+    /* No global pages allowed when PCIDs are enabled! */
+    ASSERT(!(val & X86_CR4_PGE) || !(val & X86_CR4_PCIDE));
+
     get_cpu_info()->cr4 = val;
     asm volatile ( "mov %0,%%cr4" : : "r" (val) );
 }
diff --git a/xen/include/asm-x86/pv/domain.h b/xen/include/asm-x86/pv/domain.h
index 911e5dc07f..3c8c8f4ccc 100644
--- a/xen/include/asm-x86/pv/domain.h
+++ b/xen/include/asm-x86/pv/domain.h
@@ -21,6 +21,24 @@
 #ifndef __X86_PV_DOMAIN_H__
 #define __X86_PV_DOMAIN_H__
 
+/* PCID values for the address spaces of 64-bit pv domains: */
+#define PCID_PV_PRIV      0x0000    /* Used for other domains, too. */
+#define PCID_PV_USER      0x0001
+#define PCID_PV_XPTI      0x0002    /* To be ORed to above values. */
+
+/*
+ * Return additional PCID specific cr3 bits.
+ *
+ * Note that X86_CR3_NOFLUSH will not be readable in cr3. Anyone consuming
+ * v->arch.cr3 should mask away X86_CR3_NOFLUSH and X86_CR3_PCID_MASK in case
+ * the value is used to address the root page table.
+ */
+static inline unsigned long get_pcid_bits(const struct vcpu *v, bool is_xpti)
+{
+    return X86_CR3_NOFLUSH | (is_xpti ? PCID_PV_XPTI : 0) |
+           ((v->arch.flags & TF_kernel_mode) ? PCID_PV_PRIV : PCID_PV_USER);
+}
+
 #ifdef CONFIG_PV
 
 void pv_vcpu_destroy(struct vcpu *v);
@@ -29,6 +47,7 @@ void pv_domain_destroy(struct domain *d);
 int pv_domain_initialise(struct domain *d);
 void xpti_init(void);
 void xpti_domain_init(struct domain *d);
+void pcid_domain_init(struct domain *d);
 
 #else  /* !CONFIG_PV */
 
@@ -40,6 +59,7 @@ static inline void pv_domain_destroy(struct domain *d) {}
 static inline int pv_domain_initialise(struct domain *d) { return -EOPNOTSUPP; }
 static inline void xpti_init(void) {}
 static inline void xpti_domain_init(struct domain *d) {}
+static inline void pcid_domain_init(struct domain *d) {}
 
 #endif /* CONFIG_PV */
 
diff --git a/xen/include/asm-x86/x86-defns.h b/xen/include/asm-x86/x86-defns.h
index ff8d66be3c..123560897e 100644
--- a/xen/include/asm-x86/x86-defns.h
+++ b/xen/include/asm-x86/x86-defns.h
@@ -45,7 +45,9 @@
 /*
  * Intel CPU flags in CR3
  */
-#define X86_CR3_NOFLUSH (_AC(1, ULL) << 63)
+#define X86_CR3_NOFLUSH    (_AC(1, ULL) << 63)
+#define X86_CR3_ADDR_MASK  (PAGE_MASK & PADDR_MASK & ~X86_CR3_NOFLUSH)
+#define X86_CR3_PCID_MASK  _AC(0x0fff, ULL) /* Mask for PCID */
 
 /*
  * Intel CPU features in CR4
-- 
2.13.6

