
[Xen-devel] [PATCH 9/9] x86/vmx: Don't leak EFER.NXE into guest context



Intel hardware only uses 4 bits in MSR_EFER.  Changes to LME and LMA are
handled automatically via the VMENTRY_CTLS.IA32E_MODE bit.

SCE is handled by ad-hoc logic in context_switch(), vmx_restore_guest_msrs()
and vmx_update_guest_efer(), and works by altering the host SCE value to match
the setting the guest wants.  This works because, in HVM vcpu context, Xen
never needs to execute a SYSCALL or SYSRET instruction.

However, NXE has never been context switched.  Unlike SCE, NXE cannot be
context switched at vcpu boundaries because disabling NXE makes PTE.NX bits
reserved, causing a pagefault when encountered.  This means that the guest
always has Xen's setting in effect, irrespective of the bit it can see and
modify in its virtualised view of MSR_EFER.

This isn't a major problem for production operating systems because they, like
Xen, always turn NXE on when it is available.  However, it does have an
observable effect on which guest PTE bits are valid, and whether
PFEC_insn_fetch is visible in a #PF error code.
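As a rough illustration of that observable difference (a sketch for this email
only, not part of the patch; the helpers are hypothetical and SMEP is ignored
for brevity), the effective NXE setting determines whether a page walker treats
PTE bit 63 as reserved, and whether a #PF error code can carry PFEC_insn_fetch:

#include <stdbool.h>
#include <stdint.h>

#define PTE_NX          (1ULL << 63)
#define PFEC_insn_fetch (1U << 4)

/* With NXE clear, bit 63 of a PTE is reserved; encountering it faults. */
static bool pte_nx_is_reserved(uint64_t pte, bool efer_nxe)
{
    return !efer_nxe && (pte & PTE_NX);
}

/* PFEC.I is only reported for instruction fetches when NX is in use
 * (SMEP, which also enables it, is deliberately left out of this sketch). */
static uint32_t pf_error_code(bool insn_fetch, bool efer_nxe)
{
    return (insn_fetch && efer_nxe) ? PFEC_insn_fetch : 0;
}

Before this patch, an HVM guest on Intel hardware would see both behaviours
follow Xen's NXE setting rather than its own.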

Second generation VT-x hardware has host and guest EFER fields in the VMCS,
and support for loading and saving them automatically.  First generation VT-x
hardware needs to use MSR load/save lists to cause an atomic switch of
MSR_EFER on vmentry/exit.
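For reference, a minimal sketch (not Xen code; the rdmsr64() helper is made up
for the example, and the TRUE_ENTRY_CTLS variant is ignored) of how the
VMCS-based EFER support can be probed from the VMX capability MSRs.  The bit
value matches Xen's VM_ENTRY_LOAD_GUEST_EFER definition:

#include <stdbool.h>
#include <stdint.h>

#define MSR_IA32_VMX_ENTRY_CTLS   0x484
#define VM_ENTRY_LOAD_GUEST_EFER  (1u << 15)  /* "load IA32_EFER" entry control */

static uint64_t rdmsr64(uint32_t msr)
{
    uint32_t lo, hi;
    asm volatile ("rdmsr" : "=a" (lo), "=d" (hi) : "c" (msr));
    return ((uint64_t)hi << 32) | lo;
}

/* The high dword of the capability MSR is the allowed-1 mask for the
 * VM-entry controls; the control is usable iff it may be set to 1. */
static bool cpu_supports_vmcs_efer(void)
{
    uint32_t allowed1 = rdmsr64(MSR_IA32_VMX_ENTRY_CTLS) >> 32;

    return allowed1 & VM_ENTRY_LOAD_GUEST_EFER;
}

In the patch itself this probing goes through the existing
adjust_vmx_controls() path in vmx_init_vmcs_config(), as shown in the diff
below.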

Therefore we update vmx_init_vmcs_config() to find and use guest/host EFER
support when available (and MSR load/save lists on older hardware) and drop
all ad-hoc alteration of SCE.

There are two complications for shadow guests.  NXE, being a paging setting,
needs to remain under host control, but that is fine as it is also Xen which
handles the pagefaults.  Also, it turns out that without EPT enabled, hardware
won't tolerate LME and LMA being different via either the GUEST_EFER VMCS
setting, or via the guest load list.  This doesn't matter in practice as we
intercept all writes to CR0 and reads from MSR_EFER, so we can provide
architecturally consistent behaviour from the guest's point of view.

As a result of fixing EFER context switching, we can remove the Intel-special
case from hvm_nx_enabled() and let guest_walk_tables() work with the real
guest paging settings.

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
CC: Jan Beulich <JBeulich@xxxxxxxx>
CC: Jun Nakajima <jun.nakajima@xxxxxxxxx>
CC: Kevin Tian <kevin.tian@xxxxxxxxx>
CC: Tim Deegan <tim@xxxxxxx>
CC: Wei Liu <wei.liu2@xxxxxxxxxx>
CC: Roger Pau Monné <roger.pau@xxxxxxxxxx>

Slightly RFC.  Some corrections are going to be made to the Intel SDM, and
I'm just waiting for final confirmation.
---
 xen/arch/x86/domain.c              | 10 -----
 xen/arch/x86/hvm/vmx/vmcs.c        |  9 ++--
 xen/arch/x86/hvm/vmx/vmx.c         | 88 ++++++++++++++++++++++++++++++--------
 xen/include/asm-x86/hvm/hvm.h      |  4 +-
 xen/include/asm-x86/hvm/vmx/vmcs.h |  2 +
 5 files changed, 78 insertions(+), 35 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 4ff3d2f3..600d7f7 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1723,16 +1723,6 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
     {
         __context_switch();
 
-        if ( is_pv_domain(nextd) &&
-             (is_idle_domain(prevd) ||
-              is_hvm_domain(prevd) ||
-              is_pv_32bit_domain(prevd) != is_pv_32bit_domain(nextd)) )
-        {
-            uint64_t efer = read_efer();
-            if ( !(efer & EFER_SCE) )
-                write_efer(efer | EFER_SCE);
-        }
-
         /* Re-enable interrupts before restoring state which may fault. */
         local_irq_enable();
 
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index e1a8f95..383098d 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -342,8 +342,8 @@ static int vmx_init_vmcs_config(void)
     }
 
     min = VM_EXIT_ACK_INTR_ON_EXIT;
-    opt = VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT |
-          VM_EXIT_CLEAR_BNDCFGS;
+    opt = (VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT |
+           VM_EXIT_LOAD_HOST_EFER | VM_EXIT_CLEAR_BNDCFGS);
     min |= VM_EXIT_IA32E_MODE;
     _vmx_vmexit_control = adjust_vmx_controls(
         "VMExit Control", min, opt, MSR_IA32_VMX_EXIT_CTLS, &mismatch);
@@ -383,7 +383,8 @@ static int vmx_init_vmcs_config(void)
         _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS;
 
     min = 0;
-    opt = VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_BNDCFGS;
+    opt = (VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_GUEST_EFER |
+           VM_ENTRY_LOAD_BNDCFGS);
     _vmx_vmentry_control = adjust_vmx_controls(
         "VMEntry Control", min, opt, MSR_IA32_VMX_ENTRY_CTLS, &mismatch);
 
@@ -1148,6 +1149,8 @@ static int construct_vmcs(struct vcpu *v)
     v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
     __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
     __vmwrite(HOST_CR4, mmu_cr4_features);
+    if ( cpu_has_vmx_efer )
+        __vmwrite(HOST_EFER, read_efer());
 
     /* Host CS:RIP. */
     __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 26e4206..a9fbce9 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -513,15 +513,6 @@ static void vmx_restore_guest_msrs(struct vcpu *v)
     wrmsrl(MSR_LSTAR,          v->arch.hvm_vmx.lstar);
     wrmsrl(MSR_SYSCALL_MASK,   v->arch.hvm_vmx.sfmask);
 
-    if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_SCE )
-    {
-        HVM_DBG_LOG(DBG_LEVEL_2,
-                    "restore guest's EFER with value %lx",
-                    v->arch.hvm_vcpu.guest_efer);
-        write_efer((read_efer() & ~EFER_SCE) |
-                   (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
-    }
-
     if ( cpu_has_rdtscp )
         wrmsr_tsc_aux(hvm_msr_tsc_aux(v));
 }
@@ -1650,22 +1641,81 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr,
 
 static void vmx_update_guest_efer(struct vcpu *v)
 {
-    unsigned long vm_entry_value;
+    unsigned long entry_ctls, guest_efer = v->arch.hvm_vcpu.guest_efer,
+        xen_efer = read_efer();
+
+    if ( paging_mode_shadow(v->domain) )
+    {
+        /*
+         * When using shadow pagetables, EFER.NX is a Xen-owned bit and is not
+         * under guest control.
+         */
+        guest_efer &= ~EFER_NX;
+        guest_efer |= xen_efer & EFER_NX;
+
+        /*
+         * At the time of writing (May 2018), the Intel SDM "VM Entry: Checks
+         * on Guest Control Registers, Debug Registers and MSRs" section says:
+         *
+         *  If the "Load IA32_EFER" VM-entry control is 1, the following
+         *  checks are performed on the field for the IA32_MSR:
+         *   - Bits reserved in the IA32_EFER MSR must be 0.
+         *   - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
+         *     the "IA-32e mode guest" VM-entry control.  It must also be
+         *     identical to bit 8 (LME) if bit 31 in the CR0 field
+         *     (corresponding to CR0.PG) is 1.
+         *
+         * Experimentally what actually happens is:
+         *   - Checks for EFER.{LME,LMA} apply uniformly whether using the
+         *     GUEST_EFER VMCS controls, or MSR load/save lists.
+         *   - Without EPT, LME being different to LMA isn't tolerated by
+         *     hardware.  As writes to CR0 are intercepted, it is safe to
+         *     leave LME clear at this point, and fix up both LME and LMA when
+         *     CR0.PG is set.
+         */
+        if ( !(guest_efer & EFER_LMA) )
+            guest_efer &= ~EFER_LME;
+    }
 
     vmx_vmcs_enter(v);
 
-    __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
-    if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
-        vm_entry_value |= VM_ENTRY_IA32E_MODE;
+    /*
+     * The intended guest running mode is derived from VM_ENTRY_IA32E_MODE,
+     * which (architecturally) is the guest's LMA setting.
+     */
+    __vmread(VM_ENTRY_CONTROLS, &entry_ctls);
+
+    entry_ctls &= ~VM_ENTRY_IA32E_MODE;
+    if ( guest_efer & EFER_LMA )
+        entry_ctls |= VM_ENTRY_IA32E_MODE;
+
+    __vmwrite(VM_ENTRY_CONTROLS, entry_ctls);
+
+    /* We expect to use EFER loading in the common case, but... */
+    if ( likely(cpu_has_vmx_efer) )
+        __vmwrite(GUEST_EFER, guest_efer);
+
+    /* ... on Gen1 VT-x hardware, we have to use MSR load/save lists instead. */
     else
-        vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
-    __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
+    {
+        /*
+         * When the guest's choice of EFER matches Xen's, remove the load/save
+         * list entries.  It is unnecessary overhead, especially as this is
+         * expected to be the common case for 64bit guests.
+         */
+        if ( guest_efer == xen_efer )
+        {
+            vmx_del_msr(v, MSR_EFER, VMX_MSR_HOST);
+            vmx_del_msr(v, MSR_EFER, VMX_MSR_GUEST_LOADONLY);
+        }
+        else
+        {
+            vmx_add_msr(v, MSR_EFER, xen_efer, VMX_MSR_HOST);
+            vmx_add_msr(v, MSR_EFER, guest_efer, VMX_MSR_GUEST_LOADONLY);
+        }
+    }
 
     vmx_vmcs_exit(v);
-
-    if ( v == current )
-        write_efer((read_efer() & ~EFER_SCE) |
-                   (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
 }
 
 void nvmx_enqueue_n2_exceptions(struct vcpu *v, 
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index ef5e198..fcfc5cf 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -296,10 +296,8 @@ int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t dest, uint8_t dest_mode);
     (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_SMEP))
 #define hvm_smap_enabled(v) \
     (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_SMAP))
-/* HVM guests on Intel hardware leak Xen's NX settings into guest context. */
 #define hvm_nx_enabled(v) \
-    ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && cpu_has_nx) ||    \
-     ((v)->arch.hvm_vcpu.guest_efer & EFER_NX))
+    ((v)->arch.hvm_vcpu.guest_efer & EFER_NX)
 #define hvm_pku_enabled(v) \
     (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PKE))
 
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index cfd174c..6c6897c 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -306,6 +306,8 @@ extern u64 vmx_ept_vpid_cap;
     (vmx_cpu_based_exec_control & CPU_BASED_MONITOR_TRAP_FLAG)
 #define cpu_has_vmx_pat \
     (vmx_vmentry_control & VM_ENTRY_LOAD_GUEST_PAT)
+#define cpu_has_vmx_efer \
+    (vmx_vmentry_control & VM_ENTRY_LOAD_GUEST_EFER)
 #define cpu_has_vmx_unrestricted_guest \
     (vmx_secondary_exec_control & SECONDARY_EXEC_UNRESTRICTED_GUEST)
 #define vmx_unrestricted_guest(v)               \
-- 
2.1.4

