
[Xen-devel] [PATCH RFC v3 2/6] HVM x86 deprivileged mode: Code for switching into/out of deprivileged mode



The process to switch into and out of deprivileged mode can be likened to
setjmp/longjmp.

Xen is non-preemptible, and taking an interrupt/exception, SYSCALL, SYSENTER,
NMI or any IST event would currently clobber the Xen privileged stack. We need
this stack to be preserved so that, after executing deprivileged mode, we can
return to our previous privileged execution point. This allows us to unwind
the stack, cleaning up memory allocations.
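
As a loose, self-contained analogy (plain user-space C using setjmp/longjmp,
not Xen code, and with made-up function names), the save/enter/return flow
looks roughly like this:

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf privileged_ctx;      /* ~ the saved rsp/rip in this patch */

    static void finish_user_mode(void)
    {
        longjmp(privileged_ctx, 1);     /* ~ syscall back into Xen, then ret */
    }

    static void deprivileged_work(void)
    {
        puts("running deprivileged work");
        finish_user_mode();             /* does not return here */
    }

    int main(void)
    {
        if ( setjmp(privileged_ctx) == 0 )  /* ~ saving context before sysret */
            deprivileged_work();            /* ~ sysret into ring 3 */

        puts("back at the saved privileged execution point");
        return 0;                           /* privileged stack unwinds as normal */
    }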

To enter deprivileged mode, we move the interrupt/exception rsp,
SYSENTER rsp and SYSCALL rsp to point lower down Xen's privileged stack,
which prevents them from clobbering it. The IST NMI and DF handlers used to
copy themselves onto the privileged stack; this is no longer the case, and
they now remain on their predefined stacks.

This means that we can continue execution from that point, with behaviour
similar to a context switch.
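
The repointing idea, sketched in simplified form (illustrative only; the
struct, field and helper names below are stand-ins, not the real TSS/stub
handling added later in this patch):

    /* Illustrative sketch, not Xen code. */
    struct fake_tss {
        unsigned long rsp0;              /* stack used for interrupts/exceptions */
    };

    static unsigned long saved_rsp0;

    /*
     * Point interrupt/exception (and, analogously, SYSCALL/SYSENTER) entries
     * at an aligned address just below the current privileged rsp, so frames
     * still needed on the return path cannot be clobbered.
     */
    static void sketch_setup_stacks(struct fake_tss *tss, unsigned long rsp)
    {
        saved_rsp0 = tss->rsp0;
        tss->rsp0  = rsp & ~0xfUL;       /* 16-byte aligned, below current rsp */
    }

    /* Undo the above when leaving deprivileged mode. */
    static void sketch_restore_stacks(struct fake_tss *tss)
    {
        tss->rsp0 = saved_rsp0;
    }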

To exit deprivileged mode, we restore the original interrupt/exception rsp,
SYSENTER rsp and SYSCALL rsp. We can then continue execution from where we left
off, which will unwind the stack and free up resources. This method means that
we do not need to change any other code paths, and its invocation is
transparent to callers. This should allow the feature to be deployed more
easily to different parts of Xen.

The switch to and from deprivileged mode is performed using sysret and syscall
respectively.

Signed-off-by: Ben Catterall <Ben.Catterall@xxxxxxxxxx>

Changed since v1
----------------
 * Added support for AMD SVM
 * Moved to the new stack approach
 * IST handlers no longer copy themselves
 * Updated context switching code to perform a full context switch.
     This means that depriv mode will execute with host register state, not
     (partial) guest register state. This allows for crashing the domain (later
     patch) whilst in depriv mode, alleviates potential security vulnerabilities
     and is necessary to work around the AMD TR issue.
 * Moved processor-specific code to processor-specific files.
 * Changed call/jmp pair in deprivileged_asm.S to call/ret pair to not confuse
   processor branch predictors.

Changed since v2:
-----------------
 * Coding style: Add space after if, for, etc.
---
 xen/arch/x86/domain.c               |  12 +++
 xen/arch/x86/hvm/Makefile           |   1 +
 xen/arch/x86/hvm/deprivileged.c     | 103 ++++++++++++++++++++++
 xen/arch/x86/hvm/deprivileged_asm.S | 167 ++++++++++++++++++++++++++++++++++++
 xen/arch/x86/hvm/svm/svm.c          | 130 +++++++++++++++++++++++++++-
 xen/arch/x86/hvm/vmx/vmx.c          | 118 +++++++++++++++++++++++++
 xen/arch/x86/mm/hap/hap.c           |   2 +-
 xen/arch/x86/x86_64/asm-offsets.c   |   5 ++
 xen/arch/x86/x86_64/entry.S         |  38 ++++++--
 xen/arch/x86/x86_64/traps.c         |  13 ++-
 xen/include/asm-x86/current.h       |   2 +
 xen/include/asm-x86/hvm/svm/svm.h   |  13 +++
 xen/include/asm-x86/hvm/vcpu.h      |  15 ++++
 xen/include/asm-x86/hvm/vmx/vmx.h   |   2 +
 xen/include/asm-x86/processor.h     |   2 +
 xen/include/asm-x86/system.h        |   3 +
 xen/include/xen/hvm/deprivileged.h  |  45 ++++++++++
 xen/include/xen/sched.h             |  18 +++-
 18 files changed, 674 insertions(+), 15 deletions(-)
 create mode 100644 xen/arch/x86/hvm/deprivileged_asm.S

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 045f6ff..a0e5e70 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -62,6 +62,7 @@
 #include <xen/iommu.h>
 #include <compat/vcpu.h>
 #include <asm/psr.h>
+#include <xen/hvm/deprivileged.h>
 
 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
 DEFINE_PER_CPU(unsigned long, cr4);
@@ -446,6 +447,12 @@ int vcpu_initialise(struct vcpu *v)
     if ( has_hvm_container_domain(d) )
     {
         rc = hvm_vcpu_initialise(v);
+
+        /* Initialise HVM deprivileged mode */
+        printk("HVM initialising deprivileged mode ...");
+        hvm_deprivileged_prepare_vcpu(v);
+        printk("Done.\n");
+
         goto done;
     }
 
@@ -523,7 +530,12 @@ void vcpu_destroy(struct vcpu *v)
     vcpu_destroy_fpu(v);
 
     if ( has_hvm_container_vcpu(v) )
+    {
+        /* Destroy the deprivileged mode on this vcpu */
+        hvm_deprivileged_destroy_vcpu(v);
+
         hvm_vcpu_destroy(v);
+    }
     else
         xfree(v->arch.pv_vcpu.trap_ctxt);
 }
diff --git a/xen/arch/x86/hvm/Makefile b/xen/arch/x86/hvm/Makefile
index df5ebb8..e16960a 100644
--- a/xen/arch/x86/hvm/Makefile
+++ b/xen/arch/x86/hvm/Makefile
@@ -3,6 +3,7 @@ subdir-y += vmx
 
 obj-y += asid.o
 obj-y += deprivileged.o
+obj-y += deprivileged_asm.o
 obj-y += emulate.o
 obj-y += event.o
 obj-y += hpet.o
diff --git a/xen/arch/x86/hvm/deprivileged.c b/xen/arch/x86/hvm/deprivileged.c
index 0075523..5574c50 100644
--- a/xen/arch/x86/hvm/deprivileged.c
+++ b/xen/arch/x86/hvm/deprivileged.c
@@ -536,3 +536,106 @@ struct page_info *hvm_deprivileged_alloc_page(struct domain *d)
 
     return pg;
 }
+
+/* Used to prepare each vcpu's data for user mode. Call for each HVM vcpu. */
+int hvm_deprivileged_prepare_vcpu(struct vcpu *vcpu)
+{
+    vcpu->arch.hvm_vcpu.depriv_rsp = 0;
+    vcpu->arch.hvm_vcpu.depriv_user_mode = 0;
+    vcpu->arch.hvm_vcpu.depriv_destroy = 0;
+    vcpu->arch.hvm_vcpu.depriv_watchdog_count = 0;
+
+    return 0;
+}
+
+/* Called on destroying each vcpu */
+void hvm_deprivileged_destroy_vcpu(struct vcpu *vcpu)
+{
+
+}
+
+/*
+ * Called to perform a user mode operation.
+ * Execution context is preserved and then we move into user mode.
+ * This method is then jumped into to restore execution context after
+ * exiting user mode.
+ */
+void hvm_deprivileged_user_mode(void)
+{
+    struct vcpu *vcpu = get_current();
+
+    ASSERT( vcpu->arch.hvm_vcpu.depriv_user_mode == 0 );
+    ASSERT( vcpu->arch.hvm_vcpu.depriv_rsp == 0 );
+
+    vcpu->arch.hvm_vcpu.depriv_ctxt_switch_to(vcpu);
+
+    /* The assembly routine to handle moving into/out of deprivileged mode */
+    hvm_deprivileged_user_mode_asm();
+
+    vcpu->arch.hvm_vcpu.depriv_ctxt_switch_from(vcpu);
+
+    vcpu->arch.hvm_vcpu.depriv_user_mode = 0;
+    vcpu->arch.hvm_vcpu.depriv_rsp = 0;
+}
+
+/*
+ * We need to be able to handle interrupts and exceptions whilst in deprivileged
+ * mode. Xen is non-preemptible so our privileged mode stack would be clobbered
+ * if we took an exception/interrupt, syscall or sysenter whilst in deprivileged
+ * mode.
+ *
+ * To handle this, we set up another set of stacks for interrupts/exceptions,
+ * syscall and sysenter. This is done by
+ * - changing TSS.rsp0 so that interrupts and exceptions are taken on a part of
+ *   the Xen stack past our current rsp.
+ * - moving the syscall and sysenter stacks so these are also moved past our
+ *   execution point.
+ *
+ * This function is called at the point where this rsp is as deep as it will
+ * be on the return path so we can safely clobber after it. It has also been
+ * aligned as needed for a stack pointer.
+ * We do not need to change the IST stack pointers as these are already taken on
+ * different stacks so won't clobber our current Xen stack.
+ *
+ * New Stack Layout
+ * ----------------
+ *
+ * Xen's cpu stacks are 8 pages (8-page aligned), arranged as:
+ *
+ * 7 - Primary stack (with a struct cpu_info at the top)
+ * 6 - Primary stack
+ *     - Somewhere in 6 and 7 (depending upon where rsp is when we enter
+ *       deprivileged mode), we set the syscall/sysenter and exception stack
+ *       pointers so that they are below the current rsp.
+ * 5 - Optionally not present (MEMORY_GUARD)
+ * 4 - unused
+ * 3 - Syscall trampolines
+ * 2 - MCE IST stack
+ * 1 - NMI IST stack
+ * 0 - Double Fault IST stack
+ */
+void hvm_deprivileged_setup_stacks(unsigned long stack_ptr)
+{
+    get_current()->arch.hvm_vcpu.depriv_setup_stacks(stack_ptr);
+}
+
+/*
+ * Restore the old TSS.rsp0 for the interrupt/exception stack and the
+ * syscall/sysenter stacks.
+ */
+void hvm_deprivileged_restore_stacks(void)
+{
+    get_current()->arch.hvm_vcpu.depriv_restore_stacks();
+}
+
+/*
+ * Called when the user mode operation has completed.
+ * Perform C-level processing on the return path.
+ */
+void hvm_deprivileged_finish_user_mode(void)
+{
+    /* If we are not returning from user mode: bail */
+    ASSERT(get_current()->arch.hvm_vcpu.depriv_user_mode == 1);
+
+    hvm_deprivileged_finish_user_mode_asm();
+}
diff --git a/xen/arch/x86/hvm/deprivileged_asm.S b/xen/arch/x86/hvm/deprivileged_asm.S
new file mode 100644
index 0000000..07d4216
--- /dev/null
+++ b/xen/arch/x86/hvm/deprivileged_asm.S
@@ -0,0 +1,167 @@
+/*
+ * HVM deprivileged mode assembly code
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/softirq.h>
+#include <asm/asm_defns.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#include <public/xen.h>
+#include <irq_vectors.h>
+#include <xen/hvm/deprivileged.h>
+
+/*
+ * Handles entry into the deprivileged mode and returning from this
+ * mode.
+ *
+ * If we are entering deprivileged mode, then we use a sysret to get there.
+ * If we are returning from deprivileged mode, then we need to unwind the stack,
+ * so we push the return address onto the current stack so that we can return
+ * into this function and then return from it, unwinding the stack.
+ *
+ * We're doing a sort of setjmp/longjmp with copying to a stack to
+ * preserve it and allow the returning code to continue executing from
+ * within this function.
+ */
+ENTRY(hvm_deprivileged_user_mode_asm)
+        /* Save our registers */
+        push   %rax
+        push   %rbx
+        push   %rcx
+        push   %rdx
+        push   %rsi
+        push   %rdi
+        push   %rbp
+        push   %r8
+        push   %r9
+        push   %r10
+        push   %r11
+        push   %r12
+        push   %r13
+        push   %r14
+        push   %r15
+        pushfq
+
+        /* Perform a near call to push rip onto the stack */
+        call   1f
+
+        /*
+         * MAGIC: Add to the stored rip the size of the code between
+         * label 1 and label 2. This allows us to restart execution at label 2.
+         */
+1:      addq   $2f-1b, (%rsp)
+
+        /*
+         * Setup the stack pointers for exceptions, syscall and sysenter to be
+         * just after our current rsp, adjusted for 16 byte alignment.
+         */
+        mov    %rsp, %rdi
+        and    $-16,  %rdi
+        call   hvm_deprivileged_setup_stacks
+        /*
+         * DO NOT push any more data onto the stack from here unless returning
+         * from user mode. It will be clobbered by exceptions/interrupts,
+         * syscall and sysenter.
+         */
+
+/* USER MODE ENTRY POINT */
+2:
+        GET_CURRENT(%r8)
+        movq   VCPU_depriv_user_mode(%r8), %rdx
+
+        /* If !user_mode  */
+        cmpq   $0, %rdx
+        jne    3f
+        cli
+
+        movq   %rsp, VCPU_depriv_rsp(%r8)        /* The rsp to restore to */
+        movabs $HVM_DEPRIVILEGED_TEXT_ADDR, %rcx /* RIP in user mode */
+
+        /* RFLAGS user mode */
+        movq   $(X86_EFLAGS_IF | X86_EFLAGS_VIP), %r11
+        movq   $1, VCPU_depriv_user_mode(%r8)    /* Now in user mode */
+
+        /*
+         * Stack ptr is set by user mode. If we set rsp to the user mode stack
+         * pointer here and subsequently took an interrupt or exception between
+         * setting it and executing sysret, then the interrupt would use the
+         * user mode stack pointer. This is because the current stack rsp is
+         * used if the exception descriptor's privilege level = CPL.
+         * See Intel manual volume 3A section 6.12.1 and AMD manual volume 2,
+         * section 8.9.3. Also see Intel manual volume 2 and AMD manual 3 on
+         * the sysret instruction.
+         */
+        movq   $HVM_STACK_PTR, %rbx
+        sysretq                         /* Enter deprivileged mode */
+
+3:      call   hvm_deprivileged_restore_stacks
+
+        /*
+         * Restore registers
+         * The return rip has been popped by the ret on the return path
+         */
+        popfq
+        pop    %r15
+        pop    %r14
+        pop    %r13
+        pop    %r12
+        pop    %r11
+        pop    %r10
+        pop    %r9
+        pop    %r8
+        pop    %rbp
+        pop    %rdi
+        pop    %rsi
+        pop    %rdx
+        pop    %rcx
+        pop    %rbx
+        pop    %rax
+        ret
+
+/* Finished in user mode so return */
+ENTRY(hvm_deprivileged_finish_user_mode_asm)
+        /* Reset rsp to the old rsp */
+        cli
+        GET_CURRENT(%rbx)
+        movq   VCPU_depriv_rsp(%rbx), %rsp
+
+        /*
+         * The return address that the near call pushed onto the
+         * stack is pointed to by rsp, so use that for rip.
+         */
+        /* Go to user mode return code */
+        ret
+
+/* Entry point from the assembly syscall handlers */
+ENTRY(hvm_deprivileged_handle_user_mode)
+
+        /* Handle a user mode hypercall here */
+
+
+        /* We are finished in user mode */
+        call hvm_deprivileged_finish_user_mode
+
+        ret
+
+.section .hvm_deprivileged_enhancement.text,"ax"
+/* HVM deprivileged code */
+ENTRY(hvm_deprivileged_ring3)
+        /*
+         * sysret has loaded eip from rcx and rflags from r11.
+         * CS and SS have been loaded from the MSR for ring 3.
+         * We now need to switch to the user mode stack
+         */
+        movabs $HVM_STACK_PTR, %rsp
+
+        /* Perform user mode processing */
+        movabs $0xff, %rcx
+1: dec  %rcx
+        cmp $0, %rcx
+        jne 1b
+
+        /* Return to ring 0 */
+        syscall
+
+.previous
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index 8de41fa..3393fb5 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -61,6 +61,11 @@
 #include <asm/apic.h>
 #include <asm/debugger.h>
 #include <asm/xstate.h>
+#include <xen/hvm/deprivileged.h>
+
+/* HVM svm MSR_{L}STAR cache */
+DEFINE_PER_CPU(u64, svm_depriv_msr_lstar);
+DEFINE_PER_CPU(u64, svm_depriv_msr_star);
 
 void svm_asm_do_resume(void);
 
@@ -962,12 +967,30 @@ static inline void svm_tsc_ratio_save(struct vcpu *v)
         wrmsrl(MSR_AMD64_TSC_RATIO, DEFAULT_TSC_RATIO);
 }
 
+unsigned long svm_depriv_read_msr_star(void)
+{
+    return this_cpu(svm_depriv_msr_star);
+}
+
+void svm_depriv_write_msr_star(unsigned long star)
+{
+    this_cpu(svm_depriv_msr_star) = star;
+}
+unsigned long svm_depriv_read_msr_lstar(void)
+{
+    return this_cpu(svm_depriv_msr_lstar);
+}
+
+void svm_depriv_write_msr_lstar(unsigned long lstar)
+{
+    this_cpu(svm_depriv_msr_lstar) = lstar;
+}
+
 static inline void svm_tsc_ratio_load(struct vcpu *v)
 {
     if ( cpu_has_tsc_ratio && !v->domain->arch.vtsc ) 
         wrmsrl(MSR_AMD64_TSC_RATIO, vcpu_tsc_ratio(v));
 }
-
 static void svm_ctxt_switch_from(struct vcpu *v)
 {
     int cpu = smp_processor_id();
@@ -1030,6 +1053,93 @@ static void svm_ctxt_switch_to(struct vcpu *v)
         wrmsrl(MSR_TSC_AUX, hvm_msr_tsc_aux(v));
 }
 
+static void svm_depriv_ctxt_switch_from(struct vcpu *v)
+{
+
+    svm_ctxt_switch_to(v);
+    vcpu_restore_fpu_eager(v);
+
+    /* Restore the efer and saved msr registers */
+    write_efer(v->arch.hvm_vcpu.depriv_efer);
+}
+
+/* Setup our stack pointers for interrupts/exceptions, and SYSCALL. */
+static void svm_depriv_setup_stacks(unsigned long stack_ptr)
+{
+    struct vcpu *vcpu = get_current();
+    struct tss_struct *tss = &this_cpu(init_tss);
+    unsigned char *stub_page;
+    unsigned long stub_va = this_cpu(stubs.addr);
+    unsigned int offset;
+
+    /* Save the current rsp0 */
+    vcpu->arch.hvm_vcpu.depriv_tss_rsp0 = tss->rsp0;
+
+    /* Setup the stack for interrupts/exceptions */
+    tss->rsp0 = stack_ptr;
+
+    /* Stacks for syscall and sysenter */
+    stub_page = map_domain_page(_mfn(this_cpu(stubs.mfn)));
+
+    offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                   stub_va, stack_ptr,
+                                   (unsigned long)lstar_enter);
+
+    stub_va += offset;
+
+    offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                   stub_va, stack_ptr,
+                                   (unsigned long)cstar_enter);
+
+    /* Don't consume more than half of the stub space here. */
+    ASSERT(offset <= STUB_BUF_SIZE / 2);
+
+    unmap_domain_page(stub_page);
+}
+
+static void svm_depriv_restore_stacks(void)
+{
+    struct vcpu* vcpu = get_current();
+    struct tss_struct *tss = &this_cpu(init_tss);
+    unsigned char *stub_page;
+    unsigned long stack_bottom = get_stack_bottom();
+    unsigned long stub_va = this_cpu(stubs.addr);
+    unsigned int offset;
+
+    stub_page = map_domain_page(_mfn(this_cpu(stubs.mfn)));
+
+    /* Restore the old rsp0 */
+    tss->rsp0 = vcpu->arch.hvm_vcpu.depriv_tss_rsp0;
+
+    /* Restore the old syscall/sysenter stacks */
+    offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                   stub_va, stack_bottom,
+                                   (unsigned long)lstar_enter);
+    stub_va += offset;
+
+    /* Trampoline for SYSCALL entry from compatibility mode. */
+    offset += write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                    stub_va, stack_bottom,
+                                    (unsigned long)cstar_enter);
+
+    /* Don't consume more than half of the stub space here. */
+    ASSERT(offset <= STUB_BUF_SIZE / 2);
+
+    unmap_domain_page(stub_page);
+}
+
+static void svm_depriv_ctxt_switch_to(struct vcpu *v)
+{
+    vcpu_save_fpu(v);
+    svm_ctxt_switch_from(v);
+
+    v->arch.hvm_vcpu.depriv_efer = read_efer();
+
+    /* Flip the SCE bit to allow sysret/call */
+    write_efer(v->arch.hvm_vcpu.depriv_efer | EFER_SCE);
+}
+
+
 static void noreturn svm_do_resume(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
@@ -1156,6 +1266,12 @@ static int svm_vcpu_initialise(struct vcpu *v)
 
     v->arch.hvm_svm.launch_core = -1;
 
+    /* HVM deprivileged mode operations */
+    v->arch.hvm_vcpu.depriv_ctxt_switch_to   = svm_depriv_ctxt_switch_to;
+    v->arch.hvm_vcpu.depriv_ctxt_switch_from = svm_depriv_ctxt_switch_from;
+    v->arch.hvm_vcpu.depriv_setup_stacks     = svm_depriv_setup_stacks;
+    v->arch.hvm_vcpu.depriv_restore_stacks   = svm_depriv_restore_stacks;
+
     if ( (rc = svm_create_vmcb(v)) != 0 )
     {
         dprintk(XENLOG_WARNING,
@@ -2547,7 +2663,19 @@ void svm_vmexit_handler(struct cpu_user_regs *regs)
         {
             uint16_t port = (vmcb->exitinfo1 >> 16) & 0xFFFF;
             int bytes = ((vmcb->exitinfo1 >> 4) & 0x07);
+
             int dir = (vmcb->exitinfo1 & 1) ? IOREQ_READ : IOREQ_WRITE;
+            /* DEBUG: Run only for a specific port */
+            if ( port == 0x1000 )
+            {
+                if ( guest_cpu_user_regs()->eax == 0x1 )
+                {
+                    hvm_deprivileged_user_mode();
+                }
+                __update_guest_eip(regs, vmcb->exitinfo2 - vmcb->rip);
+                break;
+            }
+
             if ( handle_pio(port, bytes, dir) )
                 __update_guest_eip(regs, vmcb->exitinfo2 - vmcb->rip);
         }
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 2582cdd..1ec23f9 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -59,6 +59,8 @@
 #include <asm/event.h>
 #include <asm/monitor.h>
 #include <public/arch-x86/cpuid.h>
+#include <xen/hvm/deprivileged.h>
+
 
 static bool_t __initdata opt_force_ept;
 boolean_param("force-ept", opt_force_ept);
@@ -68,6 +70,11 @@ enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
 static void vmx_ctxt_switch_from(struct vcpu *v);
 static void vmx_ctxt_switch_to(struct vcpu *v);
 
+static void vmx_depriv_ctxt_switch_from(struct vcpu *v);
+static void vmx_depriv_ctxt_switch_to(struct vcpu *v);
+static void vmx_depriv_setup_stacks(unsigned long stack_ptr);
+static void vmx_depriv_restore_stacks(void);
+
 static int  vmx_alloc_vlapic_mapping(struct domain *d);
 static void vmx_free_vlapic_mapping(struct domain *d);
 static void vmx_install_vlapic_mapping(struct vcpu *v);
@@ -110,6 +117,12 @@ static int vmx_vcpu_initialise(struct vcpu *v)
     v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
     v->arch.ctxt_switch_to   = vmx_ctxt_switch_to;
 
+    /* HVM deprivileged mode operations */
+    v->arch.hvm_vcpu.depriv_ctxt_switch_to   = vmx_depriv_ctxt_switch_to;
+    v->arch.hvm_vcpu.depriv_ctxt_switch_from = vmx_depriv_ctxt_switch_from;
+    v->arch.hvm_vcpu.depriv_setup_stacks     = vmx_depriv_setup_stacks;
+    v->arch.hvm_vcpu.depriv_restore_stacks   = vmx_depriv_restore_stacks;
+
     if ( (rc = vmx_create_vmcs(v)) != 0 )
     {
         dprintk(XENLOG_WARNING,
@@ -272,6 +285,7 @@ long_mode_do_msr_write(unsigned int msr, uint64_t msr_content)
     case MSR_LSTAR:
         if ( !is_canonical_address(msr_content) )
             goto uncanonical_address;
+
         WRITE_MSR(LSTAR);
         break;
 
@@ -707,6 +721,98 @@ static void vmx_fpu_leave(struct vcpu *v)
     }
 }
 
+static void vmx_depriv_setup_stacks(unsigned long stack_ptr)
+{
+    struct vcpu *vcpu = get_current();
+    struct tss_struct *tss = &this_cpu(init_tss);
+    unsigned char *stub_page;
+    unsigned long stub_va = this_cpu(stubs.addr);
+    unsigned int offset;
+
+    /* Save the current rsp0 */
+    vcpu->arch.hvm_vcpu.depriv_tss_rsp0 = tss->rsp0;
+
+    /* Setup the stack for interrupts/exceptions */
+    tss->rsp0 = stack_ptr;
+
+    /* Stacks for syscall and sysenter */
+    stub_page = map_domain_page(_mfn(this_cpu(stubs.mfn)));
+
+    offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                   stub_va, stack_ptr,
+                                   (unsigned long)lstar_enter);
+
+    stub_va += offset;
+
+    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
+         boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR )
+    {
+        wrmsrl(MSR_IA32_SYSENTER_ESP, stack_ptr);
+    }
+
+    offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                   stub_va, stack_ptr,
+                                   (unsigned long)cstar_enter);
+
+    /* Don't consume more than half of the stub space here. */
+    ASSERT(offset <= STUB_BUF_SIZE / 2);
+
+    unmap_domain_page(stub_page);
+}
+
+static void vmx_depriv_restore_stacks(void)
+{
+    struct vcpu* vcpu = get_current();
+    struct tss_struct *tss = &this_cpu(init_tss);
+    unsigned char *stub_page;
+    unsigned long stack_bottom = get_stack_bottom();
+    unsigned long stub_va = this_cpu(stubs.addr);
+    unsigned int offset;
+
+    stub_page = map_domain_page(_mfn(this_cpu(stubs.mfn)));
+
+    /* Restore the old rsp0 */
+    tss->rsp0 = vcpu->arch.hvm_vcpu.depriv_tss_rsp0;
+
+    /* Restore the old syscall/sysenter stacks */
+    offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                   stub_va, stack_bottom,
+                                   (unsigned long)lstar_enter);
+    stub_va += offset;
+
+    wrmsrl(MSR_IA32_SYSENTER_ESP, stack_bottom);
+
+    /* Trampoline for SYSCALL entry from compatibility mode. */
+    offset += write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                    stub_va, stack_bottom,
+                                    (unsigned long)cstar_enter);
+
+    /* Don't consume more than half of the stub space here. */
+    ASSERT(offset <= STUB_BUF_SIZE / 2);
+
+    unmap_domain_page(stub_page);
+}
+
+static void vmx_depriv_ctxt_switch_from(struct vcpu *v)
+{
+    vmx_ctxt_switch_to(v);
+    vcpu_save_fpu(v);
+
+    /* Restore the efer and saved msr registers */
+    write_efer(v->arch.hvm_vcpu.depriv_efer);
+}
+
+static void vmx_depriv_ctxt_switch_to(struct vcpu *v)
+{
+    vcpu_save_fpu(v);
+    vmx_ctxt_switch_from(v);
+
+    v->arch.hvm_vcpu.depriv_efer = read_efer();
+
+    /* Flip the SCE bit to allow sysret/call */
+    write_efer(v->arch.hvm_vcpu.depriv_efer | EFER_SCE);
+}
+
 static void vmx_ctxt_switch_from(struct vcpu *v)
 {
     /*
@@ -3341,6 +3447,18 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
             uint16_t port = (exit_qualification >> 16) & 0xFFFF;
             int bytes = (exit_qualification & 0x07) + 1;
             int dir = (exit_qualification & 0x08) ? IOREQ_READ : IOREQ_WRITE;
+
+            /* DEBUG: Run only for a specific port */
+            if ( port == 0x1000 )
+            {
+                if ( guest_cpu_user_regs()->eax == 0x1 )
+                {
+                    hvm_deprivileged_user_mode();
+                }
+                update_guest_eip(); /* Safe: IN, OUT */
+                break;
+            }
+
             if ( handle_pio(port, bytes, dir) )
                 update_guest_eip(); /* Safe: IN, OUT */
         }
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 4048929..5633e82 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -40,7 +40,7 @@
 #include <asm/domain.h>
 #include <xen/numa.h>
 #include <asm/hvm/nestedhvm.h>
-
+#include <asm/hvm/vmx/vmx.h>
 #include "private.h"
 #include <xen/hvm/deprivileged.h>
 
diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
index 447c650..7af824a 100644
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -115,6 +115,11 @@ void __dummy__(void)
     OFFSET(VCPU_nsvm_hap_enabled, struct vcpu, arch.hvm_vcpu.nvcpu.u.nsvm.ns_hap_enabled);
     BLANK();
 
+    OFFSET(VCPU_depriv_rsp, struct vcpu, arch.hvm_vcpu.depriv_rsp);
+    OFFSET(VCPU_depriv_user_mode, struct vcpu, arch.hvm_vcpu.depriv_user_mode);
+    OFFSET(VCPU_depriv_destroy, struct vcpu, arch.hvm_vcpu.depriv_destroy);
+    BLANK();
+
     OFFSET(DOMAIN_is_32bit_pv, struct domain, arch.is_32bit_pv);
     BLANK();
 
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 74677a2..9590065 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -102,6 +102,18 @@ restore_all_xen:
         RESTORE_ALL adj=8
         iretq
 
+
+/* Returning from user mode */
+ENTRY(handle_hvm_user_mode)
+
+        call hvm_deprivileged_handle_user_mode
+
+        /* fallthrough */
+hvm_depriv_mode:
+
+        /* Go back into user mode */
+        jmp   restore_all_guest
+
 /*
  * When entering SYSCALL from kernel mode:
  *  %rax                            = hypercall vector
@@ -128,6 +140,11 @@ ENTRY(lstar_enter)
         pushq $0
         SAVE_VOLATILE TRAP_syscall
         GET_CURRENT(%rbx)
+
+        /* Were we in Xen's ring 3?  */
+        cmpq  $1, VCPU_depriv_user_mode(%rbx)
+        je    handle_hvm_user_mode
+
         testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
         jz    switch_to_kernel
 
@@ -487,6 +504,10 @@ ENTRY(common_interrupt)
 /* No special register assumptions. */
 ENTRY(ret_from_intr)
         GET_CURRENT(%rbx)
+
+        /* If we are in Xen's user mode */
+        cmpq  $1,VCPU_depriv_user_mode(%rbx)
+        je    hvm_depriv_mode
         testb $3,UREGS_cs(%rsp)
         jz    restore_all_xen
         movq  VCPU_domain(%rbx),%rax
@@ -509,6 +530,10 @@ handle_exception_saved:
         GET_CURRENT(%rbx)
         PERFC_INCR(exceptions, %rax, %rbx)
         callq *(%rdx,%rax,8)
+
+        /* If we are in Xen's user mode */
+        cmpq  $1, VCPU_depriv_user_mode(%rbx)
+        je    hvm_depriv_mode
         testb $3,UREGS_cs(%rsp)
         jz    restore_all_xen
         leaq  VCPU_trap_bounce(%rbx),%rdx
@@ -636,15 +661,7 @@ ENTRY(nmi)
         movl  $TRAP_nmi,4(%rsp)
 handle_ist_exception:
         SAVE_ALL CLAC
-        testb $3,UREGS_cs(%rsp)
-        jz    1f
-        /* Interrupted guest context. Copy the context to stack bottom. */
-        GET_CPUINFO_FIELD(guest_cpu_user_regs,%rdi)
-        movq  %rsp,%rsi
-        movl  $UREGS_kernel_sizeof/8,%ecx
-        movq  %rdi,%rsp
-        rep   movsq
-1:      movq  %rsp,%rdi
+        movq  %rsp,%rdi
         movzbl UREGS_entry_vector(%rsp),%eax
         leaq  exception_table(%rip),%rdx
         callq *(%rdx,%rax,8)
@@ -664,6 +681,9 @@ handle_ist_exception:
         movl  $EVENT_CHECK_VECTOR,%edi
         call  send_IPI_self
 1:      movq  VCPU_domain(%rbx),%rax
+        /* This also handles Xen ring3 return for us.
+         * So, there is no need to explicitly do a user mode check.
+         */
         cmpb  $0,DOMAIN_is_32bit_pv(%rax)
         je    restore_all_guest
         jmp   compat_restore_all_guest
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index 0846a19..c7e6077 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -24,6 +24,7 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <public/callback.h>
+#include <asm/hvm/svm/svm.h>
 
 
 static void print_xen_info(void)
@@ -337,7 +338,7 @@ unsigned long do_iret(void)
     return 0;
 }
 
-static unsigned int write_stub_trampoline(
+unsigned int write_stub_trampoline(
     unsigned char *stub, unsigned long stub_va,
     unsigned long stack_bottom, unsigned long target_va)
 {
@@ -368,8 +369,6 @@ static unsigned int write_stub_trampoline(
 }
 
 DEFINE_PER_CPU(struct stubs, stubs);
-void lstar_enter(void);
-void cstar_enter(void);
 
 void __devinit subarch_percpu_traps_init(void)
 {
@@ -385,6 +384,14 @@ void __devinit subarch_percpu_traps_init(void)
 
     /* Trampoline for SYSCALL entry from 64-bit mode. */
     wrmsrl(MSR_LSTAR, stub_va);
+
+    /*
+     * HVM deprivileged mode on AMD. Writes to MSR_{L}STAR
+     * are not trapped, so we need to keep a copy of the host's MSRs.
+     */
+    svm_depriv_write_msr_star((unsigned long)((FLAT_RING3_CS32<<16) | __HYPERVISOR_CS) << 32);
+    svm_depriv_write_msr_lstar(stub_va);
+
     offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
                                    stub_va, stack_bottom,
                                    (unsigned long)lstar_enter);
diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
index f011d2d..c1dae3a 100644
--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -23,6 +23,8 @@
  * 2 - MCE IST stack
  * 1 - NMI IST stack
  * 0 - Double Fault IST stack
+ *
+ * NOTE: This layout changes slightly in HVM deprivileged mode.
  */
 
 /*
diff --git a/xen/include/asm-x86/hvm/svm/svm.h b/xen/include/asm-x86/hvm/svm/svm.h
index d60ec23..45dd125 100644
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -110,4 +110,17 @@ extern void svm_host_osvw_init(void);
 #define _NPT_PFEC_in_gpt       33
 #define NPT_PFEC_in_gpt        (1UL<<_NPT_PFEC_in_gpt)
 
+/*
+ * HVM deprivileged mode SVM cache of the host MSR_{L}STAR values.
+ * SVM does not trap guest writes to these so we
+ * need to preserve them.
+ */
+DECLARE_PER_CPU(u64, svm_depriv_msr_lstar);
+DECLARE_PER_CPU(u64, svm_depriv_msr_star);
+
+unsigned long svm_depriv_read_msr_star(void);
+void svm_depriv_write_msr_star(unsigned long star);
+unsigned long svm_depriv_read_msr_lstar(void);
+void svm_depriv_write_msr_lstar(unsigned long lstar);
+
 #endif /* __ASM_X86_HVM_SVM_H__ */
diff --git a/xen/include/asm-x86/hvm/vcpu.h b/xen/include/asm-x86/hvm/vcpu.h
index f553814..f7df9d4 100644
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -202,6 +202,21 @@ struct hvm_vcpu {
     void (*fpu_exception_callback)(void *, struct cpu_user_regs *);
     void *fpu_exception_callback_arg;
 
+    /* Context switching for HVM deprivileged mode */
+    void (*depriv_ctxt_switch_to)(struct vcpu *v);
+    void (*depriv_ctxt_switch_from)(struct vcpu *v);
+    void (*depriv_setup_stacks)(unsigned long stack_ptr);
+    void (*depriv_restore_stacks)(void);
+
+    /* HVM deprivileged mode state */
+    struct segment_register depriv_tr;
+    unsigned long depriv_rsp;      /* rsp of our stack to restore our data to */
+    unsigned long depriv_user_mode;   /* Are we in user mode */
+    unsigned long depriv_efer;
+    unsigned long depriv_tss_rsp0;
+    unsigned long depriv_destroy;
+    unsigned long depriv_watchdog_count;
+    
     /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */
     struct hvm_trap     inject_trap;
 
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index 3fbfa44..98e269e 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -565,4 +565,6 @@ typedef struct {
     u16 eptp_index;
 } ve_info_t;
 
+struct vmx_msr_state *get_host_msr_state(void);
+
 #endif /* __ASM_X86_HVM_VMX_VMX_H__ */
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index f507f5e..0fde516 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -547,6 +547,8 @@ void sysenter_entry(void);
 void sysenter_eflags_saved(void);
 void compat_hypercall(void);
 void int80_direct_trap(void);
+void lstar_enter(void);
+void cstar_enter(void);
 
 #define STUBS_PER_PAGE (PAGE_SIZE / STUB_BUF_SIZE)
 
diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h
index 25a6a2a..e092f36 100644
--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -240,5 +240,8 @@ void init_idt_traps(void);
 void load_system_tables(void);
 void percpu_traps_init(void);
 void subarch_percpu_traps_init(void);
+unsigned int write_stub_trampoline(
+    unsigned char *stub, unsigned long stub_va,
+    unsigned long stack_bottom, unsigned long target_va);
 
 #endif
diff --git a/xen/include/xen/hvm/deprivileged.h b/xen/include/xen/hvm/deprivileged.h
index defc89d..5915224 100644
--- a/xen/include/xen/hvm/deprivileged.h
+++ b/xen/include/xen/hvm/deprivileged.h
@@ -1,5 +1,7 @@
 #ifndef __X86_HVM_DEPRIVILEGED
 
+/* This is also included in the HVM deprivileged mode .S file */
+#ifndef __ASSEMBLY__
 #define __X86_HVM_DEPRIVILEGED
 
 #include <asm/page.h>
@@ -75,11 +77,46 @@ int hvm_deprivileged_map_l1(struct domain *d,
 /* Used to allocate a page for the deprivileged mode */
 struct page_info *hvm_deprivileged_alloc_page(struct domain *d);
 
+/* Used to prepare each vcpu's data for user mode. Call for each HVM vcpu. */
+int hvm_deprivileged_prepare_vcpu(struct vcpu *vcpu);
+
+/* Destroy each vcpu's data for Xen user mode. Again, call for each vcpu. */
+void hvm_deprivileged_destroy_vcpu(struct vcpu *vcpu);
+
+/* Called to perform a user mode operation. */
+void hvm_deprivileged_user_mode(void);
+
+/* Called when the user mode operation has completed */
+void hvm_deprivileged_finish_user_mode(void);
+
+/* Called to move into and then out of user mode. Needed for accessing
+ * assembly features.
+ */
+void hvm_deprivileged_user_mode_asm(void);
+
+/* Called on the return path to return to the correct execution point */
+void hvm_deprivileged_finish_user_mode_asm(void);
+
+/* Handle any syscalls that the user mode makes */
+void hvm_deprivileged_handle_user_mode(void);
+
+/* Used to set up the stacks for deprivileged mode */
+void hvm_deprivileged_setup_stacks(unsigned long stack_ptr);
+
+/* Used to restore the stacks for deprivileged mode */
+void hvm_deprivileged_restore_stacks(void);
+
+/* The ring 3 code */
+void hvm_deprivileged_ring3(void);
+
 /* The segments where the user mode .text and .data are stored */
 extern unsigned long __hvm_deprivileged_text_start[];
 extern unsigned long __hvm_deprivileged_text_end[];
 extern unsigned long __hvm_deprivileged_data_start[];
 extern unsigned long __hvm_deprivileged_data_end[];
+
+#endif
+
 #define HVM_DEPRIV_STACK_SIZE (PAGE_SIZE << 1)
 #define HVM_DEPRIV_STACK_ORDER 1
 #define HVM_DEPRIV_DATA_SECTION_SIZE \
@@ -92,4 +129,12 @@ extern unsigned long __hvm_deprivileged_data_end[];
 #define HVM_DEPRIV_ALIAS 1
 #define HVM_DEPRIV_COPY 0
 
+/*
+ * The user mode stack pointer.
+ * The stack grows down so set this to the top of the stack region. Then,
+ * as this is 0-indexed, move into the stack, not just after it.
+ * Subtract 16 bytes for correct stack alignment.
+ */
+#define HVM_STACK_PTR (HVM_DEPRIVILEGED_STACK_ADDR + HVM_DEPRIV_STACK_SIZE - 16)
+
 #endif
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 66f4f5e..6c05969 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -137,7 +137,7 @@ void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */
 
 struct waitqueue_vcpu;
 
-struct vcpu 
+struct vcpu
 {
     int              vcpu_id;
 
@@ -158,6 +158,22 @@ struct vcpu
 
     void            *sched_priv;    /* scheduler-specific data */
 
+    /* HVM deprivileged mode state */
+    void *stack;             /* Location of stack to save data onto */
+    unsigned long rsp;       /* rsp of our stack to restore our data to */
+    unsigned long user_mode; /* Are we in (or moving into) user mode? */
+
+    /* The MSR_{L}STAR values of the processor we are currently executing on.
+     * We need to save these because Xen does lazy saving of them.
+     */
+    unsigned long int msr_lstar; /* lstar */
+    unsigned long int msr_star;
+
+    /* Debug info */
+    unsigned long int old_rsp;
+    unsigned long int old_processor;
+    unsigned long int old_msr_lstar;
+    unsigned long int old_msr_star;
     struct vcpu_runstate_info runstate;
 #ifndef CONFIG_COMPAT
 # define runstate_guest(v) ((v)->runstate_guest)
-- 
2.1.4

