
[Xen-devel] [PATCH 24/24] PVH xen: introduce vmexit handler for PVH

This patch contains the VMX exit handler for PVH guests. Note that it
contains a macro, dbgp1, to print vmexit reasons and a lot of other data
to go with it; it can be enabled by setting pvhdbg to 1. This can be very
useful for debugging during the first few months of testing, after which
it can be removed at the maintainer's discretion.

Changes in V2:
  - Move non VMX generic code to arch/x86/hvm/pvh.c
  - Remove get_gpr_ptr() and use existing decode_register() instead.
  - Defer the call to the PVH VMX exit handler until interrupts are enabled,
    so the caller of vmx_pvh_vmexit_handler() now handles
    NMI/EXT-INT/TRIPLE_FAULT.
  - Fix CPUID (wrongly) clearing bit 24. There is no need to do this now;
    instead, set the correct feature bits in CR4 during VMCS creation.
  - Fix a few hard tabs.

Changes in V3:
  - Lots of cleanup and rework in the PVH vmexit handler.
  - Add a parameter to emulate_forced_invalid_op().

Changes in V5:
  - Move pvh.c and emulate_forced_invalid_op related changes to another patch.
  - Formatting.
  - Remove vmx_pvh_read_descriptor().
  - Use SS DPL instead of CS.RPL for CPL.
  - Remove pvh_user_cpuid() and call pv_cpuid for user mode also.

Changes in V6:
  - Replace domain_crash_synchronous() with domain_crash().

Changes in V7:
  - Don't read all selectors on every vmexit. Do that only for the
    IO instruction vmexit.
  - Add couple checks and set guest_cr[4] in access_cr4().
  - Add a period after all comments in case that's an issue.
  - Move making pv_cpuid and emulate_privileged_op public here.

Changes in V8:
  - Mainly, don't read selectors on vmexit. The macros now go to the VMCS
    to read selectors on demand.

Signed-off-by: Mukesh Rathor <mukesh.rathor@xxxxxxxxxx>
---
 xen/arch/x86/hvm/vmx/pvh.c |  451 +++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 450 insertions(+), 1 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/pvh.c b/xen/arch/x86/hvm/vmx/pvh.c
index fb55ac8..508b740 100644
--- a/xen/arch/x86/hvm/vmx/pvh.c
+++ b/xen/arch/x86/hvm/vmx/pvh.c
@@ -20,9 +20,458 @@
 #include <asm/hvm/nestedhvm.h>
 #include <asm/xstate.h>
 
-/* Implemented in the next patch */
+#ifndef NDEBUG
+static int pvhdbg = 0;
+#define dbgp1(...) do { if ( pvhdbg == 1 ) printk(__VA_ARGS__); } while ( 0 )
+#else
+#define dbgp1(...) ((void)0)
+#endif
+
+/* Returns 0 if the MSR was read successfully. */
+static int vmxit_msr_read(struct cpu_user_regs *regs)
+{
+    u64 msr_content = 0;
+
+    switch ( regs->ecx )
+    {
+    case MSR_IA32_MISC_ENABLE:
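+        /*
+         * Pass the host value through, but report BTS and PEBS as
+         * unavailable to the guest.
+         */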
+        rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
+        msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
+                       MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
+        break;
+
+    default:
+        /* PVH fixme: see hvm_msr_read_intercept(). */
+        rdmsrl(regs->ecx, msr_content);
+        break;
+    }
+    regs->eax = (uint32_t)msr_content;
+    regs->edx = (uint32_t)(msr_content >> 32);
+    vmx_update_guest_eip();
+
+    dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n", regs->ecx, regs->eax,
+          regs->edx, regs->rip, regs->rsp);
+
+    return 0;
+}
+
+/* Returns 0 if the MSR was written successfully. */
+static int vmxit_msr_write(struct cpu_user_regs *regs)
+{
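+    /* WRMSR supplies the 64-bit value in EDX:EAX. */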
+    uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
+
+    dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n", regs->ecx,
+          regs->eax, regs->edx);
+
+    if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY )
+    {
+        vmx_update_guest_eip();
+        return 0;
+    }
+    return 1;
+}
+
+static int vmxit_debug(struct cpu_user_regs *regs)
+{
+    struct vcpu *vp = current;
+    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
+
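+    /*
+     * Hardware does not update DR6 on a debug vmexit; the DR6 bits are
+     * delivered in the exit qualification instead, so merge them into
+     * DR6's architecturally-set bits.
+     */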
+    write_debugreg(6, exit_qualification | 0xffff0ff0);
+
+    /* gdbsx or another debugger. Never pause dom0. */
+    if ( vp->domain->domain_id != 0 && vp->domain->debugger_attached )
+        domain_pause_for_debugger();
+    else
+        hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
+
+    return 0;
+}
+
+/* Returns: rc == 0: handled the MTF vmexit. */
+static int vmxit_mtf(struct cpu_user_regs *regs)
+{
+    struct vcpu *vp = current;
+    int rc = -EINVAL, ss = vp->arch.hvm_vcpu.single_step;
+
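+    /* MTF was set to single-step the guest; clear it and the vcpu flag. */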
+    vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
+    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vp->arch.hvm_vmx.exec_control);
+    vp->arch.hvm_vcpu.single_step = 0;
+
+    if ( vp->domain->debugger_attached && ss )
+    {
+        domain_pause_for_debugger();
+        rc = 0;
+    }
+    return rc;
+}
+
+static int vmxit_int3(struct cpu_user_regs *regs)
+{
+    int ilen = vmx_get_instruction_length();
+    struct vcpu *vp = current;
+    struct hvm_trap trap_info = {
+        .vector = TRAP_int3,
+        .type = X86_EVENTTYPE_SW_EXCEPTION,
+        .error_code = HVM_DELIVER_NO_ERROR_CODE,
+        .insn_len = ilen
+    };
+
+    /* gdbsx or another debugger. Never pause dom0. */
+    if ( vp->domain->domain_id != 0 && vp->domain->debugger_attached )
+    {
+        regs->eip += ilen;
+        dbgp1("[%d]PVH: domain pause for debugger\n", smp_processor_id());
+        current->arch.gdbsx_vcpu_event = TRAP_int3;
+        domain_pause_for_debugger();
+        return 0;
+    }
+    hvm_inject_trap(&trap_info);
+
+    return 0;
+}
+
+/* Just like HVM, a PVH guest should use "cpuid" from kernel mode. */
+static int vmxit_invalid_op(struct cpu_user_regs *regs)
+{
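+    /*
+     * Emulate the forced-invalid-op cpuid pattern for user mode only;
+     * everything else has #UD reflected back to the guest.
+     */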
+    if ( guest_kernel_mode(current, regs) || !emulate_forced_invalid_op(regs) )
+        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+
+    return 0;
+}
+
+/* Returns: rc == 0: handled the exception. */
+static int vmxit_exception(struct cpu_user_regs *regs)
+{
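+    /* The low bits of VM_EXIT_INTR_INFO hold the exception vector. */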
+    int vector = (__vmread(VM_EXIT_INTR_INFO)) & INTR_INFO_VECTOR_MASK;
+    int rc = -ENOSYS;
+
+    dbgp1(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector,
+          __vmread(GUEST_CS_SELECTOR), regs->eip);
+
+    switch ( vector )
+    {
+    case TRAP_debug:
+        rc = vmxit_debug(regs);
+        break;
+
+    case TRAP_int3:
+        rc = vmxit_int3(regs);
+        break;
+
+    case TRAP_invalid_op:
+        rc = vmxit_invalid_op(regs);
+        break;
+
+    case TRAP_no_device:
+        hvm_funcs.fpu_dirty_intercept();
+        rc = 0;
+        break;
+
+    default:
+        printk(XENLOG_G_WARNING
+               "PVH: Unhandled trap:%d. IP:%lx\n", vector, regs->eip);
+    }
+    return rc;
+}
+
+static int vmxit_vmcall(struct cpu_user_regs *regs)
+{
+    if ( hvm_do_hypercall(regs) != HVM_HCALL_preempted )
+        vmx_update_guest_eip();
+    return 0;
+}
+
+/* Returns: rc == 0: success. */
+static int access_cr0(struct cpu_user_regs *regs, uint acc_typ, uint64_t *regp)
+{
+    struct vcpu *vp = current;
+
+    if ( acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
+    {
+        unsigned long new_cr0 = *regp;
+        unsigned long old_cr0 = __vmread(GUEST_CR0);
+
+        dbgp1("PVH:writing to CR0. RIP:%lx val:0x%lx\n", regs->rip, *regp);
+        if ( (u32)new_cr0 != new_cr0 )
+        {
+            printk(XENLOG_G_WARNING
+                   "Guest setting upper 32 bits in CR0: %lx\n", new_cr0);
+            return -EPERM;
+        }
+
+        new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
+        /* ET is reserved and should always be 1. */
+        new_cr0 |= X86_CR0_ET;
+
+        /* A PVH guest is not expected to switch to real mode. */
+        if ( (new_cr0 & (X86_CR0_PE | X86_CR0_PG)) !=
+             (X86_CR0_PG | X86_CR0_PE) )
+        {
+            printk(XENLOG_G_WARNING
+                   "PVH attempting to turn off PE/PG. CR0:%lx\n", new_cr0);
+            return -EPERM;
+        }
+        /* TS going from 1 to 0. */
+        if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 & X86_CR0_TS) == 0) )
+            vmx_fpu_enter(vp);
+
+        vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = new_cr0;
+        __vmwrite(GUEST_CR0, new_cr0);
+        __vmwrite(CR0_READ_SHADOW, new_cr0);
+    }
+    else
+        *regp = __vmread(GUEST_CR0);
+
+    return 0;
+}
+
+/* Returns: rc == 0: success. */
+static int access_cr4(struct cpu_user_regs *regs, uint acc_typ, uint64_t *regp)
+{
+    if ( acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
+    {
+        struct vcpu *vp = current;
+        u64 old_val = __vmread(GUEST_CR4);
+        u64 new = *regp;
+
+        if ( new & HVM_CR4_GUEST_RESERVED_BITS(vp) )
+        {
+            printk(XENLOG_G_WARNING
+                   "PVH guest attempts to set reserved bit in CR4: %lx\n",
+                   new);
+            hvm_inject_hw_exception(TRAP_gp_fault, 0);
+            return 0;
+        }
+
+        if ( !(new & X86_CR4_PAE) && hvm_long_mode_enabled(vp) )
+        {
+            printk(XENLOG_G_WARNING "Guest cleared CR4.PAE while "
+                   "EFER.LMA is set");
+            hvm_inject_hw_exception(TRAP_gp_fault, 0);
+            return 0;
+        }
+
+        vp->arch.hvm_vcpu.guest_cr[4] = new;
+
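+    /* Flush guest TLB entries if paging-related control bits changed. */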
+        if ( (old_val ^ new) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
+            vpid_sync_all();
+
+        __vmwrite(CR4_READ_SHADOW, new);
+
+        new &= ~X86_CR4_PAE;         /* PVH always runs with hap enabled. */
+        new |= X86_CR4_VMXE | X86_CR4_MCE;
+        __vmwrite(GUEST_CR4, new);
+    }
+    else
+        *regp = __vmread(CR4_READ_SHADOW);
+
+    return 0;
+}
+
+/* Returns: rc == 0: success, else -errno. */
+static int vmxit_cr_access(struct cpu_user_regs *regs)
+{
+    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
+    uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
+    int cr, rc = -EINVAL;
+
+    switch ( acc_typ )
+    {
+    case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
+    case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
+    {
+        uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
+        uint64_t *regp = decode_register(gpr, regs, 0);
+        cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
+
+        if ( regp == NULL )
+            break;
+
+        switch ( cr )
+        {
+        case 0:
+            rc = access_cr0(regs, acc_typ, regp);
+            break;
+
+        case 3:
+            printk(XENLOG_G_ERR "PVH: unexpected cr3 vmexit. rip:%lx\n",
+                   regs->rip);
+            domain_crash(current->domain);
+            break;
+
+        case 4:
+            rc = access_cr4(regs, acc_typ, regp);
+            break;
+        }
+        if ( rc == 0 )
+            vmx_update_guest_eip();
+        break;
+    }
+
+    case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
+    {
+        struct vcpu *vp = current;
+        unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] & ~X86_CR0_TS;
+        vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = cr0;
+
+        vmx_fpu_enter(vp);
+        __vmwrite(GUEST_CR0, cr0);
+        __vmwrite(CR0_READ_SHADOW, cr0);
+        vmx_update_guest_eip();
+        rc = 0;
+    }
+    }
+    return rc;
+}
+
+/*
+ * Note: A PVH guest sets IOPL natively by setting bits in eflags, not via
+ *       the hypercalls used by a PV guest.
+ */
+static int vmxit_io_instr(struct cpu_user_regs *regs)
+{
+    struct segment_register seg;
+    int requested = (regs->rflags & X86_EFLAGS_IOPL) >> 12;
+    int curr_lvl = (regs->rflags & X86_EFLAGS_VM) ? 3 : 0;
+
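+    /* CPL is 3 in virtual-8086 mode; otherwise read it from the SS DPL. */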
+    if ( curr_lvl == 0 )
+    {
+        hvm_get_segment_register(current, x86_seg_ss, &seg);
+        curr_lvl = seg.attr.fields.dpl;
+    }
+    if ( requested >= curr_lvl && emulate_privileged_op(regs) )
+        return 0;
+
+    hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
+    return 0;
+}
+
+static int pvh_ept_handle_violation(unsigned long qualification,
+                                    paddr_t gpa, struct cpu_user_regs *regs)
+{
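+    /*
+     * A PVH guest is not expected to cause EPT violations; log as much
+     * detail as possible and inject #GP into the guest.
+     */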
+    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
+    p2m_type_t p2mt;
+    mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn, &p2mt);
+
+    printk(XENLOG_G_ERR "EPT violation %#lx (%c%c%c/%c%c%c), "
+           "gpa %#"PRIpaddr", mfn %#lx, type %i. IP:0x%lx RSP:0x%lx\n",
+           qualification,
+           (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
+           (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
+           (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
+           (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
+           (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
+           (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
+           gpa, mfn_x(mfn), p2mt, regs->rip, regs->rsp);
+
+    ept_walk_table(current->domain, gfn);
+
+    if ( qualification & EPT_GLA_VALID )
+    {
+        gla = __vmread(GUEST_LINEAR_ADDRESS);
+        printk(XENLOG_G_ERR " --- GLA %#lx\n", gla);
+    }
+    hvm_inject_hw_exception(TRAP_gp_fault, 0);
+    return 0;
+}
+
+/*
+ * Main vmexit handler for PVH. Called from vmx_vmexit_handler().
+ * Note: vmx_asm_vmexit_handler updates rip/rsp/eflags in the regs struct.
+ */
 void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs)
 {
+    unsigned long exit_qualification;
+    unsigned int exit_reason = __vmread(VM_EXIT_REASON);
+    int rc = 0, ccpu = smp_processor_id();
+    struct vcpu *v = current;
+
+    dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx CR0:%lx\n",
+          ccpu, exit_reason, regs->rip, regs->rsp, regs->rflags,
+          __vmread(GUEST_CR0));
+
+    switch ( (uint16_t)exit_reason )
+    {
+    /* NMI and machine check are handled by the caller; we handle the rest. */
+    case EXIT_REASON_EXCEPTION_NMI:      /* 0 */
+        rc = vmxit_exception(regs);
+        break;
+
+    case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */
+        break;              /* handled in vmx_vmexit_handler() */
+
+    case EXIT_REASON_PENDING_VIRT_INTR:  /* 7 */
+        /* Disable the interrupt window. */
+        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+        break;
+
+    case EXIT_REASON_CPUID:              /* 10 */
+        pv_cpuid(regs);
+        vmx_update_guest_eip();
+        break;
+
+    case EXIT_REASON_HLT:                /* 12 */
+        vmx_update_guest_eip();
+        hvm_hlt(regs->eflags);
+        break;
+
+    case EXIT_REASON_VMCALL:             /* 18 */
+        rc = vmxit_vmcall(regs);
+        break;
+
+    case EXIT_REASON_CR_ACCESS:          /* 28 */
+        rc = vmxit_cr_access(regs);
+        break;
+
+    case EXIT_REASON_DR_ACCESS:          /* 29 */
+        exit_qualification = __vmread(EXIT_QUALIFICATION);
+        vmx_dr_access(exit_qualification, regs);
+        break;
+
+    case EXIT_REASON_IO_INSTRUCTION:     /* 30 */
+        vmxit_io_instr(regs);
+        break;
+
+    case EXIT_REASON_MSR_READ:           /* 31 */
+        rc = vmxit_msr_read(regs);
+        break;
+
+    case EXIT_REASON_MSR_WRITE:          /* 32 */
+        rc = vmxit_msr_write(regs);
+        break;
+
+    case EXIT_REASON_MONITOR_TRAP_FLAG:  /* 37 */
+        rc = vmxit_mtf(regs);
+        break;
+
+    case EXIT_REASON_MCE_DURING_VMENTRY: /* 41 */
+        break;              /* handled in vmx_vmexit_handler() */
+
+    case EXIT_REASON_EPT_VIOLATION:      /* 48 */
+    {
+        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
+        exit_qualification = __vmread(EXIT_QUALIFICATION);
+        rc = pvh_ept_handle_violation(exit_qualification, gpa, regs);
+        break;
+    }
+
+    default:
+        rc = 1;
+        printk(XENLOG_G_ERR
+               "PVH: Unexpected exit reason:%#x\n", exit_reason);
+    }
+
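+    /*
+     * Any exit left unhandled above leaves rc non-zero; dump state and
+     * crash the domain rather than resume it in an undefined state.
+     */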
+    if ( rc )
+    {
+        exit_qualification = __vmread(EXIT_QUALIFICATION);
+        printk(XENLOG_G_WARNING
+               "PVH: [%d] exit_reas:%d %#x qual:%ld 0x%lx cr0:0x%016lx\n",
+               ccpu, exit_reason, exit_reason, exit_qualification,
+               exit_qualification, __vmread(GUEST_CR0));
+        printk(XENLOG_G_WARNING "PVH: RIP:%lx RSP:%lx EFLAGS:%lx CR3:%lx\n",
+               regs->rip, regs->rsp, regs->rflags, __vmread(GUEST_CR3));
+        domain_crash(v->domain);
+    }
 }
 
 /* 
-- 
1.7.2.3

