[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [V10 PATCH 23/23] PVH xen: introduce vmexit handler for PVH
On Wed, Jul 24, 2013 at 2:59 AM, Mukesh Rathor <mukesh.rathor@xxxxxxxxxx> wrote:
> This patch contains vmx exit handler for PVH guest. Note it contains
> a macro dbgp1 to print vmexit reasons and a lot of other data
> to go with it. It can be enabled by setting pvhdbg to 1. This can be
> very useful for debugging during the first few months of testing, after
> which it can be removed at the maintainer's discretion.
>
> Changes in V2:
> - Move non VMX generic code to arch/x86/hvm/pvh.c
> - Remove get_gpr_ptr() and use existing decode_register() instead.
> - Defer call to pvh vmx exit handler until interrupts are enabled. So the
> caller vmx_pvh_vmexit_handler() handles the NMI/EXT-INT/TRIPLE_FAULT now.
> - Fix the CPUID (wrongly) clearing bit 24. No need to do this now, set
> the correct feature bits in CR4 during vmcs creation.
> - Fix few hard tabs.
>
> Changes in V3:
> - Lot of cleanup and rework in PVH vm exit handler.
> - add parameter to emulate_forced_invalid_op().
>
> Changes in V5:
> - Move pvh.c and emulate_forced_invalid_op related changes to another patch.
> - Formatting.
> - Remove vmx_pvh_read_descriptor().
> - Use SS DPL instead of CS.RPL for CPL.
> - Remove pvh_user_cpuid() and call pv_cpuid for user mode also.
>
> Changes in V6:
> - Replace domain_crash_synchronous() with domain_crash().
>
> Changes in V7:
> - Don't read all selectors on every vmexit. Do that only for the
> IO instruction vmexit.
> - Add couple checks and set guest_cr[4] in access_cr4().
> - Add period after all comments in case that's an issue.
> - Move making pv_cpuid and emulate_privileged_op public here.
>
> Changes in V8:
> - Mainly, don't read selectors on vmexit. The macros now come to VMCS
> to read selectors on demand.
Overall I have the same comment here as I had for the VMCS patch: the
code looks 98% identical. Substantial differences seem to be:
- emulation of privileged ops
- cpuid
- cr4 handling
It seems like it would be much better to share the codepath and just
put "is_pvh_domain()" in the places where it needs to be different.
More comments / questions inline.
>
> Signed-off-by: Mukesh Rathor <mukesh.rathor@xxxxxxxxxx>
> ---
> xen/arch/x86/hvm/vmx/pvh.c | 451
> +++++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 450 insertions(+), 1 deletions(-)
>
> diff --git a/xen/arch/x86/hvm/vmx/pvh.c b/xen/arch/x86/hvm/vmx/pvh.c
> index 8e61d23..ba11967 100644
> --- a/xen/arch/x86/hvm/vmx/pvh.c
> +++ b/xen/arch/x86/hvm/vmx/pvh.c
> @@ -20,9 +20,458 @@
> #include <asm/hvm/nestedhvm.h>
> #include <asm/xstate.h>
>
> -/* Implemented in the next patch */
> +#ifndef NDEBUG
> +static int pvhdbg = 0;
> +#define dbgp1(...) do { (pvhdbg == 1) ? printk(__VA_ARGS__) : 0; } while ( 0
> )
> +#else
> +#define dbgp1(...) ((void)0)
> +#endif
> +
> +/* Returns : 0 == msr read successfully. */
> +static int vmxit_msr_read(struct cpu_user_regs *regs)
> +{
> + u64 msr_content = 0;
> +
> + switch ( regs->ecx )
> + {
> + case MSR_IA32_MISC_ENABLE:
> + rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
> + msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
> + MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
> + break;
> +
> + default:
> + /* PVH fixme: see hvm_msr_read_intercept(). */
> + rdmsrl(regs->ecx, msr_content);
> + break;
So at the moment you basically pass through all MSR reads (adding
BTS_UNAVAIL and PEBS_UNAVAIL to MISC_ENABLE), but send MSR writes
through the hvm code?
That sounds like it's asking for trouble...
> + }
> + regs->eax = (uint32_t)msr_content;
> + regs->edx = (uint32_t)(msr_content >> 32);
> + vmx_update_guest_eip();
> +
> + dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n", regs->ecx,
> regs->eax,
> + regs->edx, regs->rip, regs->rsp);
> +
> + return 0;
> +}
> +
> +/* Returns : 0 == msr written successfully. */
> +static int vmxit_msr_write(struct cpu_user_regs *regs)
> +{
> + uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
> +
> + dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n", regs->ecx,
> + regs->eax, regs->edx);
> +
> + if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY )
> + {
> + vmx_update_guest_eip();
> + return 0;
> + }
> + return 1;
> +}
> +
> +static int vmxit_debug(struct cpu_user_regs *regs)
> +{
> + struct vcpu *vp = current;
> + unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +
> + write_debugreg(6, exit_qualification | 0xffff0ff0);
> +
> + /* gdbsx or another debugger. Never pause dom0. */
> + if ( vp->domain->domain_id != 0 && vp->domain->debugger_attached )
> + domain_pause_for_debugger();
> + else
> + hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
Hmm, strangely enough, the HVM handler for this doesn't seem to
deliver this exception -- or if it does, I can't quite figure out
where. What you have here seems like the correct thing to do, but I
would be interested in knowing the reason for the HVM behavior.
> +
> + return 0;
> +}
> +
> +/* Returns: rc == 0: handled the MTF vmexit. */
> +static int vmxit_mtf(struct cpu_user_regs *regs)
> +{
> + struct vcpu *vp = current;
> + int rc = -EINVAL, ss = vp->arch.hvm_vcpu.single_step;
> +
> + vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
> + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vp->arch.hvm_vmx.exec_control);
> + vp->arch.hvm_vcpu.single_step = 0;
> +
> + if ( vp->domain->debugger_attached && ss )
> + {
> + domain_pause_for_debugger();
> + rc = 0;
> + }
> + return rc;
> +}
> +
> +static int vmxit_int3(struct cpu_user_regs *regs)
> +{
> + int ilen = vmx_get_instruction_length();
> + struct vcpu *vp = current;
> + struct hvm_trap trap_info = {
> + .vector = TRAP_int3,
> + .type = X86_EVENTTYPE_SW_EXCEPTION,
> + .error_code = HVM_DELIVER_NO_ERROR_CODE,
> + .insn_len = ilen
> + };
> +
> + /* gdbsx or another debugger. Never pause dom0. */
> + if ( vp->domain->domain_id != 0 && vp->domain->debugger_attached )
> + {
> + regs->eip += ilen;
> + dbgp1("[%d]PVH: domain pause for debugger\n", smp_processor_id());
> + current->arch.gdbsx_vcpu_event = TRAP_int3;
> + domain_pause_for_debugger();
> + return 0;
> + }
> + hvm_inject_trap(&trap_info);
> +
> + return 0;
> +}
> +
> +/* Just like HVM, PVH should be using "cpuid" from the kernel mode. */
> +static int vmxit_invalid_op(struct cpu_user_regs *regs)
> +{
> + if ( guest_kernel_mode(current, regs) ||
> !emulate_forced_invalid_op(regs) )
> + hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
> +
> + return 0;
> +}
> +
> +/* Returns: rc == 0: handled the exception. */
> +static int vmxit_exception(struct cpu_user_regs *regs)
> +{
> + int vector = (__vmread(VM_EXIT_INTR_INFO)) & INTR_INFO_VECTOR_MASK;
> + int rc = -ENOSYS;
The vmx code here has some handler for faults that happen during a
guest IRET -- is that an issue for PVH?
> +
> + dbgp1(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector,
> + __vmread(GUEST_CS_SELECTOR), regs->eip);
> +
> + switch ( vector )
> + {
> + case TRAP_debug:
> + rc = vmxit_debug(regs);
> + break;
> +
> + case TRAP_int3:
> + rc = vmxit_int3(regs);
> + break;
> +
> + case TRAP_invalid_op:
> + rc = vmxit_invalid_op(regs);
> + break;
> +
> + case TRAP_no_device:
> + hvm_funcs.fpu_dirty_intercept();
> + rc = 0;
> + break;
> +
> + default:
> + printk(XENLOG_G_WARNING
> + "PVH: Unhandled trap:%d. IP:%lx\n", vector, regs->eip);
> + }
> + return rc;
> +}
> +
> +static int vmxit_vmcall(struct cpu_user_regs *regs)
> +{
> + if ( hvm_do_hypercall(regs) != HVM_HCALL_preempted )
> + vmx_update_guest_eip();
> + return 0;
> +}
> +
> +/* Returns: rc == 0: success. */
> +static int access_cr0(struct cpu_user_regs *regs, uint acc_typ, uint64_t
> *regp)
> +{
> + struct vcpu *vp = current;
> +
> + if ( acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
> + {
> + unsigned long new_cr0 = *regp;
> + unsigned long old_cr0 = __vmread(GUEST_CR0);
> +
> + dbgp1("PVH:writing to CR0. RIP:%lx val:0x%lx\n", regs->rip, *regp);
> + if ( (u32)new_cr0 != new_cr0 )
> + {
> + printk(XENLOG_G_WARNING
> + "Guest setting upper 32 bits in CR0: %lx", new_cr0);
> + return -EPERM;
> + }
> +
> + new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
> + /* ET is reserved and should always be 1. */
> + new_cr0 |= X86_CR0_ET;
> +
> + /* A pvh is not expected to change to real mode. */
> + if ( (new_cr0 & (X86_CR0_PE | X86_CR0_PG)) !=
> + (X86_CR0_PG | X86_CR0_PE) )
> + {
> + printk(XENLOG_G_WARNING
> + "PVH attempting to turn off PE/PG. CR0:%lx\n", new_cr0);
> + return -EPERM;
> + }
> + /* TS going from 1 to 0 */
> + if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 & X86_CR0_TS) == 0) )
> + vmx_fpu_enter(vp);
> +
> + vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = new_cr0;
> + __vmwrite(GUEST_CR0, new_cr0);
> + __vmwrite(CR0_READ_SHADOW, new_cr0);
> + }
> + else
> + *regp = __vmread(GUEST_CR0);
The HVM code here just uses hvm_vcpu.guest_cr[] -- is there any reason
not to do the same here? And in any case, shouldn't it be
CR0_READ_SHADOW?
> +
> + return 0;
> +}
> +
> +/* Returns: rc == 0: success. */
> +static int access_cr4(struct cpu_user_regs *regs, uint acc_typ, uint64_t
> *regp)
> +{
> + if ( acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
> + {
> + struct vcpu *vp = current;
> + u64 old_val = __vmread(GUEST_CR4);
> + u64 new = *regp;
> +
> + if ( new & HVM_CR4_GUEST_RESERVED_BITS(vp) )
> + {
> + printk(XENLOG_G_WARNING
> + "PVH guest attempts to set reserved bit in CR4: %lx",
> new);
> + hvm_inject_hw_exception(TRAP_gp_fault, 0);
> + return 0;
> + }
> +
> + if ( !(new & X86_CR4_PAE) && hvm_long_mode_enabled(vp) )
> + {
> + printk(XENLOG_G_WARNING "Guest cleared CR4.PAE while "
> + "EFER.LMA is set");
> + hvm_inject_hw_exception(TRAP_gp_fault, 0);
> + return 0;
> + }
> +
> + vp->arch.hvm_vcpu.guest_cr[4] = new;
> +
> + if ( (old_val ^ new) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
> + vpid_sync_all();
Is it actually allowed for a PVH guest to change modes like this?
I realize that at the moment you're only supporting HAP, but that may
not always be true; would it make sense to call
paging_update_paging_modes() here instead?
> +
> + __vmwrite(CR4_READ_SHADOW, new);
> +
> + new &= ~X86_CR4_PAE; /* PVH always runs with hap enabled. */
> + new |= X86_CR4_VMXE | X86_CR4_MCE;
> + __vmwrite(GUEST_CR4, new);
Should you be updating hvm_vcpu.hw_cr[4] to this value?
> + }
> + else
> + *regp = __vmread(CR4_READ_SHADOW);
Same as above re guest_cr[]
> +
> + return 0;
> +}
> +
> +/* Returns: rc == 0: success, else -errno. */
> +static int vmxit_cr_access(struct cpu_user_regs *regs)
> +{
> + unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> + uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
> + int cr, rc = -EINVAL;
> +
> + switch ( acc_typ )
> + {
> + case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
> + case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
> + {
> + uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
> + uint64_t *regp = decode_register(gpr, regs, 0);
> + cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
> +
> + if ( regp == NULL )
> + break;
> +
> + switch ( cr )
> + {
> + case 0:
> + rc = access_cr0(regs, acc_typ, regp);
> + break;
> +
> + case 3:
> + printk(XENLOG_G_ERR "PVH: unexpected cr3 vmexit. rip:%lx\n",
> + regs->rip);
> + domain_crash(current->domain);
> + break;
> +
> + case 4:
> + rc = access_cr4(regs, acc_typ, regp);
> + break;
> + }
> + if ( rc == 0 )
> + vmx_update_guest_eip();
> + break;
> + }
> +
> + case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
> + {
> + struct vcpu *vp = current;
> + unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] & ~X86_CR0_TS;
> + vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = cr0;
> +
> + vmx_fpu_enter(vp);
> + __vmwrite(GUEST_CR0, cr0);
> + __vmwrite(CR0_READ_SHADOW, cr0);
> + vmx_update_guest_eip();
> + rc = 0;
> + }
> + }
> + return rc;
> +}
> +
> +/*
> + * Note: A PVH guest sets IOPL natively by setting bits in the eflags, and
> not
> + * via hypercalls used by a PV.
> + */
> +static int vmxit_io_instr(struct cpu_user_regs *regs)
> +{
> + struct segment_register seg;
> + int requested = (regs->rflags & X86_EFLAGS_IOPL) >> 12;
> + int curr_lvl = (regs->rflags & X86_EFLAGS_VM) ? 3 : 0;
> +
> + if ( curr_lvl == 0 )
> + {
> + hvm_get_segment_register(current, x86_seg_ss, &seg);
> + curr_lvl = seg.attr.fields.dpl;
> + }
> + if ( requested >= curr_lvl && emulate_privileged_op(regs) )
> + return 0;
> +
> + hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
> + return 0;
> +}
> +
> +static int pvh_ept_handle_violation(unsigned long qualification,
> + paddr_t gpa, struct cpu_user_regs *regs)
> +{
> + unsigned long gla, gfn = gpa >> PAGE_SHIFT;
> + p2m_type_t p2mt;
> + mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn, &p2mt);
> +
> + printk(XENLOG_G_ERR "EPT violation %#lx (%c%c%c/%c%c%c), "
> + "gpa %#"PRIpaddr", mfn %#lx, type %i. IP:0x%lx RSP:0x%lx\n",
> + qualification,
> + (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
> + (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
> + (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
> + (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
> + (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
> + (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
> + gpa, mfn_x(mfn), p2mt, regs->rip, regs->rsp);
> +
> + ept_walk_table(current->domain, gfn);
> +
> + if ( qualification & EPT_GLA_VALID )
> + {
> + gla = __vmread(GUEST_LINEAR_ADDRESS);
> + printk(XENLOG_G_ERR " --- GLA %#lx\n", gla);
> + }
> + hvm_inject_hw_exception(TRAP_gp_fault, 0);
> + return 0;
> +}
> +
> +/*
> + * Main vm exit handler for PVH . Called from vmx_vmexit_handler().
> + * Note: vmx_asm_vmexit_handler updates rip/rsp/eflags in regs{} struct.
> + */
> void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs)
> {
> + unsigned long exit_qualification;
> + unsigned int exit_reason = __vmread(VM_EXIT_REASON);
> + int rc=0, ccpu = smp_processor_id();
> + struct vcpu *v = current;
> +
> + dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx
> CR0:%lx\n",
> + ccpu, exit_reason, regs->rip, regs->rsp, regs->rflags,
> + __vmread(GUEST_CR0));
> +
> + switch ( (uint16_t)exit_reason )
> + {
> + /* NMI and machine_check are handled by the caller, we handle rest here
> */
> + case EXIT_REASON_EXCEPTION_NMI: /* 0 */
> + rc = vmxit_exception(regs);
> + break;
> +
> + case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */
> + break; /* handled in vmx_vmexit_handler() */
This seems to be a weird way to do things, but I see this is what they
do in vmx_vmexit_handler() as well, so I guess it makes sense to
follow suit.
What about EXIT_REASON_TRIPLE_FAULT?
[End of in-line comments]
> +
> + case EXIT_REASON_PENDING_VIRT_INTR: /* 7 */
> + /* Disable the interrupt window. */
> + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
> + break;
> +
> + case EXIT_REASON_CPUID: /* 10 */
> + pv_cpuid(regs);
> + vmx_update_guest_eip();
> + break;
> +
> + case EXIT_REASON_HLT: /* 12 */
> + vmx_update_guest_eip();
> + hvm_hlt(regs->eflags);
> + break;
> +
> + case EXIT_REASON_VMCALL: /* 18 */
> + rc = vmxit_vmcall(regs);
> + break;
> +
> + case EXIT_REASON_CR_ACCESS: /* 28 */
> + rc = vmxit_cr_access(regs);
> + break;
> +
> + case EXIT_REASON_DR_ACCESS: /* 29 */
> + exit_qualification = __vmread(EXIT_QUALIFICATION);
> + vmx_dr_access(exit_qualification, regs);
> + break;
> +
> + case EXIT_REASON_IO_INSTRUCTION: /* 30 */
> + vmxit_io_instr(regs);
> + break;
> +
> + case EXIT_REASON_MSR_READ: /* 31 */
> + rc = vmxit_msr_read(regs);
> + break;
> +
> + case EXIT_REASON_MSR_WRITE: /* 32 */
> + rc = vmxit_msr_write(regs);
> + break;
> +
> + case EXIT_REASON_MONITOR_TRAP_FLAG: /* 37 */
> + rc = vmxit_mtf(regs);
> + break;
> +
> + case EXIT_REASON_MCE_DURING_VMENTRY: /* 41 */
> + break; /* handled in vmx_vmexit_handler() */
> +
> + case EXIT_REASON_EPT_VIOLATION: /* 48 */
> + {
> + paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
> + exit_qualification = __vmread(EXIT_QUALIFICATION);
> + rc = pvh_ept_handle_violation(exit_qualification, gpa, regs);
> + break;
> + }
> +
> + default:
> + rc = 1;
> + printk(XENLOG_G_ERR
> + "PVH: Unexpected exit reason:%#x\n", exit_reason);
> + }
> +
> + if ( rc )
> + {
> + exit_qualification = __vmread(EXIT_QUALIFICATION);
> + printk(XENLOG_G_WARNING
> + "PVH: [%d] exit_reas:%d %#x qual:%ld 0x%lx cr0:0x%016lx\n",
> + ccpu, exit_reason, exit_reason, exit_qualification,
> + exit_qualification, __vmread(GUEST_CR0));
> + printk(XENLOG_G_WARNING "PVH: RIP:%lx RSP:%lx EFLAGS:%lx CR3:%lx\n",
> + regs->rip, regs->rsp, regs->rflags, __vmread(GUEST_CR3));
> + domain_crash(v->domain);
> + }
> }
>
> /*
> --
> 1.7.2.3
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxx
> http://lists.xen.org/xen-devel
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel
Lists.xenproject.org is hosted with RackSpace, monitoring our technical infrastructure. [footer truncated in archive]