[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c



On Fri, Mar 15, 2013 at 05:41:45PM -0700, Mukesh Rathor wrote:
> The heart of this patch is vmx exit handler for PVH guest. It is nicely
>  isolated in a separate module as preferred by most of us. A call to it
>  is added to vmx_pvh_vmexit_handler().
> 
> Changes in V2:
>   - Move non VMX generic code to arch/x86/hvm/pvh.c
>   - Remove get_gpr_ptr() and use existing decode_register() instead.
>   - Defer call to pvh vmx exit handler until interrupts are enabled. So the
>     caller vmx_pvh_vmexit_handler() handles the NMI/EXT-INT/TRIPLE_FAULT now.
>   - Fix the CPUID (wrongly) clearing bit 24. No need to do this now, set
>     the correct feature bits in CR4 during vmcs creation.
>   - Fix few hard tabs.
> 
> Signed-off-by: Mukesh Rathor <mukesh.rathor@xxxxxxxxxx>
> ---
>  xen/arch/x86/hvm/Makefile         |    3 +-
>  xen/arch/x86/hvm/pvh.c            |  220 ++++++++++++++
>  xen/arch/x86/hvm/vmx/Makefile     |    1 +
>  xen/arch/x86/hvm/vmx/vmx.c        |    7 +
>  xen/arch/x86/hvm/vmx/vmx_pvh.c    |  587 
> +++++++++++++++++++++++++++++++++++++
>  xen/include/asm-x86/hvm/vmx/vmx.h |    7 +-
>  xen/include/asm-x86/pvh.h         |    6 +
>  7 files changed, 829 insertions(+), 2 deletions(-)
>  create mode 100644 xen/arch/x86/hvm/pvh.c
>  create mode 100644 xen/arch/x86/hvm/vmx/vmx_pvh.c
>  create mode 100644 xen/include/asm-x86/pvh.h
> 
> diff --git a/xen/arch/x86/hvm/Makefile b/xen/arch/x86/hvm/Makefile
> index eea5555..65ff9f3 100644
> --- a/xen/arch/x86/hvm/Makefile
> +++ b/xen/arch/x86/hvm/Makefile
> @@ -22,4 +22,5 @@ obj-y += vlapic.o
>  obj-y += vmsi.o
>  obj-y += vpic.o
>  obj-y += vpt.o
> -obj-y += vpmu.o
> \ No newline at end of file
> +obj-y += vpmu.o
> +obj-y += pvh.o
> diff --git a/xen/arch/x86/hvm/pvh.c b/xen/arch/x86/hvm/pvh.c
> new file mode 100644
> index 0000000..c12c4b7
> --- /dev/null
> +++ b/xen/arch/x86/hvm/pvh.c
> @@ -0,0 +1,220 @@
> +/*
> + * Copyright (C) 2013, Mukesh Rathor, Oracle Corp.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License v2 as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this program; if not, write to the
> + * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
> + * Boston, MA 021110-1307, USA.


Don't use the address anymore. They might change their location
(if they haven't already - it's a pretty pricey location).

> + */
> +
> +#include <xen/hypercall.h>
> +#include <xen/guest_access.h>
> +#include <asm/p2m.h>
> +#include <asm/traps.h>
> +#include <asm/hvm/vmx/vmx.h>
> +#include <public/sched.h>
> +
> +static int pvh_grant_table_op(
> +              unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int 
> count)
> +{
> +    switch (cmd)
> +    {
> +        case GNTTABOP_map_grant_ref:
> +        case GNTTABOP_unmap_grant_ref:
> +        case GNTTABOP_setup_table:
> +        case GNTTABOP_copy:
> +        case GNTTABOP_query_size:
> +        case GNTTABOP_set_version:
> +            return do_grant_table_op(cmd, uop, count);
> +    }
> +    return -ENOSYS;
> +}
> +
> +static long pvh_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
> +{
> +    long rc = -ENOSYS;
> +
> +    switch ( cmd )
> +    {
> +        case VCPUOP_register_runstate_memory_area:
> +        case VCPUOP_get_runstate_info:
> +        case VCPUOP_set_periodic_timer:
> +        case VCPUOP_stop_periodic_timer:
> +        case VCPUOP_set_singleshot_timer:
> +        case VCPUOP_stop_singleshot_timer:
> +        case VCPUOP_is_up:
> +        case VCPUOP_up:
> +        case VCPUOP_initialise:
> +            rc = do_vcpu_op(cmd, vcpuid, arg);
> +
> +            /* pvh boot vcpu setting context for bringing up smp vcpu */
> +            if (cmd == VCPUOP_initialise)
> +                vmx_vmcs_enter(current);
> +    }
> +    return rc;
> +}
> +
> +static long pvh_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
> +{
> +    switch ( cmd )
> +    {
> +        case PHYSDEVOP_map_pirq:
> +        case PHYSDEVOP_unmap_pirq:
> +        case PHYSDEVOP_eoi:
> +        case PHYSDEVOP_irq_status_query:
> +        case PHYSDEVOP_get_free_pirq:
> +            return do_physdev_op(cmd, arg);
> +
> +        default:
> +            if ( IS_PRIV(current->domain) )
> +                return do_physdev_op(cmd, arg);
> +    }
> +    return -ENOSYS;
> +}
> +
> +static long do_pvh_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
> +{
> +    long rc = -EINVAL;
> +    struct xen_hvm_param harg;
> +    struct domain *d;
> +
> +    if ( copy_from_guest(&harg, arg, 1) )
> +        return -EFAULT;
> +
> +    rc = rcu_lock_target_domain_by_id(harg.domid, &d);
> +    if ( rc != 0 )
> +        return rc;
> +
> +    if (is_hvm_domain(d)) {

Formatting is odd compared to the other 'if ( something )' checks.

> +        /* pvh dom0 is building an hvm guest */
> +        rcu_unlock_domain(d);
> +        return do_hvm_op(op, arg);  
> +    }
> +
> +    rc = -ENOSYS;
> +    if (op == HVMOP_set_param) {
> +        if (harg.index == HVM_PARAM_CALLBACK_IRQ) {
> +            struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
> +            uint64_t via = harg.value;
> +            uint8_t via_type = (uint8_t)(via >> 56) + 1;
> +
> +            if (via_type == HVMIRQ_callback_vector) {
> +                hvm_irq->callback_via_type = HVMIRQ_callback_vector;
> +                hvm_irq->callback_via.vector = (uint8_t)via;
> +                rc = 0;
> +            }

Perhaps it would make sense to also print out the -ENOSYS
ones for development purposes?

Say:

gdprintk(XENLOG_DEBUG, "d%d: %s setting HVMOP_set_param[%d] - ENOSYS\n",
d->domain_id, __func__, (int)harg.index);

And for the ops case:

gdprintk(XENLOG_DEBUG, "d%d: %s - ENOSYS\n", d->domain_id, __func__)
? 
        
> +        }
> +    }
> +    rcu_unlock_domain(d);
> +    return rc;
> +}
> +
> +typedef unsigned long pvh_hypercall_t(
> +    unsigned long, unsigned long, unsigned long, unsigned long, unsigned 
> long,
> +    unsigned long);

Is there no way to re-use something standard? I am not sure what is 'PVH' specific
about it? It looks like a garden-variety normal hypercall type.

Perhaps just copy-n-paste the hvm_hypercall_t for right now? And then later they
can be exported in a common header file?

> +
> +int hcall_a[NR_hypercalls];
> +
> +static pvh_hypercall_t *pvh_hypercall64_table[NR_hypercalls] = {
> +    [__HYPERVISOR_platform_op]     = (pvh_hypercall_t *)do_platform_op,
> +    [__HYPERVISOR_memory_op]       = (pvh_hypercall_t *)do_memory_op,

Could you include a comment saying why timers are not good?

> +    /* [__HYPERVISOR_set_timer_op]     = (pvh_hypercall_t *)do_set_timer_op, 
> */
> +    [__HYPERVISOR_xen_version]     = (pvh_hypercall_t *)do_xen_version,
> +    [__HYPERVISOR_console_io]      = (pvh_hypercall_t *)do_console_io,
> +    [__HYPERVISOR_grant_table_op]  = (pvh_hypercall_t *)pvh_grant_table_op,
> +    [__HYPERVISOR_vcpu_op]         = (pvh_hypercall_t *)pvh_vcpu_op,
> +    [__HYPERVISOR_mmuext_op]       = (pvh_hypercall_t *)do_mmuext_op,
> +    [__HYPERVISOR_xsm_op]          = (pvh_hypercall_t *)do_xsm_op,
> +    [__HYPERVISOR_sched_op]        = (pvh_hypercall_t *)do_sched_op,
> +    [__HYPERVISOR_event_channel_op]= (pvh_hypercall_t *)do_event_channel_op,
> +    [__HYPERVISOR_physdev_op]      = (pvh_hypercall_t *)pvh_physdev_op,
> +    [__HYPERVISOR_hvm_op]          = (pvh_hypercall_t *)do_pvh_hvm_op,

Most of these follow the 'do_X'. Then for the pvh ones you have:
'pvh_X', with one exception: do_pvh_hvm_op ?

Should it be just 'pvh_hvm_op' instead?


> +    [__HYPERVISOR_sysctl]          = (pvh_hypercall_t *)do_sysctl,
> +    [__HYPERVISOR_domctl]          = (pvh_hypercall_t *)do_domctl
> +};
> +
> +/* fixme: Do we need to worry about this and slow things down in this path? 
> */
> +static int pvh_long_mode_enabled(void)
> +{
> +    /* A 64bit linux guest should always run in this mode with CS.L selecting
> +     * either 64bit mode or 32bit compat mode */

I think they are called 'native' or '64-bit mode' per the AMD spec?

> +    return 1;

Well, it always seems to return 1? So this ends up being mostly a nop when
the compiler is done?

Or did the earlier version have the correct checks to make sure that we
are in Long Mode?


> +}
> +
> +/* Check if hypercall is valid 
> + * Returns: 0 if hcall is not valid with eax set to the errno to ret to guest


Huh? Return 0 on invalid case? That looks odd. Why not anything but zero?

Or perhaps just make it return a bool? That would be easier
to grok.

> + */
> +static int hcall_valid(struct cpu_user_regs *regs)
> +{
> +    struct segment_register sreg;
> +
> +    if (!pvh_long_mode_enabled()) 
> +    {
> +        gdprintk(XENLOG_ERR, "PVH Error: Expected long mode set\n");

Shouldn't this set:
        regs->eax = -ENOSYS?

> +        return 1;
> +    }
> +    hvm_get_segment_register(current, x86_seg_ss, &sreg);
> +    if ( unlikely(sreg.attr.fields.dpl == 3) ) 
> +    {
> +        regs->eax = -EPERM;
> +        return 0;
> +    }
> +
> +    /* domU's are not allowed following hcalls */

And that is b/c we can't handle it yet? In which case you should
prefix it with a TODO or XXX to remember this.

> +    if ( !IS_PRIV(current->domain) &&
> +         (regs->eax == __HYPERVISOR_xsm_op ||
> +          regs->eax == __HYPERVISOR_platform_op ||
> +          regs->eax == __HYPERVISOR_domctl) ) {     /* for privcmd mmap */
> +
> +        regs->eax = -EPERM;
> +        return 0;
> +    }
> +    return 1;
> +}
> +
> +int pvh_do_hypercall(struct cpu_user_regs *regs)
> +{
> +    uint32_t hnum = regs->eax;
> +
> +    if ( hnum >= NR_hypercalls || pvh_hypercall64_table[hnum] == NULL ) 
> +    {
> +        gdprintk(XENLOG_WARNING, "PVH: Unimplemented HCALL:%d. Returning " 
> +                 "-ENOSYS. domid:%d IP:%lx SP:%lx\n", 
> +                 hnum, current->domain->domain_id, regs->rip, regs->rsp);
> +        regs->eax = -ENOSYS;
> +        vmx_update_guest_eip();
> +        return HVM_HCALL_completed;
> +    }
> +

> +    if ( regs->eax == __HYPERVISOR_sched_op && regs->rdi == SCHEDOP_shutdown 
> ) 

Oh? We can't shut down the guest? How come?
> +    {
> +        regs->eax = -ENOSYS;
> +        vmx_update_guest_eip();
> +
> +        /* PVH fixme: show_guest_stack() from domain crash causes PF */
> +        domain_crash_synchronous();
> +        return HVM_HCALL_completed;
> +    }
> +
> +    if ( !hcall_valid(regs) )
> +        return HVM_HCALL_completed;
> +
> +    current->arch.hvm_vcpu.hcall_preempted = 0;
> +    regs->rax = pvh_hypercall64_table[hnum](regs->rdi, regs->rsi, regs->rdx,
> +                                            regs->r10, regs->r8, regs->r9);
> +
> +    if ( current->arch.hvm_vcpu.hcall_preempted )
> +        return HVM_HCALL_preempted;
> +         
> +    return HVM_HCALL_completed;
> +}
> +
> diff --git a/xen/arch/x86/hvm/vmx/Makefile b/xen/arch/x86/hvm/vmx/Makefile
> index 373b3d9..8b71dae 100644
> --- a/xen/arch/x86/hvm/vmx/Makefile
> +++ b/xen/arch/x86/hvm/vmx/Makefile
> @@ -5,3 +5,4 @@ obj-y += vmcs.o
>  obj-y += vmx.o
>  obj-y += vpmu_core2.o
>  obj-y += vvmx.o
> +obj-y += vmx_pvh.o
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index 194c87b..5503fc9 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -1529,6 +1529,8 @@ static struct hvm_function_table __read_mostly 
> vmx_function_table = {
>      .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled,
>      .process_isr          = vmx_process_isr,
>      .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
> +    .pvh_set_vcpu_info    = vmx_pvh_set_vcpu_info,
> +    .pvh_read_descriptor  = vmx_pvh_read_descriptor,
>  };
>  
>  struct hvm_function_table * __init start_vmx(void)
> @@ -2364,6 +2366,11 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>      if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
>          return vmx_failed_vmentry(exit_reason, regs);
>  
> +    if ( is_pvh_vcpu(v) ) {
> +        vmx_pvh_vmexit_handler(regs);
> +        return;
> +    }
> +
>      if ( v->arch.hvm_vmx.vmx_realmode )
>      {
>          /* Put RFLAGS back the way the guest wants it */
> diff --git a/xen/arch/x86/hvm/vmx/vmx_pvh.c b/xen/arch/x86/hvm/vmx/vmx_pvh.c
> new file mode 100644
> index 0000000..14ca0f6
> --- /dev/null
> +++ b/xen/arch/x86/hvm/vmx/vmx_pvh.c
> @@ -0,0 +1,587 @@
> +/*
> + * Copyright (C) 2013, Mukesh Rathor, Oracle Corp.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License v2 as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this program; if not, write to the
> + * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
> + * Boston, MA 021110-1307, USA.

Ditto. No address please.

> + */
> +
> +#include <xen/hypercall.h>
> +#include <xen/guest_access.h>
> +#include <asm/p2m.h>
> +#include <asm/traps.h>
> +#include <asm/hvm/vmx/vmx.h>
> +#include <public/sched.h>
> +#include <asm/pvh.h>
> +

This should probably be #ifdef DEBUG
> +volatile int pvhdbg=0;

And I think you can remove the 'volatile' part?


> +#define dbgp1(...) {(pvhdbg==1) ? printk(__VA_ARGS__):0;}
> +#define dbgp2(...) {(pvhdbg==2) ? printk(__VA_ARGS__):0;}

#endif
> +
> +
> +static void read_vmcs_selectors(struct cpu_user_regs *regs)
> +{
> +    regs->cs = __vmread(GUEST_CS_SELECTOR);
> +    regs->ss = __vmread(GUEST_SS_SELECTOR);
> +    regs->ds = __vmread(GUEST_DS_SELECTOR);
> +    regs->es = __vmread(GUEST_ES_SELECTOR);
> +    regs->gs = __vmread(GUEST_GS_SELECTOR);
> +    regs->fs = __vmread(GUEST_FS_SELECTOR);
> +}
> +
> +/* returns : 0 success */

What are the non-success criteria?

> +static int vmxit_msr_read(struct cpu_user_regs *regs)
> +{
> +    int rc=1;

X86EMUL_UNHANDLEABLE ?

> +
> +    u64 msr_content = 0;
> +    switch (regs->ecx)
> +    {
> +        case MSR_IA32_MISC_ENABLE:
> +        {
> +            rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
> +            msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
> +                           MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
> +            break;
> +        }
> +        default:
> +        {
> +            /* fixme: see hvm_msr_read_intercept() */
> +            rdmsrl(regs->ecx, msr_content);  
> +            break;
> +        }
> +    }
> +    regs->eax = (uint32_t)msr_content;
> +    regs->edx = (uint32_t)(msr_content >> 32);
> +    vmx_update_guest_eip();
> +    rc = 0;
> +
> +    dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n", regs->ecx, 
> regs->eax, 
> +          regs->edx, regs->rip, regs->rsp);
> +    return rc;

This function looks to return 0 (or X86EMUL_OKAY) regardless of the MSR?
Perhaps just make it return X86EMUL_OKAY without bothering to use 'rc'?

> +}
> +
> +/* returns : 0 success */
> +static int vmxit_msr_write(struct cpu_user_regs *regs)
> +{
> +    uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
> +    int rc=1;

X86EMUL_UNHANDLEABLE

> +
> +    dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n", regs->ecx, 
> +          regs->eax,regs->edx);
> +
> +    if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY ) {
> +        vmx_update_guest_eip();
> +        rc = 0;

X86EMUL_OKAY

> +    }
> +    return rc;
> +}
> +
> +/* Returns: rc == 0: handled the MTF vmexit */
> +static int vmxit_mtf(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *vp = current;
> +    int rc=1, ss=vp->arch.hvm_vcpu.single_step; 
> +
> +    vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
> +    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vp->arch.hvm_vmx.exec_control);
> +    vp->arch.hvm_vcpu.single_step = 0;
> +
> +    if ( vp->domain->debugger_attached && ss ) {
> +        domain_pause_for_debugger();
> +        rc = 0;
> +    }
> +    return rc;
> +}
> +
> +static int vmxit_int3(struct cpu_user_regs *regs)
> +{
> +    int ilen = vmx_get_instruction_length();
> +    struct vcpu *vp = current;
> +    struct hvm_trap trap_info = { 
> +                        .vector = TRAP_int3, 
> +                        .type = X86_EVENTTYPE_SW_EXCEPTION,
> +                        .error_code = HVM_DELIVER_NO_ERROR_CODE, 
> +                        .insn_len = ilen
> +    };
> +
> +    regs->eip += ilen;
> +
> +    /* gdbsx or another debugger. Never pause dom0 */
> +    if ( vp->domain->domain_id != 0 && guest_kernel_mode(vp, regs) )
> +    {
> +        dbgp1("[%d]PVH: domain pause for debugger\n", smp_processor_id());
> +        current->arch.gdbsx_vcpu_event = TRAP_int3;
> +        domain_pause_for_debugger();
> +        return 0;
> +    }
> +
> +    regs->eip -= ilen;
> +    hvm_inject_trap(&trap_info);
> +
> +    return 0;
> +}
> +
> +static int vmxit_invalid_op(struct cpu_user_regs *regs)
> +{
> +    ulong addr=0;
> +
> +    if ( guest_kernel_mode(current, regs) || 
> +         (addr = emulate_forced_invalid_op(regs)) == 0 )
> +    {
> +        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
> +        return 0;
> +    } 
> +
> +    if (addr != EXCRET_fault_fixed)
> +        hvm_inject_page_fault(0, addr);
> +
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: handled the exception/NMI */
> +static int vmxit_exception(struct cpu_user_regs *regs)
> +{
> +    unsigned int vector = (__vmread(VM_EXIT_INTR_INFO)) & 
> INTR_INFO_VECTOR_MASK;
> +    int rc=1; 
> +    struct vcpu *vp = current;
> +
> +    dbgp2(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector, 
> +          __vmread(GUEST_CS_SELECTOR), regs->eip);
> +
> +    if (vector == TRAP_debug) {
> +        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        write_debugreg(6, exit_qualification | 0xffff0ff0);
> +        rc = 0;
> +        
> +        /* gdbsx or another debugger */
> +        if ( vp->domain->domain_id != 0 &&    /* never pause dom0 */
> +             guest_kernel_mode(vp, regs) &&  vp->domain->debugger_attached )
> +        {
> +            domain_pause_for_debugger();
> +        } else {
> +            hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
> +        }
> +    } 
> +    if (vector == TRAP_int3) {
> +        rc = vmxit_int3(regs);
> +
> +    }  else if (vector == TRAP_invalid_op) {
> +        rc = vmxit_invalid_op(regs);
> +
> +    } else if (vector == TRAP_no_device) {
> +        hvm_funcs.fpu_dirty_intercept();  /* calls vmx_fpu_dirty_intercept */
> +        rc = 0;
> +
> +    } else if (vector == TRAP_gp_fault) {
> +        regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
> +        /* hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code); */

So how come we don't inject it in?

> +        rc = 1;

Huh? Not X86_EMUL_OK?

> +
> +    } else if (vector == TRAP_page_fault) {
> +        printk("PVH: Unexpected vector page_fault. IP:%lx\n", regs->eip);

printk(.. some prefix.

> +        rc = 1;
> +    }
> +    if (rc)
> +        printk("PVH: Unhandled trap vector:%d. IP:%lx\n", vector, regs->eip);
> +
> +    return rc;
> +}
> +
> +static int vmxit_invlpg(void)
> +{
> +    ulong vaddr = __vmread(EXIT_QUALIFICATION);
> +
> +    vmx_update_guest_eip();
> +    vpid_sync_vcpu_gva(current, vaddr);
> +    return 0;
> +}
> +
> +static int vmxit_vmcall(struct cpu_user_regs *regs)
> +{
> +    if ( pvh_do_hypercall(regs) != HVM_HCALL_preempted)
> +        vmx_update_guest_eip();
> +
> +    return 0;;
> +}
> +
> +/* Returns: rc == 0: success */
> +static int access_cr0(struct cpu_user_regs *regs, uint acc_typ, 
> +                      uint64_t *regp)
> +{
> +    struct vcpu *vp = current;
> +
> +    if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
> +    {
> +        unsigned long new_cr0 = *regp;
> +        unsigned long old_cr0 = __vmread(GUEST_CR0);
> +
> +        dbgp2("PVH:writing to CR0. RIP:%lx val:0x%lx\n", regs->rip, *regp);
> +        if ( (u32)new_cr0 != new_cr0 )
> +        {
> +            HVM_DBG_LOG(DBG_LEVEL_1, 
> +                        "Guest setting upper 32 bits in CR0: %lx", new_cr0);
> +            return 1;
> +        }
> +
> +        new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
> +        /* ET is reserved and should be always be 1. */
> +        new_cr0 |= X86_CR0_ET;
> +
> +        /* pvh cannot change to real mode */
> +        if ( (new_cr0 & (X86_CR0_PE|X86_CR0_PG)) != (X86_CR0_PG|X86_CR0_PE) 
> ) {
> +            printk("PVH attempting to turn off PE/PG. CR0:%lx\n", new_cr0);
> +            return 1;
> +        }
> +        /* TS going from 1 to 0 */
> +        if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 & X86_CR0_TS)==0) )
> +            vmx_fpu_enter(vp);

Does this really happen? I thought in the PV mode you would be using the 
hypercalls
for the fpu swap? Should it be print out an error saying something to the 
effect:

        "PVH guest is using cr0 instead of the paravirt lazy FPU switch!" and
        include the EIP?

> +
> +        vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = new_cr0;
> +        __vmwrite(GUEST_CR0, new_cr0);
> +        __vmwrite(CR0_READ_SHADOW, new_cr0);
> +    } else {
> +        *regp = __vmread(GUEST_CR0);
> +    } 
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: success */
> +static int access_cr4(struct cpu_user_regs *regs, uint acc_typ, 
> +                      uint64_t *regp)
> +{
> +    if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
> +    {
> +        u64 old_cr4 = __vmread(GUEST_CR4);
> +
> +        if ( (old_cr4 ^ (*regp)) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) 
> )
> +            vpid_sync_all();
> +
> +        /* pvh_verify_cr4_wr(*regp)); */

Needed anymore?

> +        __vmwrite(GUEST_CR4, *regp);
> +    } else {
> +        *regp = __vmread(GUEST_CR4);
> +    } 
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: success */
> +static int vmxit_cr_access(struct cpu_user_regs *regs)
> +{
> +    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +    uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
> +    int cr, rc = 1;
> +
> +    switch ( acc_typ )
> +    {
> +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
> +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
> +        {
> +            uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
> +            uint64_t *regp = decode_register(gpr, regs, 0);
> +            cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
> +
> +            if (regp == NULL)
> +                break;
> +
> +            /* pl don't embed switch statements */
> +            if (cr == 0)
> +                rc = access_cr0(regs, acc_typ, regp);
> +            else if (cr == 3) {
> +                printk("PVH: d%d: unexpected cr3 access vmexit. rip:%lx\n", 
> +                       current->domain->domain_id, regs->rip);
> +                domain_crash_synchronous();

Uh? Why wouldn't we want to handle it?

> +            } else if (cr == 4) 
> +                rc = access_cr4(regs, acc_typ, regp);
> +
> +            if (rc == 0)
> +                vmx_update_guest_eip();
> +            break;
> +        }
> +        case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
> +        {
> +            struct vcpu *vp = current;
> +            unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] & ~X86_CR0_TS;
> +            vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = cr0;
> +            vmx_fpu_enter(vp);
> +            __vmwrite(GUEST_CR0, cr0);
> +            __vmwrite(CR0_READ_SHADOW, cr0);
> +            vmx_update_guest_eip();
> +            rc = 0;
> +        }


> +    }
> +    return rc;
> +}
> +
> +/* NOTE: a PVH sets IOPL natively by setting bits in the eflags and not by
> + * hypercalls used by a PV */


Ahh, so there are now a number of PV hypercall families that should not be used:
 - PHYSDEVOP_set_iopl (which I think in your earlier patch you did not check
   for?)
 - HYPERVISOR_update_va_mapping (for all the MMU stuff)
 - HYPERVISOR_update_descriptor (segments and such)
 - HYPERVISOR_fpu_taskswitch (you are doing it in the above function)
 - HYPERVISOR_set_trap_table (again, LDT and GDT are now done via HVM)
 - HYPERVISOR_set_segment_base
 - HYPERVISOR_set_gdt
 - HYPERVISOR_tmem
 .. and host of other.

This should be documented somewhere in docs?
Perhaps in docs/misc/pvh.txt and just outline which ones are not to be used
anymore?


> +static int vmxit_io_instr(struct cpu_user_regs *regs)
> +{
> +    int curr_lvl;
> +    int requested = (regs->rflags >> 12) & 3;
> +
> +    read_vmcs_selectors(regs);
> +    curr_lvl = regs->cs & 3;
> +
> +    if (requested >= curr_lvl && emulate_privileged_op(regs)) 
> +        return 0;
> +
> +    hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
> +    return 0;
> +}
> +
> +static int pvh_ept_handle_violation(unsigned long qualification, 
> +                                    paddr_t gpa, struct cpu_user_regs *regs)
> +{
> +    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
> +    p2m_type_t p2mt;
> +    mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn, &p2mt);
> +
> +    gdprintk(XENLOG_ERR, "Dom:%d EPT violation %#lx (%c%c%c/%c%c%c), "
> +             "gpa %#"PRIpaddr", mfn %#lx, type %i. IP:0x%lx RSP:0x%lx\n",
> +             current->domain->domain_id, qualification, 
> +             (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
> +             (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
> +             (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
> +             (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
> +             (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
> +             (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
> +             gpa, mfn_x(mfn), p2mt, regs->rip, regs->rsp);
> +             
> +    ept_walk_table(current->domain, gfn);
> +
> +    if ( qualification & EPT_GLA_VALID )
> +    {
> +        gla = __vmread(GUEST_LINEAR_ADDRESS);
> +        gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);

AHA! There are gdprintk calls in your code! Could you please replace most (all)
of the printks you have with either gdprintk or printk(XENLOG_ERR, ...)?

> +    }
> +
> +    hvm_inject_hw_exception(TRAP_gp_fault, 0);
> +    return 0;
> +}
> +
> +static void pvh_user_cpuid(struct cpu_user_regs *regs)
> +{
> +    unsigned int eax, ebx, ecx, edx;
> +
> +    asm volatile ( "cpuid"
> +              : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
> +              : "0" (regs->eax), "2" (regs->rcx) );
> +

Could you use 'cpuid' macro defined in processor.h?

> +    regs->rax = eax; regs->rbx = ebx; regs->rcx = ecx; regs->rdx = edx;

That would make this:

        cpuid(regs->eax, &regs->eax, &regs->ebx, &regs->ecx, &regs->edx) ?


Or is that not an option since you are re-using the eax register? If so, could
you do:

        unsigned int op = regs->eax;
        cpuid(op, &regs->eax, &regs->ebx, &regs->ecx, &regs->edx) ?

> +}
> +
> +/* 
> + * Main exit handler for PVH case. Called from vmx_vmexit_handler(). 
> + * Note: in vmx_asm_vmexit_handler, rip/rsp/eflags are updated in regs{} 
> + */
> +void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs)
> +{
> +    unsigned long exit_qualification;
> +    unsigned int exit_reason = __vmread(VM_EXIT_REASON);
> +    int rc=0, ccpu = smp_processor_id();
> +    struct vcpu *vp = current;
> +
> +    dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx 
> CR0:%lx\n", 
> +          ccpu, exit_reason, regs->rip, regs->rsp, regs->rflags, 
> +          __vmread(GUEST_CR0));
> +
> +    /* for guest_kernel_mode() */
> +    regs->cs = __vmread(GUEST_CS_SELECTOR); 
> +
> +    switch ( (uint16_t)exit_reason )

Huh? Why the 'uint16_t'? Ah, b/c vmx_vmexit_handler does it too. I wonder why?

> +    {
> +        case EXIT_REASON_EXCEPTION_NMI:      /* 0 */
> +            rc = vmxit_exception(regs);
> +            break;
> +            
> +        case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */
> +        case EXIT_REASON_MCE_DURING_VMENTRY: /* 41 */
> +            break;              /* handled in vmx_vmexit_handler() */
> +
> +        case EXIT_REASON_TRIPLE_FAULT:       /* 2 */
> +        {
> +            printk("PVH:Triple Flt:[%d] RIP:%lx RSP:%lx EFLAGS:%lx 
> CR3:%lx\n",
> +                   ccpu, regs->rip, regs->rsp, regs->rflags, 
> +                   __vmread(GUEST_CR3));
> +
> +            rc = 1;
> +            break;
> +        }
> +        case EXIT_REASON_PENDING_VIRT_INTR:  /* 7 */
> +        {
> +            struct vcpu *v = current;
> +
> +            /* Disable the interrupt window. */
> +            v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +            __vmwrite(CPU_BASED_VM_EXEC_CONTROL, 
> v->arch.hvm_vmx.exec_control);
> +            break;
> +        }
> +
> +        case EXIT_REASON_CPUID:              /* 10 */
> +        {
> +            if ( guest_kernel_mode(vp, regs) ) {
> +                pv_cpuid(regs);
> +            } else
> +                pvh_user_cpuid(regs);
> +
> +            vmx_update_guest_eip();
> +            dbgp2("cpuid:%ld RIP:%lx\n", regs->eax, regs->rip);
> +            break;
> +        }
> +
> +        case EXIT_REASON_HLT:             /* 12 */
> +        {
> +            vmx_update_guest_eip();
> +            hvm_hlt(regs->eflags);
> +            break;
> +        }
> +
> +        case EXIT_REASON_INVLPG:             /* 14 */
> +            rc = vmxit_invlpg();
> +            break;
> +
> +        case EXIT_REASON_RDTSC:              /* 16 */
> +            rc = 1;
> +            break;
> +
> +        case EXIT_REASON_VMCALL:             /* 18 */
> +            rc = vmxit_vmcall(regs);
> +            break;
> +
> +        case EXIT_REASON_CR_ACCESS:          /* 28 */
> +            rc = vmxit_cr_access(regs);
> +            break;
> +
> +        case EXIT_REASON_DR_ACCESS:          /* 29 */
> +        {
> +            exit_qualification = __vmread(EXIT_QUALIFICATION);
> +            vmx_dr_access(exit_qualification, regs);
> +            break;
> +        }
> +
> +        case EXIT_REASON_IO_INSTRUCTION:
> +            vmxit_io_instr(regs);
> +            break;
> +
> +        case EXIT_REASON_MSR_READ:           /* 31 */
> +            rc = vmxit_msr_read(regs);
> +            break;
> +
> +        case EXIT_REASON_MSR_WRITE:          /* 32 */
> +            rc = vmxit_msr_write(regs);
> +            break;
> +
> +        case EXIT_REASON_MONITOR_TRAP_FLAG:  /* 37 */
> +            rc = vmxit_mtf(regs);
> +            break;
> +
> +        case EXIT_REASON_EPT_VIOLATION:
> +        {
> +            paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
> +            exit_qualification = __vmread(EXIT_QUALIFICATION);
> +            rc = pvh_ept_handle_violation(exit_qualification, gpa, regs);
> +            break;
> +        }
> +        default: 
> +            rc = 1;

> +            printk("PVH: Unexpected exit reason:%d 0x%x\n", exit_reason, 
> +                   exit_reason);
> +    }
> +    if (rc) {

Odd syntax.

> +        exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        printk("PVH: [%d] exit_reas:%d 0x%x qual:%ld 0x%lx cr0:0x%016lx\n", 
> +             ccpu, exit_reason, exit_reason, exit_qualification,
> +             exit_qualification, __vmread(GUEST_CR0));
> +        printk("PVH: [%d] RIP:%lx RSP:%lx\n", ccpu, regs->rip, regs->rsp);
> +        domain_crash_synchronous();
> +    }
> +}
> +
> +/* 
> + * Sets info for non boot vcpu. VCPU 0 context is set by library.
> + * We use this for nonboot vcpu in which case the call comes from the 
> + * kernel cpu_initialize_context().
> + */
> +int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp)
> +{
> +    if (v->vcpu_id == 0)
> +        return 0;
> +
> +    vmx_vmcs_enter(v);
> +    __vmwrite(GUEST_GDTR_BASE, ctxtp->u.pvh.gdtaddr);
> +    __vmwrite(GUEST_GDTR_LIMIT, ctxtp->u.pvh.gdtsz);
> +    __vmwrite(GUEST_GS_BASE, ctxtp->gs_base_user);
> +
> +    __vmwrite(GUEST_CS_SELECTOR, ctxtp->user_regs.cs);
> +    __vmwrite(GUEST_DS_SELECTOR, ctxtp->user_regs.ds);
> +    __vmwrite(GUEST_ES_SELECTOR, ctxtp->user_regs.es);
> +    __vmwrite(GUEST_SS_SELECTOR, ctxtp->user_regs.ss);
> +    __vmwrite(GUEST_GS_SELECTOR, ctxtp->user_regs.gs);
> +
> +    if ( vmx_add_guest_msr(MSR_SHADOW_GS_BASE) )
> +        return -EINVAL;
> +
> +    vmx_write_guest_msr(MSR_SHADOW_GS_BASE, ctxtp->gs_base_kernel);
> +
> +    vmx_vmcs_exit(v);
> +    return 0;
> +}
> +
> +int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
> +                            const struct cpu_user_regs *regs,
> +                            unsigned long *base, unsigned long *limit,
> +                            unsigned int *ar)
> +{

How come you don't want to follow the signature of the existing read_descriptor?
It has far fewer arguments?

> +    unsigned int tmp_ar = 0;
> +    BUG_ON(v!=current);
> +    BUG_ON(!is_pvh_vcpu(v));
> +
> +    if (sel == (unsigned int)regs->cs) {
> +        *base = __vmread(GUEST_CS_BASE);
> +        *limit = __vmread(GUEST_CS_LIMIT);
> +        tmp_ar = __vmread(GUEST_CS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->ds) {
> +        *base = __vmread(GUEST_DS_BASE);
> +        *limit = __vmread(GUEST_DS_LIMIT);
> +        tmp_ar = __vmread(GUEST_DS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->ss) {
> +        *base = __vmread(GUEST_SS_BASE);
> +        *limit = __vmread(GUEST_SS_LIMIT);
> +        tmp_ar = __vmread(GUEST_SS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->gs) {
> +        *base = __vmread(GUEST_GS_BASE);
> +        *limit = __vmread(GUEST_GS_LIMIT);
> +        tmp_ar = __vmread(GUEST_GS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->fs) {
> +        *base = __vmread(GUEST_FS_BASE);
> +        *limit = __vmread(GUEST_FS_LIMIT);
> +        tmp_ar = __vmread(GUEST_FS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->es) {
> +        *base = __vmread(GUEST_ES_BASE);
> +        *limit = __vmread(GUEST_ES_LIMIT);
> +        tmp_ar = __vmread(GUEST_ES_AR_BYTES); 
> +    } else {
> +        printk("Unmatched segment selector:%d\n", sel);
> +        return 0;
> +    }
> +
> +    if (tmp_ar & X86_SEG_AR_CS_LM_ACTIVE) {           /* x86 mess!! */
> +        *base = 0UL;
> +        *limit = ~0UL;
> +    }
> +    /* Fixup ar so that it looks the same as in native mode */
> +    *ar = (tmp_ar << 8);

I am not sure I follow. Are you doing this to fit in the other bits
of the segment (the upper limit)? Shouldn't the caller of 
vmx_pvh_read_descriptor do this?

> +    return 1;
> +}
> +
> diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h 
> b/xen/include/asm-x86/hvm/vmx/vmx.h
> index a742e16..5679e8d 100644
> --- a/xen/include/asm-x86/hvm/vmx/vmx.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmx.h
> @@ -189,6 +189,7 @@ void vmx_update_secondary_exec_control(struct vcpu *v);
>   * Access Rights
>   */
>  #define X86_SEG_AR_SEG_TYPE     0xf        /* 3:0, segment type */
> +#define X86_SEG_AR_SEG_TYPE_CODE (1u << 3) /* code (vs data) segment */
>  #define X86_SEG_AR_DESC_TYPE    (1u << 4)  /* 4, descriptor type */
>  #define X86_SEG_AR_DPL          0x60       /* 6:5, descriptor privilege 
> level */
>  #define X86_SEG_AR_SEG_PRESENT  (1u << 7)  /* 7, segment present */
> @@ -442,10 +443,14 @@ void ept_p2m_uninit(struct p2m_domain *p2m);
>  
>  void ept_walk_table(struct domain *d, unsigned long gfn);
>  void setup_ept_dump(void);
> -

Stray change.
>  void vmx_update_guest_eip(void);
>  void vmx_dr_access(unsigned long exit_qualification,struct cpu_user_regs 
> *regs);
>  void vmx_do_extint(struct cpu_user_regs *regs);
> +void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs);
> +int  vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp);
> +int  vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
> +                         const struct cpu_user_regs *regs, unsigned long 
> *base,
> +                         unsigned long *limit, unsigned int *ar);
>  
>  int alloc_p2m_hap_data(struct p2m_domain *p2m);
>  void free_p2m_hap_data(struct p2m_domain *p2m);
> diff --git a/xen/include/asm-x86/pvh.h b/xen/include/asm-x86/pvh.h
> new file mode 100644
> index 0000000..73e59d3
> --- /dev/null
> +++ b/xen/include/asm-x86/pvh.h
> @@ -0,0 +1,6 @@
> +#ifndef __ASM_X86_PVH_H__
> +#define __ASM_X86_PVH_H__
> +
> +int pvh_do_hypercall(struct cpu_user_regs *regs);
> +
> +#endif  /* __ASM_X86_PVH_H__ */
> -- 
> 1.7.2.3
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxx
> http://lists.xen.org/xen-devel
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.