[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c



On Fri, Mar 15, 2013 at 05:41:45PM -0700, Mukesh Rathor wrote:
> The heart of this patch is vmx exit handler for PVH guest. It is nicely
>  isolated in a separate module as preferred by most of us. A call to it
>  is added to vmx_pvh_vmexit_handler().
> 
> Changes in V2:
>   - Move non VMX generic code to arch/x86/hvm/pvh.c
>   - Remove get_gpr_ptr() and use existing decode_register() instead.
>   - Defer call to pvh vmx exit handler until interrupts are enabled. So the
>     caller vmx_pvh_vmexit_handler() handles the NMI/EXT-INT/TRIPLE_FAULT now.
>   - Fix the CPUID (wrongly) clearing bit 24. No need to do this now, set
>     the correct feature bits in CR4 during vmcs creation.
>   - Fix few hard tabs.
> 
> Signed-off-by: Mukesh Rathor <mukesh.rathor@xxxxxxxxxx>
> ---
>  xen/arch/x86/hvm/Makefile         |    3 +-
>  xen/arch/x86/hvm/pvh.c            |  220 ++++++++++++++
>  xen/arch/x86/hvm/vmx/Makefile     |    1 +
>  xen/arch/x86/hvm/vmx/vmx.c        |    7 +
>  xen/arch/x86/hvm/vmx/vmx_pvh.c    |  587 
> +++++++++++++++++++++++++++++++++++++
>  xen/include/asm-x86/hvm/vmx/vmx.h |    7 +-
>  xen/include/asm-x86/pvh.h         |    6 +
>  7 files changed, 829 insertions(+), 2 deletions(-)
>  create mode 100644 xen/arch/x86/hvm/pvh.c
>  create mode 100644 xen/arch/x86/hvm/vmx/vmx_pvh.c
>  create mode 100644 xen/include/asm-x86/pvh.h
> 
> diff --git a/xen/arch/x86/hvm/Makefile b/xen/arch/x86/hvm/Makefile
> index eea5555..65ff9f3 100644
> --- a/xen/arch/x86/hvm/Makefile
> +++ b/xen/arch/x86/hvm/Makefile
> @@ -22,4 +22,5 @@ obj-y += vlapic.o
>  obj-y += vmsi.o
>  obj-y += vpic.o
>  obj-y += vpt.o
> -obj-y += vpmu.o
> \ No newline at end of file
> +obj-y += vpmu.o
> +obj-y += pvh.o
> diff --git a/xen/arch/x86/hvm/pvh.c b/xen/arch/x86/hvm/pvh.c
> new file mode 100644
> index 0000000..c12c4b7
> --- /dev/null
> +++ b/xen/arch/x86/hvm/pvh.c
> @@ -0,0 +1,220 @@
> +/*
> + * Copyright (C) 2013, Mukesh Rathor, Oracle Corp.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License v2 as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this program; if not, write to the
> + * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
> + * Boston, MA 021110-1307, USA.


Don't use the address anymore. They might change their location
(if they haven't already - it's a pretty pricey location).

> + */
> +
> +#include <xen/hypercall.h>
> +#include <xen/guest_access.h>
> +#include <asm/p2m.h>
> +#include <asm/traps.h>
> +#include <asm/hvm/vmx/vmx.h>
> +#include <public/sched.h>
> +
> +static int pvh_grant_table_op(
> +              unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int 
> count)
> +{
> +    switch (cmd)
> +    {
> +        case GNTTABOP_map_grant_ref:
> +        case GNTTABOP_unmap_grant_ref:
> +        case GNTTABOP_setup_table:
> +        case GNTTABOP_copy:
> +        case GNTTABOP_query_size:
> +        case GNTTABOP_set_version:
> +            return do_grant_table_op(cmd, uop, count);
> +    }
> +    return -ENOSYS;
> +}
> +
> +static long pvh_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
> +{
> +    long rc = -ENOSYS;
> +
> +    switch ( cmd )
> +    {
> +        case VCPUOP_register_runstate_memory_area:
> +        case VCPUOP_get_runstate_info:
> +        case VCPUOP_set_periodic_timer:
> +        case VCPUOP_stop_periodic_timer:
> +        case VCPUOP_set_singleshot_timer:
> +        case VCPUOP_stop_singleshot_timer:
> +        case VCPUOP_is_up:
> +        case VCPUOP_up:
> +        case VCPUOP_initialise:
> +            rc = do_vcpu_op(cmd, vcpuid, arg);
> +
> +            /* pvh boot vcpu setting context for bringing up smp vcpu */
> +            if (cmd == VCPUOP_initialise)
> +                vmx_vmcs_enter(current);
> +    }
> +    return rc;
> +}
> +
> +static long pvh_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
> +{
> +    switch ( cmd )
> +    {
> +        case PHYSDEVOP_map_pirq:
> +        case PHYSDEVOP_unmap_pirq:
> +        case PHYSDEVOP_eoi:
> +        case PHYSDEVOP_irq_status_query:
> +        case PHYSDEVOP_get_free_pirq:
> +            return do_physdev_op(cmd, arg);
> +
> +        default:
> +            if ( IS_PRIV(current->domain) )
> +                return do_physdev_op(cmd, arg);
> +    }
> +    return -ENOSYS;
> +}
> +
> +static long do_pvh_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
> +{
> +    long rc = -EINVAL;
> +    struct xen_hvm_param harg;
> +    struct domain *d;
> +
> +    if ( copy_from_guest(&harg, arg, 1) )
> +        return -EFAULT;
> +
> +    rc = rcu_lock_target_domain_by_id(harg.domid, &d);
> +    if ( rc != 0 )
> +        return rc;
> +
> +    if (is_hvm_domain(d)) {

Formatting is odd compared to the other 'if ( something )' checks.

> +        /* pvh dom0 is building an hvm guest */
> +        rcu_unlock_domain(d);
> +        return do_hvm_op(op, arg);  
> +    }
> +
> +    rc = -ENOSYS;
> +    if (op == HVMOP_set_param) {
> +        if (harg.index == HVM_PARAM_CALLBACK_IRQ) {
> +            struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
> +            uint64_t via = harg.value;
> +            uint8_t via_type = (uint8_t)(via >> 56) + 1;
> +
> +            if (via_type == HVMIRQ_callback_vector) {
> +                hvm_irq->callback_via_type = HVMIRQ_callback_vector;
> +                hvm_irq->callback_via.vector = (uint8_t)via;
> +                rc = 0;
> +            }

Perhaps it would make sense to also print out the -ENOSYS
ones for development purposes?

Say:

gdprintk(XENLOG_DEBUG, "d%d: %s setting HVMOP_set_param[%d] - ENOSYS\n",
d->domain_id, __func__, (int)harg.index);

And for the ops case:

gdprintk(XENLOG_DEBUG, "d%d: %s - ENOSYS\n", d->domain_id, __func__)
? 
        
> +        }
> +    }
> +    rcu_unlock_domain(d);
> +    return rc;
> +}
> +
> +typedef unsigned long pvh_hypercall_t(
> +    unsigned long, unsigned long, unsigned long, unsigned long, unsigned 
> long,
> +    unsigned long);

Is there no way to re-use something standard? I am not sure what is 'PVH' specific
about it? It looks like a garden-variety normal hypercall type.

Perhaps just copy-n-paste the hvm_hypercall_t for right now? And then later they
can be exported in a common header file?

> +
> +int hcall_a[NR_hypercalls];
> +
> +static pvh_hypercall_t *pvh_hypercall64_table[NR_hypercalls] = {
> +    [__HYPERVISOR_platform_op]     = (pvh_hypercall_t *)do_platform_op,
> +    [__HYPERVISOR_memory_op]       = (pvh_hypercall_t *)do_memory_op,

Could you include a comment saying why timers are not good?

> +    /* [__HYPERVISOR_set_timer_op]     = (pvh_hypercall_t *)do_set_timer_op, 
> */
> +    [__HYPERVISOR_xen_version]     = (pvh_hypercall_t *)do_xen_version,
> +    [__HYPERVISOR_console_io]      = (pvh_hypercall_t *)do_console_io,
> +    [__HYPERVISOR_grant_table_op]  = (pvh_hypercall_t *)pvh_grant_table_op,
> +    [__HYPERVISOR_vcpu_op]         = (pvh_hypercall_t *)pvh_vcpu_op,
> +    [__HYPERVISOR_mmuext_op]       = (pvh_hypercall_t *)do_mmuext_op,
> +    [__HYPERVISOR_xsm_op]          = (pvh_hypercall_t *)do_xsm_op,
> +    [__HYPERVISOR_sched_op]        = (pvh_hypercall_t *)do_sched_op,
> +    [__HYPERVISOR_event_channel_op]= (pvh_hypercall_t *)do_event_channel_op,
> +    [__HYPERVISOR_physdev_op]      = (pvh_hypercall_t *)pvh_physdev_op,
> +    [__HYPERVISOR_hvm_op]          = (pvh_hypercall_t *)do_pvh_hvm_op,

Most of these follow the 'do_X'. Then for the pvh ones you have:
'pvh_X', with one exception: do_pvh_hvm_op ?

Should it be just 'pvh_hvm_op' instead?


> +    [__HYPERVISOR_sysctl]          = (pvh_hypercall_t *)do_sysctl,
> +    [__HYPERVISOR_domctl]          = (pvh_hypercall_t *)do_domctl
> +};
> +
> +/* fixme: Do we need to worry about this and slow things down in this path? 
> */
> +static int pvh_long_mode_enabled(void)
> +{
> +    /* A 64bit linux guest should always run in this mode with CS.L selecting
> +     * either 64bit mode or 32bit compat mode */

I think they are called 'native' or '64-bit mode' per the AMD spec?

> +    return 1;

Well, it always seems to return 1? So this ends up being mostly a nop when
the compiler is done?

Or did the earlier version have the correct checks to make sure that we
are in Long Mode?


> +}
> +
> +/* Check if hypercall is valid 
> + * Returns: 0 if hcall is not valid with eax set to the errno to ret to guest


Huh? Return 0 on invalid case? That looks odd. Why not anything but zero?

Or perhaps just make it return a bool? That would be easier
to grok.

> + */
> +static int hcall_valid(struct cpu_user_regs *regs)
> +{
> +    struct segment_register sreg;
> +
> +    if (!pvh_long_mode_enabled()) 
> +    {
> +        gdprintk(XENLOG_ERR, "PVH Error: Expected long mode set\n");

Shouldn't this set:
        regs->eax = -ENOSYS?

> +        return 1;
> +    }
> +    hvm_get_segment_register(current, x86_seg_ss, &sreg);
> +    if ( unlikely(sreg.attr.fields.dpl == 3) ) 
> +    {
> +        regs->eax = -EPERM;
> +        return 0;
> +    }
> +
> +    /* domU's are not allowed following hcalls */

And that is b/c we can't handle it yet? In which case you should
prefix it with a TODO or XXX to remember this.

> +    if ( !IS_PRIV(current->domain) &&
> +         (regs->eax == __HYPERVISOR_xsm_op ||
> +          regs->eax == __HYPERVISOR_platform_op ||
> +          regs->eax == __HYPERVISOR_domctl) ) {     /* for privcmd mmap */
> +
> +        regs->eax = -EPERM;
> +        return 0;
> +    }
> +    return 1;
> +}
> +
> +int pvh_do_hypercall(struct cpu_user_regs *regs)
> +{
> +    uint32_t hnum = regs->eax;
> +
> +    if ( hnum >= NR_hypercalls || pvh_hypercall64_table[hnum] == NULL ) 
> +    {
> +        gdprintk(XENLOG_WARNING, "PVH: Unimplemented HCALL:%d. Returning " 
> +                 "-ENOSYS. domid:%d IP:%lx SP:%lx\n", 
> +                 hnum, current->domain->domain_id, regs->rip, regs->rsp);
> +        regs->eax = -ENOSYS;
> +        vmx_update_guest_eip();
> +        return HVM_HCALL_completed;
> +    }
> +

> +    if ( regs->eax == __HYPERVISOR_sched_op && regs->rdi == SCHEDOP_shutdown 
> ) 

Oh? We can't shut down the guest? How come?
> +    {
> +        regs->eax = -ENOSYS;
> +        vmx_update_guest_eip();
> +
> +        /* PVH fixme: show_guest_stack() from domain crash causes PF */
> +        domain_crash_synchronous();
> +        return HVM_HCALL_completed;
> +    }
> +
> +    if ( !hcall_valid(regs) )
> +        return HVM_HCALL_completed;
> +
> +    current->arch.hvm_vcpu.hcall_preempted = 0;
> +    regs->rax = pvh_hypercall64_table[hnum](regs->rdi, regs->rsi, regs->rdx,
> +                                            regs->r10, regs->r8, regs->r9);
> +
> +    if ( current->arch.hvm_vcpu.hcall_preempted )
> +        return HVM_HCALL_preempted;
> +         
> +    return HVM_HCALL_completed;
> +}
> +
> diff --git a/xen/arch/x86/hvm/vmx/Makefile b/xen/arch/x86/hvm/vmx/Makefile
> index 373b3d9..8b71dae 100644
> --- a/xen/arch/x86/hvm/vmx/Makefile
> +++ b/xen/arch/x86/hvm/vmx/Makefile
> @@ -5,3 +5,4 @@ obj-y += vmcs.o
>  obj-y += vmx.o
>  obj-y += vpmu_core2.o
>  obj-y += vvmx.o
> +obj-y += vmx_pvh.o
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index 194c87b..5503fc9 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -1529,6 +1529,8 @@ static struct hvm_function_table __read_mostly 
> vmx_function_table = {
>      .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled,
>      .process_isr          = vmx_process_isr,
>      .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
> +    .pvh_set_vcpu_info    = vmx_pvh_set_vcpu_info,
> +    .pvh_read_descriptor  = vmx_pvh_read_descriptor,
>  };
>  
>  struct hvm_function_table * __init start_vmx(void)
> @@ -2364,6 +2366,11 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>      if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
>          return vmx_failed_vmentry(exit_reason, regs);
>  
> +    if ( is_pvh_vcpu(v) ) {
> +        vmx_pvh_vmexit_handler(regs);
> +        return;
> +    }
> +
>      if ( v->arch.hvm_vmx.vmx_realmode )
>      {
>          /* Put RFLAGS back the way the guest wants it */
> diff --git a/xen/arch/x86/hvm/vmx/vmx_pvh.c b/xen/arch/x86/hvm/vmx/vmx_pvh.c
> new file mode 100644
> index 0000000..14ca0f6
> --- /dev/null
> +++ b/xen/arch/x86/hvm/vmx/vmx_pvh.c
> @@ -0,0 +1,587 @@
> +/*
> + * Copyright (C) 2013, Mukesh Rathor, Oracle Corp.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License v2 as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this program; if not, write to the
> + * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
> + * Boston, MA 021110-1307, USA.

Ditto. No address please.

> + */
> +
> +#include <xen/hypercall.h>
> +#include <xen/guest_access.h>
> +#include <asm/p2m.h>
> +#include <asm/traps.h>
> +#include <asm/hvm/vmx/vmx.h>
> +#include <public/sched.h>
> +#include <asm/pvh.h>
> +

This should probably be #ifdef DEBUG
> +volatile int pvhdbg=0;

And I think you can remove the 'volatile' part?


> +#define dbgp1(...) {(pvhdbg==1) ? printk(__VA_ARGS__):0;}
> +#define dbgp2(...) {(pvhdbg==2) ? printk(__VA_ARGS__):0;}

#endif
> +
> +
> +static void read_vmcs_selectors(struct cpu_user_regs *regs)
> +{
> +    regs->cs = __vmread(GUEST_CS_SELECTOR);
> +    regs->ss = __vmread(GUEST_SS_SELECTOR);
> +    regs->ds = __vmread(GUEST_DS_SELECTOR);
> +    regs->es = __vmread(GUEST_ES_SELECTOR);
> +    regs->gs = __vmread(GUEST_GS_SELECTOR);
> +    regs->fs = __vmread(GUEST_FS_SELECTOR);
> +}
> +
> +/* returns : 0 success */

What are the non-success criteria?

> +static int vmxit_msr_read(struct cpu_user_regs *regs)
> +{
> +    int rc=1;

X86EMUL_UNHANDLEABLE ?

> +
> +    u64 msr_content = 0;
> +    switch (regs->ecx)
> +    {
> +        case MSR_IA32_MISC_ENABLE:
> +        {
> +            rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
> +            msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
> +                           MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
> +            break;
> +        }
> +        default:
> +        {
> +            /* fixme: see hvm_msr_read_intercept() */
> +            rdmsrl(regs->ecx, msr_content);  
> +            break;
> +        }
> +    }
> +    regs->eax = (uint32_t)msr_content;
> +    regs->edx = (uint32_t)(msr_content >> 32);
> +    vmx_update_guest_eip();
> +    rc = 0;
> +
> +    dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n", regs->ecx, 
> regs->eax, 
> +          regs->edx, regs->rip, regs->rsp);
> +    return rc;

This function looks to return 0 (or X86EMUL_OKAY) regardless of the MSR?
Perhaps just make it return X86EMUL_OKAY without bothering to use 'rc'?

> +}
> +
> +/* returns : 0 success */
> +static int vmxit_msr_write(struct cpu_user_regs *regs)
> +{
> +    uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
> +    int rc=1;

X86EMUL_UNHANDLEABLE

> +
> +    dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n", regs->ecx, 
> +          regs->eax,regs->edx);
> +
> +    if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY ) {
> +        vmx_update_guest_eip();
> +        rc = 0;

X86EMUL_OKAY

> +    }
> +    return rc;
> +}
> +
> +/* Returns: rc == 0: handled the MTF vmexit */
> +static int vmxit_mtf(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *vp = current;
> +    int rc=1, ss=vp->arch.hvm_vcpu.single_step; 
> +
> +    vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
> +    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vp->arch.hvm_vmx.exec_control);
> +    vp->arch.hvm_vcpu.single_step = 0;
> +
> +    if ( vp->domain->debugger_attached && ss ) {
> +        domain_pause_for_debugger();
> +        rc = 0;
> +    }
> +    return rc;
> +}
> +
> +static int vmxit_int3(struct cpu_user_regs *regs)
> +{
> +    int ilen = vmx_get_instruction_length();
> +    struct vcpu *vp = current;
> +    struct hvm_trap trap_info = { 
> +                        .vector = TRAP_int3, 
> +                        .type = X86_EVENTTYPE_SW_EXCEPTION,
> +                        .error_code = HVM_DELIVER_NO_ERROR_CODE, 
> +                        .insn_len = ilen
> +    };
> +
> +    regs->eip += ilen;
> +
> +    /* gdbsx or another debugger. Never pause dom0 */
> +    if ( vp->domain->domain_id != 0 && guest_kernel_mode(vp, regs) )
> +    {
> +        dbgp1("[%d]PVH: domain pause for debugger\n", smp_processor_id());
> +        current->arch.gdbsx_vcpu_event = TRAP_int3;
> +        domain_pause_for_debugger();
> +        return 0;
> +    }
> +
> +    regs->eip -= ilen;
> +    hvm_inject_trap(&trap_info);
> +
> +    return 0;
> +}
> +
> +static int vmxit_invalid_op(struct cpu_user_regs *regs)
> +{
> +    ulong addr=0;
> +
> +    if ( guest_kernel_mode(current, regs) || 
> +         (addr = emulate_forced_invalid_op(regs)) == 0 )
> +    {
> +        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
> +        return 0;
> +    } 
> +
> +    if (addr != EXCRET_fault_fixed)
> +        hvm_inject_page_fault(0, addr);
> +
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: handled the exception/NMI */
> +static int vmxit_exception(struct cpu_user_regs *regs)
> +{
> +    unsigned int vector = (__vmread(VM_EXIT_INTR_INFO)) & 
> INTR_INFO_VECTOR_MASK;
> +    int rc=1; 
> +    struct vcpu *vp = current;
> +
> +    dbgp2(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector, 
> +          __vmread(GUEST_CS_SELECTOR), regs->eip);
> +
> +    if (vector == TRAP_debug) {
> +        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        write_debugreg(6, exit_qualification | 0xffff0ff0);
> +        rc = 0;
> +        
> +        /* gdbsx or another debugger */
> +        if ( vp->domain->domain_id != 0 &&    /* never pause dom0 */
> +             guest_kernel_mode(vp, regs) &&  vp->domain->debugger_attached )
> +        {
> +            domain_pause_for_debugger();
> +        } else {
> +            hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
> +        }
> +    } 
> +    if (vector == TRAP_int3) {
> +        rc = vmxit_int3(regs);
> +
> +    }  else if (vector == TRAP_invalid_op) {
> +        rc = vmxit_invalid_op(regs);
> +
> +    } else if (vector == TRAP_no_device) {
> +        hvm_funcs.fpu_dirty_intercept();  /* calls vmx_fpu_dirty_intercept */
> +        rc = 0;
> +
> +    } else if (vector == TRAP_gp_fault) {
> +        regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
> +        /* hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code); */

So how come we don't inject it in?

> +        rc = 1;

Huh? Not X86_EMUL_OK?

> +
> +    } else if (vector == TRAP_page_fault) {
> +        printk("PVH: Unexpected vector page_fault. IP:%lx\n", regs->eip);

printk(.. some prefix.

> +        rc = 1;
> +    }
> +    if (rc)
> +        printk("PVH: Unhandled trap vector:%d. IP:%lx\n", vector, regs->eip);
> +
> +    return rc;
> +}
> +
> +static int vmxit_invlpg(void)
> +{
> +    ulong vaddr = __vmread(EXIT_QUALIFICATION);
> +
> +    vmx_update_guest_eip();
> +    vpid_sync_vcpu_gva(current, vaddr);
> +    return 0;
> +}
> +
> +static int vmxit_vmcall(struct cpu_user_regs *regs)
> +{
> +    if ( pvh_do_hypercall(regs) != HVM_HCALL_preempted)
> +        vmx_update_guest_eip();
> +
> +    return 0;;
> +}
> +
> +/* Returns: rc == 0: success */
> +static int access_cr0(struct cpu_user_regs *regs, uint acc_typ, 
> +                      uint64_t *regp)
> +{
> +    struct vcpu *vp = current;
> +
> +    if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
> +    {
> +        unsigned long new_cr0 = *regp;
> +        unsigned long old_cr0 = __vmread(GUEST_CR0);
> +
> +        dbgp2("PVH:writing to CR0. RIP:%lx val:0x%lx\n", regs->rip, *regp);
> +        if ( (u32)new_cr0 != new_cr0 )
> +        {
> +            HVM_DBG_LOG(DBG_LEVEL_1, 
> +                        "Guest setting upper 32 bits in CR0: %lx", new_cr0);
> +            return 1;
> +        }
> +
> +        new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
> +        /* ET is reserved and should be always be 1. */
> +        new_cr0 |= X86_CR0_ET;
> +
> +        /* pvh cannot change to real mode */
> +        if ( (new_cr0 & (X86_CR0_PE|X86_CR0_PG)) != (X86_CR0_PG|X86_CR0_PE) 
> ) {
> +            printk("PVH attempting to turn off PE/PG. CR0:%lx\n", new_cr0);
> +            return 1;
> +        }
> +        /* TS going from 1 to 0 */
> +        if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 & X86_CR0_TS)==0) )
> +            vmx_fpu_enter(vp);

Does this really happen? I thought in the PV mode you would be using the 
hypercalls
for the fpu swap? Should it be print out an error saying something to the 
effect:

        "PVH guest is using cr0 instead of the paravirt lazy FPU switch!" and
        include the EIP?

> +
> +        vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = new_cr0;
> +        __vmwrite(GUEST_CR0, new_cr0);
> +        __vmwrite(CR0_READ_SHADOW, new_cr0);
> +    } else {
> +        *regp = __vmread(GUEST_CR0);
> +    } 
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: success */
> +static int access_cr4(struct cpu_user_regs *regs, uint acc_typ, 
> +                      uint64_t *regp)
> +{
> +    if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
> +    {
> +        u64 old_cr4 = __vmread(GUEST_CR4);
> +
> +        if ( (old_cr4 ^ (*regp)) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) 
> )
> +            vpid_sync_all();
> +
> +        /* pvh_verify_cr4_wr(*regp)); */

Needed anymore?

> +        __vmwrite(GUEST_CR4, *regp);
> +    } else {
> +        *regp = __vmread(GUEST_CR4);
> +    } 
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: success */
> +static int vmxit_cr_access(struct cpu_user_regs *regs)
> +{
> +    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +    uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
> +    int cr, rc = 1;
> +
> +    switch ( acc_typ )
> +    {
> +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
> +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
> +        {
> +            uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
> +            uint64_t *regp = decode_register(gpr, regs, 0);
> +            cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
> +
> +            if (regp == NULL)
> +                break;
> +
> +            /* pl don't embed switch statements */
> +            if (cr == 0)
> +                rc = access_cr0(regs, acc_typ, regp);
> +            else if (cr == 3) {
> +                printk("PVH: d%d: unexpected cr3 access vmexit. rip:%lx\n", 
> +                       current->domain->domain_id, regs->rip);
> +                domain_crash_synchronous();

Uh? Why wouldn't we want to handle it?

> +            } else if (cr == 4) 
> +                rc = access_cr4(regs, acc_typ, regp);
> +
> +            if (rc == 0)
> +                vmx_update_guest_eip();
> +            break;
> +        }
> +        case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
> +        {
> +            struct vcpu *vp = current;
> +            unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] & ~X86_CR0_TS;
> +            vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = cr0;
> +            vmx_fpu_enter(vp);
> +            __vmwrite(GUEST_CR0, cr0);
> +            __vmwrite(CR0_READ_SHADOW, cr0);
> +            vmx_update_guest_eip();
> +            rc = 0;
> +        }


> +    }
> +    return rc;
> +}
> +
> +/* NOTE: a PVH sets IOPL natively by setting bits in the eflags and not by
> + * hypercalls used by a PV */


Ahh, so there are now a number of PV hypercall families that should not be used:
 - PHYSDEVOP_set_iopl (which I think in your earlier patch you did not check
   for?)
 - HYPERVISOR_update_va_mapping (for all the MMU stuff)
 - HYPERVISOR_update_descriptor (segments and such)
 - HYPERVISOR_fpu_taskswitch (you are doing it in the above function)
 - HYPERVISOR_set_trap_table (again, LDT and GDT are now done via HVM)
 - HYPERVISOR_set_segment_base
 - HYPERVISOR_set_gdt
 - HYPERVISOR_tmem
 .. and host of other.

This should be documented somewhere in docs?
Perhaps in docs/misc/pvh.txt and just outline which ones are not to be used
anymore?


> +static int vmxit_io_instr(struct cpu_user_regs *regs)
> +{
> +    int curr_lvl;
> +    int requested = (regs->rflags >> 12) & 3;
> +
> +    read_vmcs_selectors(regs);
> +    curr_lvl = regs->cs & 3;
> +
> +    if (requested >= curr_lvl && emulate_privileged_op(regs)) 
> +        return 0;
> +
> +    hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
> +    return 0;
> +}
> +
> +static int pvh_ept_handle_violation(unsigned long qualification, 
> +                                    paddr_t gpa, struct cpu_user_regs *regs)
> +{
> +    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
> +    p2m_type_t p2mt;
> +    mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn, &p2mt);
> +
> +    gdprintk(XENLOG_ERR, "Dom:%d EPT violation %#lx (%c%c%c/%c%c%c), "
> +             "gpa %#"PRIpaddr", mfn %#lx, type %i. IP:0x%lx RSP:0x%lx\n",
> +             current->domain->domain_id, qualification, 
> +             (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
> +             (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
> +             (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
> +             (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
> +             (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
> +             (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
> +             gpa, mfn_x(mfn), p2mt, regs->rip, regs->rsp);
> +             
> +    ept_walk_table(current->domain, gfn);
> +
> +    if ( qualification & EPT_GLA_VALID )
> +    {
> +        gla = __vmread(GUEST_LINEAR_ADDRESS);
> +        gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);

AHA! There are gdprintk calls in your code! Could you please replace most (all)
of the printks you have with either gdprintk or printk(XENLOG_ERR, ...)?

> +    }
> +
> +    hvm_inject_hw_exception(TRAP_gp_fault, 0);
> +    return 0;
> +}
> +
> +static void pvh_user_cpuid(struct cpu_user_regs *regs)
> +{
> +    unsigned int eax, ebx, ecx, edx;
> +
> +    asm volatile ( "cpuid"
> +              : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
> +              : "0" (regs->eax), "2" (regs->rcx) );
> +

Could you use 'cpuid' macro defined in processor.h?

> +    regs->rax = eax; regs->rbx = ebx; regs->rcx = ecx; regs->rdx = edx;

That would make this:

        cpuid(regs->eax, &regs->eax, &regs->ebx, &regs->ecx, &regs->edx) ?


Or is that not an option since you are re-using the eax register? If so, could
you do:

        unsigned int op = regs->eax;
        cpuid(op, &regs->eax, &regs->ebx, &regs->ecx, &regs->edx) ?

> +}
> +
> +/* 
> + * Main exit handler for PVH case. Called from vmx_vmexit_handler(). 
> + * Note: in vmx_asm_vmexit_handler, rip/rsp/eflags are updated in regs{} 
> + */
> +void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs)
> +{
> +    unsigned long exit_qualification;
> +    unsigned int exit_reason = __vmread(VM_EXIT_REASON);
> +    int rc=0, ccpu = smp_processor_id();
> +    struct vcpu *vp = current;
> +
> +    dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx 
> CR0:%lx\n", 
> +          ccpu, exit_reason, regs->rip, regs->rsp, regs->rflags, 
> +          __vmread(GUEST_CR0));
> +
> +    /* for guest_kernel_mode() */
> +    regs->cs = __vmread(GUEST_CS_SELECTOR); 
> +
> +    switch ( (uint16_t)exit_reason )

Huh? Why the 'uint16_t'? Ah, b/c vmx_vmexit_handler does it too. I wonder why?

> +    {
> +        case EXIT_REASON_EXCEPTION_NMI:      /* 0 */
> +            rc = vmxit_exception(regs);
> +            break;
> +            
> +        case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */
> +        case EXIT_REASON_MCE_DURING_VMENTRY: /* 41 */
> +            break;              /* handled in vmx_vmexit_handler() */
> +
> +        case EXIT_REASON_TRIPLE_FAULT:       /* 2 */
> +        {
> +            printk("PVH:Triple Flt:[%d] RIP:%lx RSP:%lx EFLAGS:%lx 
> CR3:%lx\n",
> +                   ccpu, regs->rip, regs->rsp, regs->rflags, 
> +                   __vmread(GUEST_CR3));
> +
> +            rc = 1;
> +            break;
> +        }
> +        case EXIT_REASON_PENDING_VIRT_INTR:  /* 7 */
> +        {
> +            struct vcpu *v = current;
> +
> +            /* Disable the interrupt window. */
> +            v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +            __vmwrite(CPU_BASED_VM_EXEC_CONTROL, 
> v->arch.hvm_vmx.exec_control);
> +            break;
> +        }
> +
> +        case EXIT_REASON_CPUID:              /* 10 */
> +        {
> +            if ( guest_kernel_mode(vp, regs) ) {
> +                pv_cpuid(regs);
> +            } else
> +                pvh_user_cpuid(regs);
> +
> +            vmx_update_guest_eip();
> +            dbgp2("cpuid:%ld RIP:%lx\n", regs->eax, regs->rip);
> +            break;
> +        }
> +
> +        case EXIT_REASON_HLT:             /* 12 */
> +        {
> +            vmx_update_guest_eip();
> +            hvm_hlt(regs->eflags);
> +            break;
> +        }
> +
> +        case EXIT_REASON_INVLPG:             /* 14 */
> +            rc = vmxit_invlpg();
> +            break;
> +
> +        case EXIT_REASON_RDTSC:              /* 16 */
> +            rc = 1;
> +            break;
> +
> +        case EXIT_REASON_VMCALL:             /* 18 */
> +            rc = vmxit_vmcall(regs);
> +            break;
> +
> +        case EXIT_REASON_CR_ACCESS:          /* 28 */
> +            rc = vmxit_cr_access(regs);
> +            break;
> +
> +        case EXIT_REASON_DR_ACCESS:          /* 29 */
> +        {
> +            exit_qualification = __vmread(EXIT_QUALIFICATION);
> +            vmx_dr_access(exit_qualification, regs);
> +            break;
> +        }
> +
> +        case EXIT_REASON_IO_INSTRUCTION:
> +            vmxit_io_instr(regs);
> +            break;
> +
> +        case EXIT_REASON_MSR_READ:           /* 31 */
> +            rc = vmxit_msr_read(regs);
> +            break;
> +
> +        case EXIT_REASON_MSR_WRITE:          /* 32 */
> +            rc = vmxit_msr_write(regs);
> +            break;
> +
> +        case EXIT_REASON_MONITOR_TRAP_FLAG:  /* 37 */
> +            rc = vmxit_mtf(regs);
> +            break;
> +
> +        case EXIT_REASON_EPT_VIOLATION:
> +        {
> +            paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
> +            exit_qualification = __vmread(EXIT_QUALIFICATION);
> +            rc = pvh_ept_handle_violation(exit_qualification, gpa, regs);
> +            break;
> +        }
> +        default: 
> +            rc = 1;

> +            printk("PVH: Unexpected exit reason:%d 0x%x\n", exit_reason, 
> +                   exit_reason);
> +    }
> +    if (rc) {

Odd syntax.

> +        exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        printk("PVH: [%d] exit_reas:%d 0x%x qual:%ld 0x%lx cr0:0x%016lx\n", 
> +             ccpu, exit_reason, exit_reason, exit_qualification,
> +             exit_qualification, __vmread(GUEST_CR0));
> +        printk("PVH: [%d] RIP:%lx RSP:%lx\n", ccpu, regs->rip, regs->rsp);
> +        domain_crash_synchronous();
> +    }
> +}
> +
> +/* 
> + * Sets info for non boot vcpu. VCPU 0 context is set by library.
> + * We use this for nonboot vcpu in which case the call comes from the 
> + * kernel cpu_initialize_context().
> + */
> +int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp)
> +{
> +    if (v->vcpu_id == 0)
> +        return 0;
> +
> +    vmx_vmcs_enter(v);
> +    __vmwrite(GUEST_GDTR_BASE, ctxtp->u.pvh.gdtaddr);
> +    __vmwrite(GUEST_GDTR_LIMIT, ctxtp->u.pvh.gdtsz);
> +    __vmwrite(GUEST_GS_BASE, ctxtp->gs_base_user);
> +
> +    __vmwrite(GUEST_CS_SELECTOR, ctxtp->user_regs.cs);
> +    __vmwrite(GUEST_DS_SELECTOR, ctxtp->user_regs.ds);
> +    __vmwrite(GUEST_ES_SELECTOR, ctxtp->user_regs.es);
> +    __vmwrite(GUEST_SS_SELECTOR, ctxtp->user_regs.ss);
> +    __vmwrite(GUEST_GS_SELECTOR, ctxtp->user_regs.gs);
> +
> +    if ( vmx_add_guest_msr(MSR_SHADOW_GS_BASE) )
> +        return -EINVAL;
> +
> +    vmx_write_guest_msr(MSR_SHADOW_GS_BASE, ctxtp->gs_base_kernel);
> +
> +    vmx_vmcs_exit(v);
> +    return 0;
> +}
> +
> +int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
> +                            const struct cpu_user_regs *regs,
> +                            unsigned long *base, unsigned long *limit,
> +                            unsigned int *ar)
> +{

How come you don't want to follow the signature of the existing read_descriptor?
It has far fewer arguments?

> +    unsigned int tmp_ar = 0;
> +    BUG_ON(v!=current);
> +    BUG_ON(!is_pvh_vcpu(v));
> +
> +    if (sel == (unsigned int)regs->cs) {
> +        *base = __vmread(GUEST_CS_BASE);
> +        *limit = __vmread(GUEST_CS_LIMIT);
> +        tmp_ar = __vmread(GUEST_CS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->ds) {
> +        *base = __vmread(GUEST_DS_BASE);
> +        *limit = __vmread(GUEST_DS_LIMIT);
> +        tmp_ar = __vmread(GUEST_DS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->ss) {
> +        *base = __vmread(GUEST_SS_BASE);
> +        *limit = __vmread(GUEST_SS_LIMIT);
> +        tmp_ar = __vmread(GUEST_SS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->gs) {
> +        *base = __vmread(GUEST_GS_BASE);
> +        *limit = __vmread(GUEST_GS_LIMIT);
> +        tmp_ar = __vmread(GUEST_GS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->fs) {
> +        *base = __vmread(GUEST_FS_BASE);
> +        *limit = __vmread(GUEST_FS_LIMIT);
> +        tmp_ar = __vmread(GUEST_FS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->es) {
> +        *base = __vmread(GUEST_ES_BASE);
> +        *limit = __vmread(GUEST_ES_LIMIT);
> +        tmp_ar = __vmread(GUEST_ES_AR_BYTES); 
> +    } else {
> +        printk("Unmatched segment selector:%d\n", sel);
> +        return 0;
> +    }
> +
> +    if (tmp_ar & X86_SEG_AR_CS_LM_ACTIVE) {           /* x86 mess!! */
> +        *base = 0UL;
> +        *limit = ~0UL;
> +    }
> +    /* Fixup ar so that it looks the same as in native mode */
> +    *ar = (tmp_ar << 8);

I am not sure I follow. Are you doing this to fit in the other bits
of the segment (the upper limit)? Shouldn't the caller of 
vmx_pvh_read_descriptor do this?

> +    return 1;
> +}
> +
> diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h 
> b/xen/include/asm-x86/hvm/vmx/vmx.h
> index a742e16..5679e8d 100644
> --- a/xen/include/asm-x86/hvm/vmx/vmx.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmx.h
> @@ -189,6 +189,7 @@ void vmx_update_secondary_exec_control(struct vcpu *v);
>   * Access Rights
>   */
>  #define X86_SEG_AR_SEG_TYPE     0xf        /* 3:0, segment type */
> +#define X86_SEG_AR_SEG_TYPE_CODE (1u << 3) /* code (vs data) segment */
>  #define X86_SEG_AR_DESC_TYPE    (1u << 4)  /* 4, descriptor type */
>  #define X86_SEG_AR_DPL          0x60       /* 6:5, descriptor privilege 
> level */
>  #define X86_SEG_AR_SEG_PRESENT  (1u << 7)  /* 7, segment present */
> @@ -442,10 +443,14 @@ void ept_p2m_uninit(struct p2m_domain *p2m);
>  
>  void ept_walk_table(struct domain *d, unsigned long gfn);
>  void setup_ept_dump(void);
> -

Stray change.
>  void vmx_update_guest_eip(void);
>  void vmx_dr_access(unsigned long exit_qualification,struct cpu_user_regs 
> *regs);
>  void vmx_do_extint(struct cpu_user_regs *regs);
> +void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs);
> +int  vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp);
> +int  vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
> +                         const struct cpu_user_regs *regs, unsigned long 
> *base,
> +                         unsigned long *limit, unsigned int *ar);
>  
>  int alloc_p2m_hap_data(struct p2m_domain *p2m);
>  void free_p2m_hap_data(struct p2m_domain *p2m);
> diff --git a/xen/include/asm-x86/pvh.h b/xen/include/asm-x86/pvh.h
> new file mode 100644
> index 0000000..73e59d3
> --- /dev/null
> +++ b/xen/include/asm-x86/pvh.h
> @@ -0,0 +1,6 @@
> +#ifndef __ASM_X86_PVH_H__
> +#define __ASM_X86_PVH_H__
> +
> +int pvh_do_hypercall(struct cpu_user_regs *regs);
> +
> +#endif  /* __ASM_X86_PVH_H__ */
> -- 
> 1.7.2.3
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxx
> http://lists.xen.org/xen-devel
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.