[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 19/23] PVH xen: vmcs related changes



This patch contains vmcs changes related for PVH, mainly creating a VMCS
for PVH guest.

Signed-off-by: Mukesh Rathor <mukesh.rathor@xxxxxxxxxx>
---
 xen/arch/x86/hvm/vmx/vmcs.c |  247 ++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 245 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 36f167f..8d35370 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -634,7 +634,7 @@ void vmx_vmcs_exit(struct vcpu *v)
     {
         /* Don't confuse vmx_do_resume (for @v or @current!) */
         vmx_clear_vmcs(v);
-        if ( is_hvm_vcpu(current) )
+        if ( !is_pv_vcpu(current) )
             vmx_load_vmcs(current);
 
         spin_unlock(&v->arch.hvm_vmx.vmcs_lock);
@@ -856,6 +856,239 @@ static void vmx_set_common_host_vmcs_fields(struct vcpu 
*v)
     __vmwrite(HOST_SYSENTER_EIP, sysenter_eip);
 }
 
+static int pvh_check_requirements(struct vcpu *v)
+{
+    u64 required, tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
+
+    if ( !paging_mode_hap(v->domain) )
+    {
+        printk(XENLOG_G_INFO "HAP is required for PVH guest.\n");
+        return -EINVAL;
+    }
+    if ( !cpu_has_vmx_pat )
+    {
+        printk(XENLOG_G_INFO "PVH: CPU does not have PAT support\n");
+        return -ENOSYS;
+    }
+    if ( !cpu_has_vmx_msr_bitmap )
+    {
+        printk(XENLOG_G_INFO "PVH: CPU does not have msr bitmap\n");
+        return -ENOSYS;
+    }
+    if ( !cpu_has_vmx_vpid )
+    {
+        printk(XENLOG_G_INFO "PVH: CPU doesn't have VPID support\n");
+        return -ENOSYS;
+    }
+    if ( !cpu_has_vmx_secondary_exec_control )
+    {
+        printk(XENLOG_G_INFO "CPU Secondary exec is required to run PVH\n");
+        return -ENOSYS;
+    }
+
+    if ( v->domain->arch.vtsc )
+    {
+        printk(XENLOG_G_INFO
+               "At present PVH only supports the default timer mode\n");
+        return -ENOSYS;
+    }
+
+    required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR;
+    if ( (tmpval & required) != required )
+    {
+        printk(XENLOG_G_INFO "PVH: required CR4 features not available:%lx\n",
+               required);
+        return -ENOSYS;
+    }
+
+    return 0;
+}
+
+static int pvh_construct_vmcs(struct vcpu *v)
+{
+    int rc, msr_type;
+    unsigned long *msr_bitmap;
+    struct domain *d = v->domain;
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    struct ept_data *ept = &p2m->ept;
+    u32 vmexit_ctl = vmx_vmexit_control;
+    u32 vmentry_ctl = vmx_vmentry_control;
+    u64 host_pat, tmpval = -1;
+
+    if ( (rc = pvh_check_requirements(v)) )
+        return rc;
+
+    msr_bitmap = alloc_xenheap_page();
+    if ( msr_bitmap == NULL )
+        return -ENOMEM;
+
+    /* 1. Pin-Based Controls: */
+    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
+
+    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
+
+    /* 2. Primary Processor-based controls: */
+    /*
+     * If rdtsc exiting is turned on and it goes thru emulate_privileged_op,
+     * then pv_vcpu.ctrlreg must be added to the pvh struct.
+     */
+    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
+    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
+
+    v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
+                                      CPU_BASED_CR3_LOAD_EXITING |
+                                      CPU_BASED_CR3_STORE_EXITING);
+    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
+    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_MSR_BITMAP;
+    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW;
+    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+
+    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+
+    /* 3. Secondary Processor-based controls (Intel SDM: resvd bits are 0): */
+    v->arch.hvm_vmx.secondary_exec_control = SECONDARY_EXEC_ENABLE_EPT;
+    v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID;
+    v->arch.hvm_vmx.secondary_exec_control |= 
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+
+    __vmwrite(SECONDARY_VM_EXEC_CONTROL,
+              v->arch.hvm_vmx.secondary_exec_control);
+
+    __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0));
+    __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE));
+
+    /* MSR bitmap for intercepts. */
+    memset(msr_bitmap, ~0, PAGE_SIZE);
+    v->arch.hvm_vmx.msr_bitmap = msr_bitmap;
+    __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap));
+
+    msr_type = MSR_TYPE_R | MSR_TYPE_W;
+    /* Disable interecepts for MSRs that have corresponding VMCS fields. */
+    vmx_disable_intercept_for_msr(v, MSR_FS_BASE, msr_type);
+    vmx_disable_intercept_for_msr(v, MSR_GS_BASE, msr_type);
+    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type);
+    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type);
+    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type);
+    vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type);
+    vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT, msr_type);
+
+    /*
+     * We don't disable intercepts for MSRs: MSR_STAR, MSR_LSTAR, MSR_CSTAR,
+     * and MSR_SYSCALL_MASK because we need to specify save/restore area to
+     * save/restore at every VM exit and entry. Instead, let the intercept
+     * functions save them into vmx_msr_state fields. See comment in
+     * vmx_restore_host_msrs(). See also vmx_restore_guest_msrs().
+     */
+    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
+    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
+    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
+
+    __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
+
+    /*
+     * Note: we run with default VM_ENTRY_LOAD_DEBUG_CTLS of 1, which means
+     * upon vmentry, the cpu reads/loads VMCS.DR7 and VMCS.DEBUGCTLS, and not
+     * use the host values. 0 would cause it to not use the VMCS values.
+     */
+    vmentry_ctl &= ~VM_ENTRY_LOAD_GUEST_EFER;
+    vmentry_ctl &= ~VM_ENTRY_SMM;
+    vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR;
+    /* PVH 32bitfixme. */
+    vmentry_ctl |= VM_ENTRY_IA32E_MODE;       /* GUEST_EFER.LME/LMA ignored */
+
+    __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl);
+
+    vmx_set_common_host_vmcs_fields(v);
+
+    __vmwrite(VM_ENTRY_INTR_INFO, 0);
+    __vmwrite(CR3_TARGET_COUNT, 0);
+    __vmwrite(GUEST_ACTIVITY_STATE, 0);
+
+    /* These are sorta irrelevant as we load the discriptors directly. */
+    __vmwrite(GUEST_CS_SELECTOR, 0);
+    __vmwrite(GUEST_DS_SELECTOR, 0);
+    __vmwrite(GUEST_SS_SELECTOR, 0);
+    __vmwrite(GUEST_ES_SELECTOR, 0);
+    __vmwrite(GUEST_FS_SELECTOR, 0);
+    __vmwrite(GUEST_GS_SELECTOR, 0);
+
+    __vmwrite(GUEST_CS_BASE, 0);
+    __vmwrite(GUEST_CS_LIMIT, ~0u);
+    /* CS.L == 1, exec, read/write, accessed. PVH 32bitfixme. */
+    __vmwrite(GUEST_CS_AR_BYTES, 0xa09b);
+
+    __vmwrite(GUEST_DS_BASE, 0);
+    __vmwrite(GUEST_DS_LIMIT, ~0u);
+    __vmwrite(GUEST_DS_AR_BYTES, 0xc093); /* read/write, accessed */
+
+    __vmwrite(GUEST_SS_BASE, 0);
+    __vmwrite(GUEST_SS_LIMIT, ~0u);
+    __vmwrite(GUEST_SS_AR_BYTES, 0xc093); /* read/write, accessed */
+
+    __vmwrite(GUEST_ES_BASE, 0);
+    __vmwrite(GUEST_ES_LIMIT, ~0u);
+    __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */
+
+    __vmwrite(GUEST_FS_BASE, 0);
+    __vmwrite(GUEST_FS_LIMIT, ~0u);
+    __vmwrite(GUEST_FS_AR_BYTES, 0xc093); /* read/write, accessed */
+
+    __vmwrite(GUEST_GS_BASE, 0);
+    __vmwrite(GUEST_GS_LIMIT, ~0u);
+    __vmwrite(GUEST_GS_AR_BYTES, 0xc093); /* read/write, accessed */
+
+    __vmwrite(GUEST_GDTR_BASE, 0);
+    __vmwrite(GUEST_GDTR_LIMIT, 0);
+
+    __vmwrite(GUEST_LDTR_BASE, 0);
+    __vmwrite(GUEST_LDTR_LIMIT, 0);
+    __vmwrite(GUEST_LDTR_AR_BYTES, 0x82); /* LDT */
+    __vmwrite(GUEST_LDTR_SELECTOR, 0);
+
+    /* Guest TSS. */
+    __vmwrite(GUEST_TR_BASE, 0);
+    __vmwrite(GUEST_TR_LIMIT, 0xff);
+    __vmwrite(GUEST_TR_AR_BYTES, 0x8b); /* 32-bit TSS (busy) */
+
+    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
+    __vmwrite(GUEST_DR7, 0);
+    __vmwrite(VMCS_LINK_POINTER, ~0UL);
+
+    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
+    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+
+    v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK | (1U << TRAP_debug) |
+                                   (1U << TRAP_int3) | (1U << TRAP_no_device);
+    __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap);
+
+    /* Set WP bit so rdonly pages are not written from CPL 0. */
+    tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP;
+    __vmwrite(GUEST_CR0, tmpval);
+    __vmwrite(CR0_READ_SHADOW, tmpval);
+    v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval;
+
+    tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
+    __vmwrite(GUEST_CR4, tmpval);
+    __vmwrite(CR4_READ_SHADOW, tmpval);
+    v->arch.hvm_vcpu.guest_cr[4] = tmpval;
+
+    __vmwrite(CR0_GUEST_HOST_MASK, ~0UL);
+    __vmwrite(CR4_GUEST_HOST_MASK, ~0UL);
+
+     v->arch.hvm_vmx.vmx_realmode = 0;
+
+    ept->asr  = pagetable_get_pfn(p2m_get_pagetable(p2m));
+    __vmwrite(EPT_POINTER, ept_get_eptp(ept));
+
+    rdmsrl(MSR_IA32_CR_PAT, host_pat);
+    __vmwrite(HOST_PAT, host_pat);
+    __vmwrite(GUEST_PAT, MSR_IA32_CR_PAT_RESET);
+
+    /* The paging mode is updated for PVH by arch_set_info_guest(). */
+
+    return 0;
+}
+
 static int construct_vmcs(struct vcpu *v)
 {
     struct domain *d = v->domain;
@@ -864,6 +1097,13 @@ static int construct_vmcs(struct vcpu *v)
 
     vmx_vmcs_enter(v);
 
+    if ( is_pvh_vcpu(v) )
+    {
+        int rc = pvh_construct_vmcs(v);
+        vmx_vmcs_exit(v);
+        return rc;
+    }
+
     /* VMCS controls. */
     __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
 
@@ -1294,6 +1534,9 @@ void vmx_do_resume(struct vcpu *v)
         hvm_asid_flush_vcpu(v);
     }
 
+    if ( is_pvh_vcpu(v) )
+        reset_stack_and_jump(vmx_asm_do_vmentry);
+
     debug_state = v->domain->debugger_attached
                   || 
v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_INT3]
                   || 
v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP];
@@ -1477,7 +1720,7 @@ static void vmcs_dump(unsigned char ch)
 
     for_each_domain ( d )
     {
-        if ( !is_hvm_domain(d) )
+        if ( is_pv_domain(d) )
             continue;
         printk("\n>>> Domain %d <<<\n", d->domain_id);
         for_each_vcpu ( d, v )
-- 
1.7.2.3


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.