ChangeSet 1.1345, 2005/04/20 21:35:17+01:00, leendert@xxxxxxxxxxxxxx
[PATCH] VMX world switch
The attached code implements a VMX world switch to vmxassist (a small
assist module residing in a VMX-enabled partition, where it is
responsible for emulating real mode) whenever CR0.PE is cleared.
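As a rough sketch of the handoff (the CR0-intercept handler below is
illustrative only; of the names used, only vmx_assist(), VMX_ASSIST_INVOKE
and domain_crash_synchronous() come from this patch):

    /* Illustrative sketch, not part of the patch: a CR0 write
     * intercept hands control to vmxassist when the guest clears PE. */
    static void handle_cr0_write(struct exec_domain *d, unsigned long value)
    {
        if (!(value & X86_CR0_PE)) {
            /* Guest is dropping to real mode; let vmxassist emulate it. */
            if (!vmx_assist(d, VMX_ASSIST_INVOKE))
                domain_crash_synchronous();
        }
    }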
The patch temporarily clears the PGE feature flag in CPUID, as PGE
support is currently broken (try running an unmodified 2.6 kernel,
which sets PGE in mm/init.c:paging_init()).
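The guest-visible effect is easy to check (a sketch assuming the
standard CPUID leaf-1 layout, where PGE is EDX bit 13):

    /* With the clear_bit() in the hunk below, CPUID leaf 1 no longer
     * advertises PGE, so a well-behaved kernel won't set CR4.PGE. */
    unsigned int eax, ebx, ecx, edx;
    cpuid(1, &eax, &ebx, &ecx, &edx);
    ASSERT(!(edx & (1 << 13)));    /* X86_FEATURE_PGE masked off */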
The patch also adds consistency checks before setting the
ARCH_VMX_IO_WAIT state, to detect race conditions on SMP systems.
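That hunk falls outside the excerpt below, but the guard is the usual
check-before-set pattern; a hedged sketch (the flags field name here is
an assumption, not taken from this patch):

    /* Illustrative only: refuse to enter IO_WAIT while an I/O request
     * is already pending, which would indicate an SMP race. */
    if (test_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags)) {
        printf("VMX: I/O already pending, inconsistent state\n");
        domain_crash_synchronous();
    }
    set_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags);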
Signed-off-by: Leendert van Doorn <leendert@xxxxxxxxxxxxxx>
Signed-off-by: ian@xxxxxxxxxxxxx
arch/x86/vmx.c | 281 ++++++++++++++++++++++++++++++++++++++++++--
arch/x86/vmx_platform.c | 5
include/asm-x86/vmx_vmcs.h | 13 --
include/public/vmx_assist.h | 101 +++++++++++++++
4 files changed, 382 insertions(+), 18 deletions(-)
diff -Nru a/xen/arch/x86/vmx.c b/xen/arch/x86/vmx.c
--- a/xen/arch/x86/vmx.c 2005-04-20 17:02:40 -04:00
+++ b/xen/arch/x86/vmx.c 2005-04-20 17:02:40 -04:00
@@ -195,6 +195,7 @@
     cpuid(input, &eax, &ebx, &ecx, &edx);
     if (input == 1) {
+        clear_bit(X86_FEATURE_PGE, &edx); /* temporarily disabled */
         clear_bit(X86_FEATURE_PSE, &edx);
         clear_bit(X86_FEATURE_PAE, &edx);
         clear_bit(X86_FEATURE_PSE36, &edx);
@@ -382,10 +383,261 @@
     do_block();
 }
-static int
-vm86assist(struct exec_domain *d)
+enum { COPY_IN = 0, COPY_OUT };
+
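+/*
+ * Copy 'size' bytes between hypervisor buffer 'buf' and guest linear
+ * address 'laddr'. COPY_IN reads from the guest, COPY_OUT writes to it.
+ * The range must not cross a page boundary. Returns 1 on success.
+ */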
+static inline int
+vmx_copy(void *buf, unsigned long laddr, int size, int dir)
+{
+    unsigned char *addr;
+    unsigned long mfn;
+
+    if ((size + (laddr & (PAGE_SIZE - 1))) >= PAGE_SIZE) {
+        printf("vmx_copy exceeds page boundary\n");
+        return 0;
+    }
+
+    mfn = phys_to_machine_mapping(gva_to_gpte(laddr) >> PAGE_SHIFT);
+    addr = map_domain_mem((mfn << PAGE_SHIFT) | (laddr & ~PAGE_MASK));
+
+    if (dir == COPY_IN)
+        memcpy(buf, addr, size);
+    else
+        memcpy(addr, buf, size);
+
+    unmap_domain_mem(addr);
+    return 1;
+}
+
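+/*
+ * Save the guest world (registers, control registers, descriptor tables
+ * and segment state) from the VMCS into *c. Returns 1 on success.
+ */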
+int
+vmx_world_save(struct exec_domain *d, struct vmx_assist_context *c)
+{
+    unsigned long inst_len;
+    int error = 0;
+
+    error |= __vmread(INSTRUCTION_LEN, &inst_len);
+    error |= __vmread(GUEST_EIP, &c->eip);
+    c->eip += inst_len; /* skip transition instruction */
+    error |= __vmread(GUEST_ESP, &c->esp);
+    error |= __vmread(GUEST_EFLAGS, &c->eflags);
+
+    error |= __vmread(CR0_READ_SHADOW, &c->cr0);
+    c->cr3 = d->arch.arch_vmx.cpu_cr3;
+    error |= __vmread(CR4_READ_SHADOW, &c->cr4);
+
+    error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
+    error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
+
+    error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
+    error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
+
+    error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
+    error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
+    error |= __vmread(GUEST_CS_BASE, &c->cs_base);
+    error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
+
+    error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
+    error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
+    error |= __vmread(GUEST_DS_BASE, &c->ds_base);
+    error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
+
+    error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
+    error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
+    error |= __vmread(GUEST_ES_BASE, &c->es_base);
+    error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
+
+    error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
+    error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
+    error |= __vmread(GUEST_SS_BASE, &c->ss_base);
+    error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
+
+    error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
+    error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
+    error |= __vmread(GUEST_FS_BASE, &c->fs_base);
+    error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
+
+    error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
+    error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
+    error |= __vmread(GUEST_GS_BASE, &c->gs_base);
+    error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
+
+    error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
+    error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
+    error |= __vmread(GUEST_TR_BASE, &c->tr_base);
+    error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
+
+    error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
+    error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
+    error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
+    error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
+
+    return !error;
+}
+
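+/*
+ * Load the guest world from *c back into the VMCS, flushing or
+ * rebuilding the shadow page tables as the new CR3 requires.
+ * Returns 1 on success.
+ */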
+int
+vmx_world_restore(struct exec_domain *d, struct vmx_assist_context *c)
+{
+    unsigned long mfn, old_cr4;
+    int error = 0;
+
+    error |= __vmwrite(GUEST_EIP, c->eip);
+    error |= __vmwrite(GUEST_ESP, c->esp);
+    error |= __vmwrite(GUEST_EFLAGS, c->eflags);
+
+    error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
+
+    if (c->cr3 == d->arch.arch_vmx.cpu_cr3) {
+        /*
+         * This is a simple TLB flush, implying the guest has
+         * removed some translation or changed page attributes.
+         * We simply invalidate the shadow.
+         */
+        mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
+        if ((mfn << PAGE_SHIFT) != pagetable_val(d->arch.guest_table)) {
+            VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value=%lx", c->cr3);
+            domain_crash_synchronous();
+            return 0;
+        }
+        shadow_sync_all(d->domain);
+    } else {
+        /*
+         * If different, make a shadow. Check if the PDBR is valid
+         * first.
+         */
+        VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %lx", c->cr3);
+        if ((c->cr3 >> PAGE_SHIFT) > d->domain->max_pages) {
+            VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value=%lx", c->cr3);
+            domain_crash_synchronous();
+            return 0;
+        }
+        mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
+        d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+        update_pagetables(d);
+        /*
+         * arch.shadow_table should now hold the next CR3 for shadow
+         */
+        d->arch.arch_vmx.cpu_cr3 = c->cr3;
+        VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", c->cr3);
+        __vmwrite(GUEST_CR3, pagetable_val(d->arch.shadow_table));
+    }
+
+    error |= __vmread(CR4_READ_SHADOW, &old_cr4);
+    error |= __vmwrite(GUEST_CR4, (c->cr4 | X86_CR4_VMXE));
+    error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
+
+    error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
+    error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
+
+    error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
+    error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
+
+    error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
+    error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
+    error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
+    error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
+    error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
+    error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
+    error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
+    error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
+    error |= __vmwrite(GUEST_ES_BASE, c->es_base);
+    error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
+    error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
+    error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
+    error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
+    error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
+    error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
+    error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
+    error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
+    error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
+    error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
+    error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
+    error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
+    error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
+    error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
+    error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
+    error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
+
+    return !error;
+}
+
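+/*
+ * vmx_assist() modes: VMX_ASSIST_INVOKE transfers control to vmxassist,
+ * VMX_ASSIST_RESTORE resumes the context saved in VMXASSIST_OLD_CONTEXT.
+ */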
+enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
+
+int
+vmx_assist(struct exec_domain *d, int mode)
{
-    /* stay tuned ... */
+    struct vmx_assist_context c;
+    unsigned long magic, cp;
+
+    /* make sure vmxassist exists (its absence is not an error) */
+    if (!vmx_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), COPY_IN))
+        return 0;
+    if (magic != VMXASSIST_MAGIC)
+        return 0;
+
+    switch (mode) {
+    /*
+     * Transfer control to vmxassist.
+     * Store the current context in VMXASSIST_OLD_CONTEXT and load
+     * the new VMXASSIST_NEW_CONTEXT context. This context was created
+     * by vmxassist and will transfer control to it.
+     */
+    case VMX_ASSIST_INVOKE:
+        /* save the old context */
+        if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN))
+            goto error;
+        if (cp != 0) {
+            if (!vmx_world_save(d, &c))
+                goto error;
+            if (!vmx_copy(&c, cp, sizeof(c), COPY_OUT))
+                goto error;
+        }
+
+        /* restore the new context; this should activate vmxassist */
+        if (!vmx_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), COPY_IN))
+            goto error;
+        if (cp != 0) {
+            if (!vmx_copy(&c, cp, sizeof(c), COPY_IN))
+                goto error;
+            if (!vmx_world_restore(d, &c))
+                goto error;
+            return 1;
+        }