# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1228840082 0
# Node ID 6595393a3d28a7bf95f02b198f52d754bcfa7a80
# Parent 5535efd8e01141f840f9a8cbc31a9b3a4c9d49e9
Use virtual 8086 mode for VMX guests with CR0.PE == 0
When a VMX guest tries to enter real mode, put it in virtual 8086 mode
instead, if that's possible. Handle all errors and corner cases by
falling back to the real-mode emulator.
This is similar to the old VMXASSIST system except it uses Xen's
x86_emulate emulator instead of having a partial emulator in the guest
firmware. It more than doubles the speed of real-mode operation on
VMX.
Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---
tools/firmware/hvmloader/hvmloader.c | 19 ++
tools/libxc/xc_domain_restore.c | 16 ++
tools/libxc/xc_domain_save.c | 26 ++-
xen/arch/x86/hvm/vmx/entry.S | 14 +
xen/arch/x86/hvm/vmx/realmode.c | 45 ++---
xen/arch/x86/hvm/vmx/vmcs.c | 51 ++++--
xen/arch/x86/hvm/vmx/vmx.c | 250 ++++++++++++++++++++++++++++-----
xen/arch/x86/x86_32/asm-offsets.c | 4
xen/arch/x86/x86_64/asm-offsets.c | 4
xen/arch/x86/x86_emulate/x86_emulate.h | 1
xen/include/asm-x86/hvm/vmx/vmcs.h | 13 +
xen/include/asm-x86/perfc_defn.h | 3
xen/include/public/hvm/params.h | 5
13 files changed, 356 insertions(+), 95 deletions(-)
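A reviewer's sketch of the vmentry-time decision this patch implements, before the diffs themselves. Illustrative C only: the helper name is hypothetical, but the three fields are exactly the ones added to struct arch_vmx_struct below.

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical helper mirroring the dispatch added to entry.S:
     * decide whether the next vmentry must go through x86_emulate. */
    static bool must_emulate(bool vmx_emulate, bool vmx_realmode,
                             uint16_t vm86_segment_mask)
    {
        if ( vmx_emulate )
            return true;    /* forced: e.g. pending I/O or bad selectors */
        if ( vmx_realmode && vm86_segment_mask != 0 )
            return true;    /* some segment can't be faked in vm86 */
        return false;       /* vm86 fast path, or not real mode at all */
    }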
diff -r 5535efd8e011 -r 6595393a3d28 tools/firmware/hvmloader/hvmloader.c
--- a/tools/firmware/hvmloader/hvmloader.c Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/firmware/hvmloader/hvmloader.c Tue Dec 09 16:28:02 2008 +0000
@@ -536,6 +536,23 @@ static uint16_t init_xen_platform_io_bas
return bios_info->xen_pfiob;
}
+/* Set up an empty TSS area for virtual 8086 mode to use.
+ * The only important thing is that it mustn't have any bits set
+ * in the interrupt redirection bitmap, so all zeros will do. */
+static void init_vm86_tss(void)
+{
+ uint32_t tss;
+ struct xen_hvm_param p;
+
+ tss = e820_malloc(128, 128);
+ memset((char *)tss, 0, 128);
+ p.domid = DOMID_SELF;
+ p.index = HVM_PARAM_VM86_TSS;
+ p.value = tss;
+ hypercall_hvm_op(HVMOP_set_param, &p);
+ printf("vm86 TSS at %08x\n", tss);
+}
+
int main(void)
{
int option_rom_sz = 0, vgabios_sz = 0, etherboot_sz = 0;
@@ -605,6 +622,8 @@ int main(void)
printf("Loading ACPI ...\n");
acpi_build_tables();
}
+
+ init_vm86_tss();
cmos_write_memory_size();
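Why an all-zero TSS is enough here: with CR4.VME set, a clear bit N in the TSS's software-interrupt redirection bitmap lets INT N go straight through the virtual-8086 IVT without faulting to the monitor. A minimal sketch of that bit test (the function is hypothetical, for illustration only):

    #include <stdint.h>

    /* With CR4.VME == 1, a clear bit N in the redirection bitmap means
     * INT N is delivered via the vm86 IVT, with no vmexit needed. */
    static int int_stays_in_vm86(const uint8_t *redir_bitmap, unsigned int n)
    {
        return !(redir_bitmap[n / 8] & (1u << (n % 8)));
    }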
diff -r 5535efd8e011 -r 6595393a3d28 tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/libxc/xc_domain_restore.c Tue Dec 09 16:28:02 2008 +0000
@@ -490,6 +490,22 @@ int xc_domain_restore(int xc_handle, int
continue;
}
+ if ( j == -4 )
+ {
+ uint64_t vm86_tss;
+
+ /* Skip the 4 padding bytes, then read the vm86 TSS location. */
+ if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
+ read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
+ {
+ ERROR("error reading the address of the vm86 TSS");
+ goto out;
+ }
+
+ xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
+ continue;
+ }
+
if ( j == 0 )
break; /* our work here is done */
diff -r 5535efd8e011 -r 6595393a3d28 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/libxc/xc_domain_save.c Tue Dec 09 16:28:02 2008 +0000
@@ -1388,18 +1388,30 @@ int xc_domain_save(int xc_handle, int io
if ( hvm )
{
struct {
- int minusthree;
+ int id;
uint32_t pad;
- uint64_t ident_pt;
- } chunk = { -3, 0 };
-
+ uint64_t data;
+ } chunk = { 0, };
+
+ chunk.id = -3;
xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
- (unsigned long *)&chunk.ident_pt);
-
- if ( (chunk.ident_pt != 0) &&
+ (unsigned long *)&chunk.data);
+
+ if ( (chunk.data != 0) &&
write_exact(io_fd, &chunk, sizeof(chunk)) )
{
PERROR("Error when writing the ident_pt for EPT guest");
+ goto out;
+ }
+
+ chunk.id = -4;
+ xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
+ (unsigned long *)&chunk.data);
+
+ if ( (chunk.data != 0) &&
+ write_exact(io_fd, &chunk, sizeof(chunk)) )
+ {
+ PERROR("Error when writing the vm86 TSS for guest");
goto out;
}
}
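The save-stream extension above is a small tagged record written where a page-batch count would normally appear. A sketch of the layout (the struct name is illustrative; the on-wire format is exactly what the code above writes):

    #include <stdint.h>

    /* Tagged trailing chunk in the HVM save stream: a negative id in
     * place of a batch count, 4 bytes of padding, then the value. */
    struct hvm_tail_chunk {
        int32_t  id;    /* -3: HVM_PARAM_IDENT_PT, -4: HVM_PARAM_VM86_TSS */
        uint32_t pad;   /* the restore side skips these 4 bytes */
        uint64_t data;  /* parameter value; chunk omitted when zero */
    };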
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/entry.S
--- a/xen/arch/x86/hvm/vmx/entry.S Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/entry.S Tue Dec 09 16:28:02 2008 +0000
@@ -133,9 +133,15 @@ vmx_asm_do_vmentry:
cmpl $0,(r(dx),r(ax),1)
jnz .Lvmx_process_softirqs
- testb $0xff,VCPU_vmx_emul(r(bx))
- jnz .Lvmx_goto_realmode
-
+ testb $0xff,VCPU_vmx_emulate(r(bx))
+ jnz .Lvmx_goto_emulator
+ testb $0xff,VCPU_vmx_realmode(r(bx))
+ jz .Lvmx_not_realmode
+ cmpw $0,VCPU_vm86_seg_mask(r(bx))
+ jnz .Lvmx_goto_emulator
+ call_with_regs(vmx_enter_realmode)
+
+.Lvmx_not_realmode:
mov VCPU_hvm_guest_cr2(r(bx)),r(ax)
mov r(ax),%cr2
call vmx_trace_vmentry
@@ -189,7 +195,7 @@ vmx_asm_do_vmentry:
call vm_launch_fail
ud2
-.Lvmx_goto_realmode:
+.Lvmx_goto_emulator:
sti
call_with_regs(vmx_realmode)
jmp vmx_asm_do_vmentry
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/realmode.c
--- a/xen/arch/x86/hvm/vmx/realmode.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/realmode.c Tue Dec 09 16:28:02 2008 +0000
@@ -103,30 +103,12 @@ static void realmode_emulate_one(struct
static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
{
struct vcpu *curr = current;
- unsigned long seg_reg_dirty;
uint32_t intr_info;
int rc;
- seg_reg_dirty = hvmemul_ctxt->seg_reg_dirty;
- hvmemul_ctxt->seg_reg_dirty = 0;
+ perfc_incr(realmode_emulations);
rc = hvm_emulate_one(hvmemul_ctxt);
-
- if ( test_bit(x86_seg_cs, &hvmemul_ctxt->seg_reg_dirty) )
- {
- curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_CS;
- if ( hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt)->sel & 3 )
- curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_CS;
- }
-
- if ( test_bit(x86_seg_ss, &hvmemul_ctxt->seg_reg_dirty) )
- {
- curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_SS;
- if ( hvmemul_get_seg_reg(x86_seg_ss, hvmemul_ctxt)->sel & 3 )
- curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_SS;
- }
-
- hvmemul_ctxt->seg_reg_dirty |= seg_reg_dirty;
if ( rc == X86EMUL_UNHANDLEABLE )
{
@@ -210,7 +192,8 @@ void vmx_realmode(struct cpu_user_regs *
intr_info = 0;
}
- while ( curr->arch.hvm_vmx.vmxemul &&
+ curr->arch.hvm_vmx.vmx_emulate = 1;
+ while ( curr->arch.hvm_vmx.vmx_emulate &&
!softirq_pending(smp_processor_id()) &&
(curr->arch.hvm_vcpu.io_state == HVMIO_none) )
{
@@ -220,13 +203,27 @@ void vmx_realmode(struct cpu_user_regs *
* in real mode, because we don't emulate protected-mode IDT vectoring.
*/
if ( unlikely(!(++emulations & 15)) &&
- !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) &&
+ curr->arch.hvm_vmx.vmx_realmode &&
hvm_local_events_need_delivery(curr) )
break;
+
realmode_emulate_one(&hvmemul_ctxt);
- }
-
- if ( !curr->arch.hvm_vmx.vmxemul )
+
+ /* Keep emulating only while the segment state is unsafe */
+ if ( curr->arch.hvm_vmx.vmx_realmode )
+ curr->arch.hvm_vmx.vmx_emulate =
+ (curr->arch.hvm_vmx.vm86_segment_mask != 0);
+ else
+ curr->arch.hvm_vmx.vmx_emulate =
+ ((hvmemul_ctxt.seg_reg[x86_seg_cs].sel & 3)
+ || (hvmemul_ctxt.seg_reg[x86_seg_ss].sel & 3));
+ }
+
+ /* Need to emulate next time if we've started an IO operation */
+ if ( curr->arch.hvm_vcpu.io_state != HVMIO_none )
+ curr->arch.hvm_vmx.vmx_emulate = 1;
+
+ if ( !curr->arch.hvm_vmx.vmx_emulate && !curr->arch.hvm_vmx.vmx_realmode )
{
/*
* Cannot enter protected mode with bogus selector RPLs and DPLs.
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/vmcs.c Tue Dec 09 16:28:02 2008 +0000
@@ -880,21 +880,34 @@ void vmx_do_resume(struct vcpu *v)
reset_stack_and_jump(vmx_asm_do_vmentry);
}
-static void vmx_dump_sel(char *name, enum x86_segment seg)
-{
- struct segment_register sreg;
- hvm_get_segment_register(current, seg, &sreg);
- printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016llx\n",
- name, sreg.sel, sreg.attr.bytes, sreg.limit,
- (unsigned long long)sreg.base);
-}
-
static unsigned long vmr(unsigned long field)
{
int rc;
unsigned long val;
val = __vmread_safe(field, &rc);
return rc ? 0 : val;
+}
+
+static void vmx_dump_sel(char *name, uint32_t selector)
+{
+ uint32_t sel, attr, limit;
+ uint64_t base;
+ sel = vmr(selector);
+ attr = vmr(selector + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR));
+ limit = vmr(selector + (GUEST_ES_LIMIT - GUEST_ES_SELECTOR));
+ base = vmr(selector + (GUEST_ES_BASE - GUEST_ES_SELECTOR));
+ printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016"PRIx64"\n",
+ name, sel, attr, limit, base);
+}
+
+static void vmx_dump_sel2(char *name, uint32_t lim)
+{
+ uint32_t limit;
+ uint64_t base;
+ limit = vmr(lim);
+ base = vmr(lim + (GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+ printk("%s: limit=0x%08x, base=0x%016"PRIx64"\n",
+ name, limit, base);
}
void vmcs_dump_vcpu(struct vcpu *v)
@@ -938,16 +951,16 @@ void vmcs_dump_vcpu(struct vcpu *v)
(unsigned long long)vmr(GUEST_SYSENTER_ESP),
(int)vmr(GUEST_SYSENTER_CS),
(unsigned long long)vmr(GUEST_SYSENTER_EIP));
- vmx_dump_sel("CS", x86_seg_cs);
- vmx_dump_sel("DS", x86_seg_ds);
- vmx_dump_sel("SS", x86_seg_ss);
- vmx_dump_sel("ES", x86_seg_es);
- vmx_dump_sel("FS", x86_seg_fs);
- vmx_dump_sel("GS", x86_seg_gs);
- vmx_dump_sel("GDTR", x86_seg_gdtr);
- vmx_dump_sel("LDTR", x86_seg_ldtr);
- vmx_dump_sel("IDTR", x86_seg_idtr);
- vmx_dump_sel("TR", x86_seg_tr);
+ vmx_dump_sel("CS", GUEST_CS_SELECTOR);
+ vmx_dump_sel("DS", GUEST_DS_SELECTOR);
+ vmx_dump_sel("SS", GUEST_SS_SELECTOR);
+ vmx_dump_sel("ES", GUEST_ES_SELECTOR);
+ vmx_dump_sel("FS", GUEST_FS_SELECTOR);
+ vmx_dump_sel("GS", GUEST_GS_SELECTOR);
+ vmx_dump_sel2("GDTR", GUEST_GDTR_LIMIT);
+ vmx_dump_sel("LDTR", GUEST_LDTR_SELECTOR);
+ vmx_dump_sel2("IDTR", GUEST_IDTR_LIMIT);
+ vmx_dump_sel("TR", GUEST_TR_SELECTOR);
x = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32;
x |= (uint32_t)vmr(TSC_OFFSET);
printk("TSC Offset = %016llx\n", x);
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/vmx.c Tue Dec 09 16:28:02 2008 +0000
@@ -704,6 +704,26 @@ static void vmx_ctxt_switch_to(struct vc
vpmu_load(v);
}
+
+/* SDM volume 3b section 22.3.1.2: we can only enter virtual 8086 mode
+ * if all of CS, SS, DS, ES, FS and GS are 16bit ring-3 data segments.
+ * The guest thinks it's got ring-0 segments, so we need to fudge
+ * things. We store the ring-3 version in the VMCS to avoid lots of
+ * shuffling on vmenter and vmexit, and translate in these accessors. */
+
+#define rm_cs_attr (((union segment_attributes) { \
+ .fields = { .type = 0xb, .s = 1, .dpl = 0, .p = 1, .avl = 0, \
+ .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define rm_ds_attr (((union segment_attributes) { \
+ .fields = { .type = 0x3, .s = 1, .dpl = 0, .p = 1, .avl = 0, \
+ .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define vm86_ds_attr (((union segment_attributes) { \
+ .fields = { .type = 0x3, .s = 1, .dpl = 3, .p = 1, .avl = 0, \
+ .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define vm86_tr_attr (((union segment_attributes) { \
+ .fields = { .type = 0xb, .s = 0, .dpl = 0, .p = 1, .avl = 0, \
+ .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+
static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
struct segment_register *reg)
{
@@ -779,14 +799,85 @@ static void vmx_get_segment_register(str
/* Unusable flag is folded into Present flag. */
if ( attr & (1u<<16) )
reg->attr.fields.p = 0;
+
+ /* Adjust for virtual 8086 mode */
+ if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr
+ && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) )
+ {
+ struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg];
+ if ( seg == x86_seg_tr )
+ *reg = *sreg;
+ else if ( reg->base != sreg->base || seg == x86_seg_ss )
+ {
+ /* If the guest's reloaded the segment, remember the new version.
+ * We can't tell if the guest reloaded the segment with another
+ * one that has the same base. By default we assume it hasn't,
+ * since we don't want to lose big-real-mode segment attributes,
+ * but for SS we assume it has: the Ubuntu graphical bootloader
+ * does this and gets badly confused if we leave the old SS in
+ * place. */
+ reg->attr.bytes = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr);
+ *sreg = *reg;
+ }
+ else
+ {
+ /* Always give realmode guests a selector that matches the base
+ * but keep the attr and limit from before */
+ *reg = *sreg;
+ reg->sel = reg->base >> 4;
+ }
+ }
}
static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
struct segment_register *reg)
{
- uint32_t attr;
-
+ uint32_t attr, sel, limit;
+ uint64_t base;
+
+ sel = reg->sel;
attr = reg->attr.bytes;
+ limit = reg->limit;
+ base = reg->base;
+
+ /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */
+ if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr )
+ {
+ /* Remember the proper contents */
+ v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg;
+
+ if ( seg == x86_seg_tr )
+ {
+ if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] )
+ {
+ sel = 0;
+ attr = vm86_tr_attr;
+ limit = 0xff;
+ base = v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS];
+ v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
+ }
+ else
+ v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
+ }
+ else
+ {
+ /* Try to fake it out as a 16bit data segment. This could
+ * cause confusion for the guest if it reads the selector,
+ * but otherwise we have to emulate if *any* segment hasn't
+ * been reloaded. */
+ if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff
+ && reg->attr.fields.p )
+ {
+ sel = base >> 4;
+ attr = vm86_ds_attr;
+ limit = 0xffff;
+ v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
+ }
+ else
+ v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
+ }
+ }
+
attr = ((attr & 0xf00) << 4) | (attr & 0xff);
/* Not-present must mean unusable. */
@@ -794,67 +885,67 @@ static void vmx_set_segment_register(str
attr |= (1u << 16);
/* VMX has strict consistency requirement for flag G. */
- attr |= !!(reg->limit >> 20) << 15;
+ attr |= !!(limit >> 20) << 15;
vmx_vmcs_enter(v);
switch ( seg )
{
case x86_seg_cs:
- __vmwrite(GUEST_CS_SELECTOR, reg->sel);
- __vmwrite(GUEST_CS_LIMIT, reg->limit);
- __vmwrite(GUEST_CS_BASE, reg->base);
+ __vmwrite(GUEST_CS_SELECTOR, sel);
+ __vmwrite(GUEST_CS_LIMIT, limit);
+ __vmwrite(GUEST_CS_BASE, base);
__vmwrite(GUEST_CS_AR_BYTES, attr);
break;
case x86_seg_ds:
- __vmwrite(GUEST_DS_SELECTOR, reg->sel);
- __vmwrite(GUEST_DS_LIMIT, reg->limit);
- __vmwrite(GUEST_DS_BASE, reg->base);
+ __vmwrite(GUEST_DS_SELECTOR, sel);
+ __vmwrite(GUEST_DS_LIMIT, limit);
+ __vmwrite(GUEST_DS_BASE, base);
__vmwrite(GUEST_DS_AR_BYTES, attr);
break;
case x86_seg_es:
- __vmwrite(GUEST_ES_SELECTOR, reg->sel);
- __vmwrite(GUEST_ES_LIMIT, reg->limit);
- __vmwrite(GUEST_ES_BASE, reg->base);
+ __vmwrite(GUEST_ES_SELECTOR, sel);
+ __vmwrite(GUEST_ES_LIMIT, limit);
+ __vmwrite(GUEST_ES_BASE, base);
__vmwrite(GUEST_ES_AR_BYTES, attr);
break;
case x86_seg_fs:
- __vmwrite(GUEST_FS_SELECTOR, reg->sel);
- __vmwrite(GUEST_FS_LIMIT, reg->limit);
- __vmwrite(GUEST_FS_BASE, reg->base);
+ __vmwrite(GUEST_FS_SELECTOR, sel);
+ __vmwrite(GUEST_FS_LIMIT, limit);
+ __vmwrite(GUEST_FS_BASE, base);
__vmwrite(GUEST_FS_AR_BYTES, attr);
break;
case x86_seg_gs:
- __vmwrite(GUEST_GS_SELECTOR, reg->sel);
- __vmwrite(GUEST_GS_LIMIT, reg->limit);
- __vmwrite(GUEST_GS_BASE, reg->base);
+ __vmwrite(GUEST_GS_SELECTOR, sel);
+ __vmwrite(GUEST_GS_LIMIT, limit);
+ __vmwrite(GUEST_GS_BASE, base);
__vmwrite(GUEST_GS_AR_BYTES, attr);
break;
case x86_seg_ss:
- __vmwrite(GUEST_SS_SELECTOR, reg->sel);
- __vmwrite(GUEST_SS_LIMIT, reg->limit);
- __vmwrite(GUEST_SS_BASE, reg->base);
+ __vmwrite(GUEST_SS_SELECTOR, sel);
+ __vmwrite(GUEST_SS_LIMIT, limit);
+ __vmwrite(GUEST_SS_BASE, base);
__vmwrite(GUEST_SS_AR_BYTES, attr);
break;
case x86_seg_tr:
- __vmwrite(GUEST_TR_SELECTOR, reg->sel);
- __vmwrite(GUEST_TR_LIMIT, reg->limit);
- __vmwrite(GUEST_TR_BASE, reg->base);
+ __vmwrite(GUEST_TR_SELECTOR, sel);
+ __vmwrite(GUEST_TR_LIMIT, limit);
+ __vmwrite(GUEST_TR_BASE, base);
/* VMX checks that the busy flag (bit 1) is set. */
__vmwrite(GUEST_TR_AR_BYTES, attr | 2);
break;
case x86_seg_gdtr:
- __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
- __vmwrite(GUEST_GDTR_BASE, reg->base);
+ __vmwrite(GUEST_GDTR_LIMIT, limit);
+ __vmwrite(GUEST_GDTR_BASE, base);
break;
case x86_seg_idtr:
- __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
- __vmwrite(GUEST_IDTR_BASE, reg->base);
+ __vmwrite(GUEST_IDTR_LIMIT, limit);
+ __vmwrite(GUEST_IDTR_BASE, base);
break;
case x86_seg_ldtr:
- __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
- __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
- __vmwrite(GUEST_LDTR_BASE, reg->base);
+ __vmwrite(GUEST_LDTR_SELECTOR, sel);
+ __vmwrite(GUEST_LDTR_LIMIT, limit);
+ __vmwrite(GUEST_LDTR_BASE, base);
__vmwrite(GUEST_LDTR_AR_BYTES, attr);
break;
default:
@@ -970,6 +1061,7 @@ static void vmx_update_guest_cr(struct v
switch ( cr )
{
case 0: {
+ int realmode;
unsigned long hw_cr0_mask =
X86_CR0_NE | X86_CR0_PG | X86_CR0_PE;
@@ -998,9 +1090,44 @@ static void vmx_update_guest_cr(struct v
vmx_fpu_enter(v);
}
- v->arch.hvm_vmx.vmxemul &= ~VMXEMUL_REALMODE;
- if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
- v->arch.hvm_vmx.vmxemul |= VMXEMUL_REALMODE;
+ realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE);
+ if ( realmode != v->arch.hvm_vmx.vmx_realmode )
+ {
+ enum x86_segment s;
+ struct segment_register reg[x86_seg_tr + 1];
+
+ /* Entering or leaving real mode: adjust the segment registers.
+ * Need to read them all either way, as realmode reads can update
+ * the saved values we'll use when returning to prot mode. */
+ for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+ vmx_get_segment_register(v, s, &reg[s]);
+ v->arch.hvm_vmx.vmx_realmode = realmode;
+
+ if ( realmode )
+ {
+ for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+ vmx_set_segment_register(v, s, &reg[s]);
+ v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
+ __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
+ __vmwrite(EXCEPTION_BITMAP, 0xffffffff);
+ }
+ else
+ {
+ for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+ if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
+ vmx_set_segment_register(
+ v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
+ v->arch.hvm_vcpu.hw_cr[4] =
+ ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME)
+ |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME));
+ __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
+ __vmwrite(EXCEPTION_BITMAP,
+ HVM_TRAP_MASK
+ | (paging_mode_hap(v->domain) ?
+ 0 : (1U << TRAP_page_fault))
+ | (1U << TRAP_no_device));
+ }
+ }
v->arch.hvm_vcpu.hw_cr[0] =
v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
@@ -1028,6 +1155,8 @@ static void vmx_update_guest_cr(struct v
if ( paging_mode_hap(v->domain) )
v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
+ if ( v->arch.hvm_vmx.vmx_realmode )
+ v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
{
v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
@@ -1097,6 +1226,7 @@ static void __vmx_inject_exception(int t
static void __vmx_inject_exception(int trap, int type, int error_code)
{
unsigned long intr_fields;
+ struct vcpu *curr = current;
/*
* NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
@@ -1113,6 +1243,11 @@ static void __vmx_inject_exception(int t
}
__vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
+
+ /* Can't inject exceptions in virtual 8086 mode because they would
+ * use the protected-mode IDT. Emulate at the next vmenter instead. */
+ if ( curr->arch.hvm_vmx.vmx_realmode )
+ curr->arch.hvm_vmx.vmx_emulate = 1;
}
void vmx_inject_hw_exception(int trap, int error_code)
@@ -2072,6 +2207,17 @@ static void vmx_failed_vmentry(unsigned
domain_crash(curr->domain);
}
+asmlinkage void vmx_enter_realmode(struct cpu_user_regs *regs)
+{
+ struct vcpu *v = current;
+
+ /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3. Since
+ * we have CR4.VME == 1 and our own TSS with an empty interrupt
+ * redirection bitmap, all software INTs will be handled by vm86. */
+ v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
+ regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
+}
+
asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
{
unsigned int exit_reason, idtv_info;
@@ -2099,6 +2245,42 @@ asmlinkage void vmx_vmexit_handler(struc
if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
return vmx_failed_vmentry(exit_reason, regs);
+
+ if ( v->arch.hvm_vmx.vmx_realmode )
+ {
+ unsigned int vector;
+
+ /* Put RFLAGS back the way the guest wants it */
+ regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
+ regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL);
+
+ /* Unless this exit was for an interrupt, we've hit something
+ * vm86 can't handle. Try again, using the emulator. */
+ switch ( exit_reason )
+ {
+ case EXIT_REASON_EXCEPTION_NMI:
+ vector = __vmread(VM_EXIT_INTR_INFO) & INTR_INFO_VECTOR_MASK;
+ if ( vector != TRAP_page_fault
+ && vector != TRAP_nmi
+ && vector != TRAP_machine_check )
+ {
+ perfc_incr(realmode_exits);
+ v->arch.hvm_vmx.vmx_emulate = 1;
+ return;
+ }
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ case EXIT_REASON_INIT:
+ case EXIT_REASON_SIPI:
+ case EXIT_REASON_PENDING_VIRT_INTR:
+ case EXIT_REASON_PENDING_VIRT_NMI:
+ case EXIT_REASON_MACHINE_CHECK:
+ break;
+ default:
+ v->arch.hvm_vmx.vmx_emulate = 1;
+ perfc_incr(realmode_exits);
+ return;
+ }
+ }
hvm_maybe_deassert_evtchn_irq();
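The RFLAGS handling around vm86 is symmetric: force VM and IOPL == 3 on the way in, and restore the guest's own IOPL on the way out. A self-contained sketch (flag values are architectural; the helper names are hypothetical):

    #include <stdint.h>

    #define X86_EFLAGS_IOPL 0x00003000u
    #define X86_EFLAGS_VM   0x00020000u

    /* Entering vm86: remember the guest's EFLAGS, then run with VM set
     * and IOPL == 3 so INTn/PUSHF/CLI don't fault to the monitor. */
    static uint32_t eflags_to_vm86(uint32_t eflags, uint32_t *saved)
    {
        *saved = eflags;
        return eflags | X86_EFLAGS_VM | X86_EFLAGS_IOPL;
    }

    /* On vmexit: drop VM and the forced IOPL, restore the guest's own. */
    static uint32_t eflags_from_vm86(uint32_t eflags, uint32_t saved)
    {
        eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
        return eflags | (saved & X86_EFLAGS_IOPL);
    }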
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_32/asm-offsets.c Tue Dec 09 16:28:02 2008 +0000
@@ -88,7 +88,9 @@ void __dummy__(void)
BLANK();
OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
- OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
+ OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
+ OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
+ OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
BLANK();
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/x86_64/asm-offsets.c
--- a/xen/arch/x86/x86_64/asm-offsets.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_64/asm-offsets.c Tue Dec 09 16:28:02 2008 +0000
@@ -107,7 +107,9 @@ void __dummy__(void)
BLANK();
OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
- OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
+ OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
+ OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
+ OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
BLANK();
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/x86_emulate/x86_emulate.h
--- a/xen/arch/x86/x86_emulate/x86_emulate.h Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h Tue Dec 09 16:28:02 2008 +0000
@@ -67,6 +67,7 @@ typedef union segment_attributes {
uint16_t l: 1; /* 9; Bit 53 */
uint16_t db: 1; /* 10; Bit 54 */
uint16_t g: 1; /* 11; Bit 55 */
+ uint16_t pad: 4;
} fields;
} __attribute__ ((packed)) segment_attributes_t;
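The new pad field rounds the attribute union out to a full 16 bits. For reference, vmx_set_segment_register() repacks those 12 meaningful bits into the VMX access-rights format, which keeps bits 0-7 in place but moves avl/l/db/g up to bits 12-15; a sketch of that conversion (function name hypothetical):

    #include <stdint.h>

    /* Repack emulator-style attributes (type/s/dpl/p in bits 0-7,
     * avl/l/db/g in bits 8-11) into VMX AR format, where avl/l/db/g
     * live in bits 12-15 and bit 16 marks the segment unusable. */
    static uint32_t attr_to_vmx_ar(uint16_t attr)
    {
        uint32_t ar = ((attr & 0xf00) << 4) | (attr & 0xff);
        if ( !(attr & 0x80) )   /* P clear => unusable */
            ar |= 1u << 16;
        return ar;
    }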
diff -r 5535efd8e011 -r 6595393a3d28 xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h Tue Dec 09 16:28:02 2008 +0000
@@ -109,11 +109,16 @@ struct arch_vmx_struct {
unsigned long host_cr0;
+ /* Is the guest in real mode? */
+ uint8_t vmx_realmode;
/* Are we emulating rather than VMENTERing? */
-#define VMXEMUL_REALMODE 1 /* Yes, because CR0.PE == 0 */
-#define VMXEMUL_BAD_CS 2 /* Yes, because CS.RPL != CPL */
-#define VMXEMUL_BAD_SS 4 /* Yes, because SS.RPL != CPL */
- uint8_t vmxemul;
+ uint8_t vmx_emulate;
+ /* Bitmask of segments that we can't safely use in virtual 8086 mode */
+ uint16_t vm86_segment_mask;
+ /* Shadow CS, SS, DS, ES, FS, GS, TR while in virtual 8086 mode */
+ struct segment_register vm86_saved_seg[x86_seg_tr + 1];
+ /* Remember EFLAGS while in virtual 8086 mode */
+ uint32_t vm86_saved_eflags;
};
int vmx_create_vmcs(struct vcpu *v);
diff -r 5535efd8e011 -r 6595393a3d28 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/asm-x86/perfc_defn.h Tue Dec 09 16:28:02 2008 +0000
@@ -127,4 +127,7 @@ PERFCOUNTER(mshv_wrmsr_tpr,
PERFCOUNTER(mshv_wrmsr_tpr, "MS Hv wrmsr tpr")
PERFCOUNTER(mshv_wrmsr_eoi, "MS Hv wrmsr eoi")
+PERFCOUNTER(realmode_emulations, "realmode instructions emulated")
+PERFCOUNTER(realmode_exits, "vmexits from realmode")
+
/*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff -r 5535efd8e011 -r 6595393a3d28 xen/include/public/hvm/params.h
--- a/xen/include/public/hvm/params.h Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/public/hvm/params.h Tue Dec 09 16:28:02 2008 +0000
@@ -100,6 +100,9 @@
/* ACPI S state: currently support S0 and S3 on x86. */
#define HVM_PARAM_ACPI_S_STATE 14
-#define HVM_NR_PARAMS 15
+/* TSS used on Intel when CR0.PE=0. */
+#define HVM_PARAM_VM86_TSS 15
+
+#define HVM_NR_PARAMS 16
#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */