x86emul: conditionally clear BNDn for branches

Considering that we surface MPX to HVM guests, instructions we emulate
should also correctly deal with MPX state. While for now BND*
instructions don't get emulated, the effect of branches (which we do
emulate) without a BND prefix should be taken care of.

Signed-off-by: Jan Beulich
---
RFC reasons:
- I haven't been able to find a layout specification for the BNDCSR
  area in the SDM, therefore struct xstate_bndcsr's layout is guessed.
- The SDM volumes contradict one another regarding short conditional
  branch behavior: SDM Vol 1 says they init BNDn (except for
  J{,E,R}CXZ), while Vol 2 says BND only affects near branches. The
  code for now assumes only near ones are affected.
- The SDM leaves open whether only taken Jcc will cause BNDn init, or
  also not taken ones. The code for now assumes only taken ones do.
- It is also left unclear whether addressing forms without a base
  address register are valid for BNDMK (the instruction page references
  SRCMEM.base without indicating what this means in that specific
  case). The code uses an explicit base register for that reason (which
  also yields slightly smaller code).

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -10,11 +10,15 @@ typedef bool bool_t;
 
 #define is_canonical_address(x) (((int64_t)(x) >> 47) == ((int64_t)(x) >> 63))
 
+#define EFER_SCE (1 << 0)
+#define EFER_LMA (1 << 10)
+
 #define BUG() abort()
 #define ASSERT assert
 #define cpu_has_amd_erratum(nr) 0
 #define mark_regs_dirty(r) ((void)(r))
+#define read_bndcfgu() 0
 
 #define __packed __attribute__((packed))
--- a/xen/arch/x86/Rules.mk
+++ b/xen/arch/x86/Rules.mk
@@ -14,6 +14,7 @@ $(call as-insn-check,CFLAGS,CC,"vmcall",
 $(call as-insn-check,CFLAGS,CC,"vmcall",-DHAVE_GAS_VMX)
 $(call as-insn-check,CFLAGS,CC,"invept (%rax)$$(comma)%rax",-DHAVE_GAS_EPT)
 $(call as-insn-check,CFLAGS,CC,"rdfsbase %rax",-DHAVE_GAS_FSGSBASE)
+$(call as-insn-check,CFLAGS,CC,"bndmov %bnd0$$(comma)%bnd0",-DHAVE_GAS_MPX)
 $(call as-insn-check,CFLAGS,CC,".equ \"x\"$$(comma)1", \
                      -U__OBJECT_LABEL__ -DHAVE_GAS_QUOTED_SYM \
                      '-D__OBJECT_LABEL__=$(subst $(BASEDIR)/,,$(CURDIR))/$$@')
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -421,6 +421,8 @@ int vcpu_initialise(struct vcpu *v)
 
         vmce_init_vcpu(v);
     }
+    else if ( (rc = xstate_alloc_save_area(v)) != 0 )
+        return rc;
 
     spin_lock_init(&v->arch.vpmu.vpmu_lock);
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2623,6 +2623,11 @@ static int vmx_msr_read_intercept(unsign
     case MSR_IA32_DEBUGCTLMSR:
         __vmread(GUEST_IA32_DEBUGCTL, msr_content);
         break;
+    case MSR_IA32_BNDCFGS:
+        if ( !cpu_has_mpx )
+            goto gp_fault;
+        __vmread(GUEST_BNDCFGS, msr_content);
+        break;
     case IA32_FEATURE_CONTROL_MSR:
     case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_TRUE_ENTRY_CTLS:
         if ( !nvmx_msr_read_intercept(msr, msr_content) )
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -13,6 +13,7 @@
 #include <asm/x86_emulate.h>
 #include <asm/asm_defns.h> /* mark_regs_dirty() */
 #include <asm/processor.h> /* current_cpu_info */
+#include <asm/xstate.h> /* read_bndcfgu() */
 #include <asm/amd.h> /* cpu_has_amd_erratum() */
 
 /* Avoid namespace pollution. */
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -355,9 +355,10 @@ typedef union {
 #define MSR_SYSENTER_CS  0x00000174
 #define MSR_SYSENTER_ESP 0x00000175
 #define MSR_SYSENTER_EIP 0x00000176
+#define MSR_BNDCFGS      0x00000d90
+#define BNDCFG_ENABLE    (1 << 0)
+#define BNDCFG_PRESERVE  (1 << 1)
 #define MSR_EFER         0xc0000080
-#define EFER_SCE         (1u<<0)
-#define EFER_LMA         (1u<<10)
 #define MSR_STAR         0xc0000081
 #define MSR_LSTAR        0xc0000082
 #define MSR_CSTAR        0xc0000083
@@ -1090,6 +1091,7 @@ static bool_t vcpu_has(
 #define vcpu_has_clflush() vcpu_has(       1, EDX, 19, ctxt, ops)
 #define vcpu_has_lzcnt() vcpu_has(0x80000001, ECX,  5, ctxt, ops)
 #define vcpu_has_bmi1()  vcpu_has(0x00000007, EBX,  3, ctxt, ops)
+#define vcpu_has_mpx()   vcpu_has(0x00000007, EBX, 14, ctxt, ops)
 
 #define vcpu_must_have(leaf, reg, bit) \
     generate_exception_if(!vcpu_has(leaf, reg, bit, ctxt, ops), EXC_UD, -1)
@@ -1491,6 +1493,41 @@ static int inject_swint(enum x86_swint_t
 
     return ops->inject_hw_exception(fault_type, error_code, ctxt);
 }
 
+static void clear_bnd(struct x86_emulate_ctxt *ctxt,
+                      const struct x86_emulate_ops *ops, enum vex_pfx pfx)
+{
+    uint64_t bndcfg;
+    int rc;
+
+    if ( pfx == vex_f2 || !vcpu_has_mpx() )
+        return;
+
+#ifdef __XEN__
+    ASSERT(cpu_has_mpx);
+#endif
+
+    if ( !mode_ring0() )
+        bndcfg = read_bndcfgu();
+    else if ( !ops->read_msr ||
+              ops->read_msr(MSR_BNDCFGS, &bndcfg, ctxt) )
+        return;
+    if ( (bndcfg & BNDCFG_ENABLE) && !(bndcfg & BNDCFG_PRESERVE) )
+    {
+#if HAVE_GAS_MPX
+        asm volatile ( "bndmk -1(%0), %%bnd0" :: "r" (0L) );
+        asm volatile ( "bndmk -1(%0), %%bnd1" :: "r" (0L) );
+        asm volatile ( "bndmk -1(%0), %%bnd2" :: "r" (0L) );
+        asm volatile ( "bndmk -1(%0), %%bnd3" :: "r" (0L) );
+#else
+        asm volatile ( ".byte 0xf3, 0x0f, 0x1b, 0x41, -1" :: "c" (0L) );
+        asm volatile ( ".byte 0xf3, 0x0f, 0x1b, 0x49, -1" :: "c" (0L) );
+        asm volatile ( ".byte 0xf3, 0x0f, 0x1b, 0x51, -1" :: "c" (0L) );
+        asm volatile ( ".byte 0xf3, 0x0f, 0x1b, 0x59, -1" :: "c" (0L) );
+#endif
+    }
+ done:;
+}
+
 int x86emul_unhandleable_rw(
     enum x86_segment seg,
     unsigned long offset,
@@ -2720,6 +2757,7 @@ x86_emulate(
              (rc = ops->insn_fetch(x86_seg_cs, dst.val, NULL, 0, ctxt)) )
             goto done;
         _regs.eip = dst.val;
+        clear_bnd(ctxt, ops, vex.pfx);
         break;
     }
@@ -3465,6 +3503,7 @@ x86_emulate(
         op_bytes = ((op_bytes == 4) && mode_64bit()) ? 8 : op_bytes;
         src.val = _regs.eip;
         jmp_rel(rel);
+        clear_bnd(ctxt, ops, vex.pfx);
        goto push;
     }
@@ -3473,6 +3512,7 @@ x86_emulate(
                               ? (int32_t)insn_fetch_type(int16_t)
                               : insn_fetch_type(int32_t));
         jmp_rel(rel);
+        clear_bnd(ctxt, ops, vex.pfx);
         break;
     }
@@ -4443,7 +4483,10 @@ x86_emulate(
                               ? (int32_t)insn_fetch_type(int16_t)
                               : insn_fetch_type(int32_t));
         if ( test_cc(b, _regs.eflags) )
+        {
             jmp_rel(rel);
+            clear_bnd(ctxt, ops, vex.pfx);
+        }
         break;
     }
--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -470,15 +470,33 @@ bool_t xsave_enabled(const struct vcpu *
 int xstate_alloc_save_area(struct vcpu *v)
 {
     struct xsave_struct *save_area;
+    unsigned int size;
 
-    if ( !cpu_has_xsave || is_idle_vcpu(v) )
+    if ( !cpu_has_xsave )
         return 0;
 
-    BUG_ON(xsave_cntxt_size < XSTATE_AREA_MIN_SIZE);
+    if ( !is_idle_vcpu(v) || !cpu_has_xsavec )
+    {
+        size = xsave_cntxt_size;
+        BUG_ON(size < XSTATE_AREA_MIN_SIZE);
+    }
+    else
+    {
+        /*
+         * For idle vcpus on XSAVEC-capable CPUs allocate an area large
+         * enough to save any individual extended state.
+         */
+        unsigned int i;
+
+        for ( size = 0, i = 2; i < xstate_features; ++i )
+            if ( size < xstate_sizes[i] )
+                size = xstate_sizes[i];
+        size += XSTATE_AREA_MIN_SIZE;
+    }
 
     /* XSAVE/XRSTOR requires the save area be 64-byte-boundary aligned. */
     BUILD_BUG_ON(__alignof(*save_area) < 64);
-    save_area = _xzalloc(xsave_cntxt_size, __alignof(*save_area));
+    save_area = _xzalloc(size, __alignof(*save_area));
     if ( save_area == NULL )
         return -ENOMEM;
@@ -697,6 +715,40 @@ int handle_xsetbv(u32 index, u64 new_bv)
     return 0;
 }
 
+uint64_t read_bndcfgu(void)
+{
+    unsigned long cr0 = read_cr0();
+    struct xsave_struct *xstate
+        = idle_vcpu[smp_processor_id()]->arch.xsave_area;
+    const struct xstate_bndcsr *bndcsr;
+
+    clts();
+
+    if ( cpu_has_xsavec )
+    {
+        asm ( ".byte 0x0f,0xc7,0x27\n" /* xsavec */
+              : "=m" (*xstate)
+              : "a" (XSTATE(BNDCSR)), "d" (0), "D" (xstate) );
+
+        bndcsr = (void *)(xstate + 1);
+    }
+    else
+    {
+        alternative_io(".byte 0x0f,0xae,0x27\n", /* xsave */
+                       ".byte 0x0f,0xae,0x37\n", /* xsaveopt */
+                       X86_FEATURE_XSAVEOPT,
+                       "=m" (*xstate),
+                       "a" (XSTATE(BNDCSR)), "d" (0), "D" (xstate));
+
+        bndcsr = (void *)xstate + xstate_offsets[_XSTATE_BNDCSR];
+    }
+
+    if ( cr0 & X86_CR0_TS )
+        write_cr0(cr0);
+
+    return xstate->xsave_hdr.xstate_bv & XSTATE(BNDCSR) ? bndcsr->bndcfgu : 0;
+}
+
 /*
  * Local variables:
  * mode: C
--- a/xen/include/asm-x86/xstate.h
+++ b/xen/include/asm-x86/xstate.h
@@ -26,16 +26,30 @@
 #define XSAVE_HDR_OFFSET          FXSAVE_SIZE
 #define XSTATE_AREA_MIN_SIZE      (FXSAVE_SIZE + XSAVE_HDR_SIZE)
 
-#define XSTATE_FP      (1ULL << 0)
-#define XSTATE_SSE     (1ULL << 1)
-#define XSTATE_YMM     (1ULL << 2)
-#define XSTATE_BNDREGS (1ULL << 3)
-#define XSTATE_BNDCSR  (1ULL << 4)
-#define XSTATE_OPMASK  (1ULL << 5)
-#define XSTATE_ZMM     (1ULL << 6)
-#define XSTATE_HI_ZMM  (1ULL << 7)
-#define XSTATE_PKRU    (1ULL << 9)
-#define XSTATE_LWP     (1ULL << 62) /* AMD lightweight profiling */
+#define _XSTATE_FP       0
+#define _XSTATE_SSE      1
+#define _XSTATE_YMM      2
+#define _XSTATE_BNDREGS  3
+#define _XSTATE_BNDCSR   4
+#define _XSTATE_OPMASK   5
+#define _XSTATE_ZMM      6
+#define _XSTATE_HI_ZMM   7
+#define _XSTATE_PKRU     9
+#define _XSTATE_LWP      62 /* AMD lightweight profiling */
+
+#define XSTATE(n) (1ULL << _XSTATE_##n)
+
+#define XSTATE_FP        XSTATE(FP)
+#define XSTATE_SSE       XSTATE(SSE)
+#define XSTATE_YMM       XSTATE(YMM)
+#define XSTATE_BNDREGS   XSTATE(BNDREGS)
+#define XSTATE_BNDCSR    XSTATE(BNDCSR)
+#define XSTATE_OPMASK    XSTATE(OPMASK)
+#define XSTATE_ZMM       XSTATE(ZMM)
+#define XSTATE_HI_ZMM    XSTATE(HI_ZMM)
+#define XSTATE_PKRU      XSTATE(PKRU)
+#define XSTATE_LWP       XSTATE(LWP)
+
 #define XSTATE_FP_SSE  (XSTATE_FP | XSTATE_SSE)
 #define XCNTXT_MASK    (XSTATE_FP | XSTATE_SSE | XSTATE_YMM | XSTATE_OPMASK | \
                         XSTATE_ZMM | XSTATE_HI_ZMM | XSTATE_NONLAZY)
@@ -87,11 +101,17 @@ struct __attribute__((aligned (64))) xsa
     char data[];                        /* Variable layout states */
 };
 
+struct xstate_bndcsr {
+    uint64_t bndcfgu;
+    uint64_t bndstatus;
+};
+
 /* extended state operations */
 bool_t __must_check set_xcr0(u64 xfeatures);
 uint64_t get_xcr0(void);
 void set_msr_xss(u64 xss);
 uint64_t get_msr_xss(void);
+uint64_t read_bndcfgu(void);
 void xsave(struct vcpu *v, uint64_t mask);
 void xrstor(struct vcpu *v, uint64_t mask);
 bool_t xsave_enabled(const struct vcpu *v);
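
For reference, the "bndmk -1(%reg), %bndN" forms used in clear_bnd() rely on
BNDMK setting BND.LB from the base register and BND.UB to the one's
complement of the effective address (the upper bound is kept inverted in the
register). With the base register holding 0 and a displacement of -1 this
yields LB = 0 and a stored UB of 0, i.e. the BNDn INIT (unbounded) state.
A minimal C sketch of that arithmetic; struct bnd_reg and bnd_make() are
purely illustrative names and not part of the patch or the tree:

    #include <stdint.h>

    struct bnd_reg { uint64_t lb, ub; };   /* ub kept one's-complemented */

    void bnd_make(struct bnd_reg *bnd, uint64_t base, int64_t disp)
    {
        bnd->lb = base;             /* lower bound = base register value */
        bnd->ub = ~(base + disp);   /* upper bound = ~(effective address) */
    }

    /* bnd_make(&b, 0, -1) gives lb = 0 and ub = ~(uint64_t)-1 = 0, which,
     * with ub stored inverted, is an effective upper bound of ~0. */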