x86emul: support FMA4 insns Signed-off-by: Jan Beulich --- a/.gitignore +++ b/.gitignore @@ -226,7 +226,7 @@ tools/tests/x86_emulator/asm tools/tests/x86_emulator/avx*.[ch] tools/tests/x86_emulator/blowfish.h +tools/tests/x86_emulator/fma*.[ch] tools/tests/x86_emulator/sse*.[ch] tools/tests/x86_emulator/test_x86_emulator tools/tests/x86_emulator/x86_emulate --- a/tools/tests/x86_emulator/Makefile +++ b/tools/tests/x86_emulator/Makefile @@ -12,7 +12,8 @@ run: $(TARGET) ./$(TARGET) SIMD := sse sse2 sse4 avx -TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx +FMA := fma4 +TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA) blowfish-cflags := "" blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic=" @@ -29,6 +30,9 @@ sse4-flts := $(sse2-flts) avx-vecs := 16 32 avx-ints := avx-flts := 4 8 +fma4-vecs := $(avx-vecs) +fma4-ints := +fma4-flts := $(avx-flts) # When converting SSE to AVX, have the compiler avoid XMM0 to widen # coverage of the VEX.vvvv checks in the emulator. We must not do this, @@ -58,7 +62,7 @@ $(1)-avx-cflags := \ "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)")) endef -$(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor)))) +$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor)))) $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile rm -f $@.new $*.bin @@ -77,6 +81,11 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t $(addsuffix .c,$(SIMD)) $(addsuffix -avx.c,$(filter sse%,$(SIMD))): ln -sf simd.c $@ +$(addsuffix .c,$(FMA)): + ln -sf simd-fma.c $@ + +$(addsuffix .o,$(SIMD) $(FMA)) $(addsuffix -avx.o,$(filter sse%,$(SIMD))): simd.h + $(TARGET): x86_emulate.o test_x86_emulator.o $(HOSTCC) -o $@ $^ --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -1,71 +1,6 @@ -#include <stdbool.h> +#include "simd.h" -asm ( - "\t.text\n" - "\t.globl _start\n" - "_start:\n" -#if defined(__i386__) && VEC_SIZE == 16 - "\tpush %ebp\n" - "\tmov %esp,%ebp\n" - "\tand $~0xf,%esp\n" - "\tcall 
simd_test\n" - "\tleave\n" - "\tret" -#else - "\tjmp simd_test" -#endif - ); - -typedef -#if defined(INT_SIZE) -# define ELEM_SIZE INT_SIZE -signed int -# if INT_SIZE == 1 -# define MODE QI -# elif INT_SIZE == 2 -# define MODE HI -# elif INT_SIZE == 4 -# define MODE SI -# elif INT_SIZE == 8 -# define MODE DI -# endif -#elif defined(UINT_SIZE) -# define ELEM_SIZE UINT_SIZE -unsigned int -# if UINT_SIZE == 1 -# define MODE QI -# elif UINT_SIZE == 2 -# define MODE HI -# elif UINT_SIZE == 4 -# define MODE SI -# elif UINT_SIZE == 8 -# define MODE DI -# endif -#elif defined(FLOAT_SIZE) -float -# define ELEM_SIZE FLOAT_SIZE -# if FLOAT_SIZE == 4 -# define MODE SF -# elif FLOAT_SIZE == 8 -# define MODE DF -# endif -#endif -#ifndef VEC_SIZE -# define VEC_SIZE ELEM_SIZE -#endif -__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t; - -#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE) - -typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t; - -/* Various builtins want plain char / int / long long vector types ... */ -typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t; -typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t; -typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t; -#if VEC_SIZE >= 8 -typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t; -#endif +ENTRY(simd_test); #if VEC_SIZE == 8 && defined(__SSE__) # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff) @@ -418,13 +353,6 @@ static inline bool _to_bool(byte_vec_t b # endif #endif -/* - * Suppress value propagation by the compiler, preventing unwanted - * optimization. This at once makes the compiler use memory operands - * more often, which for our purposes is the more interesting case. 
- */ -#define touch(var) asm volatile ( "" : "+m" (var) ) - int simd_test(void) { unsigned int i, j; --- /dev/null +++ b/tools/tests/x86_emulator/simd.h @@ -0,0 +1,78 @@ +#include <stdbool.h> + +#if defined(__i386__) && VEC_SIZE == 16 +# define ENTRY(name) \ +asm ( "\t.text\n" \ + "\t.globl _start\n" \ + "_start:\n" \ + "\tpush %ebp\n" \ + "\tmov %esp,%ebp\n" \ + "\tand $~0xf,%esp\n" \ + "\tcall " #name "\n" \ + "\tleave\n" \ + "\tret" ) +#else +# define ENTRY(name) \ +asm ( "\t.text\n" \ + "\t.globl _start\n" \ + "_start:\n" \ + "\tjmp " #name ) +#endif + +typedef +#if defined(INT_SIZE) +# define ELEM_SIZE INT_SIZE +signed int +# if INT_SIZE == 1 +# define MODE QI +# elif INT_SIZE == 2 +# define MODE HI +# elif INT_SIZE == 4 +# define MODE SI +# elif INT_SIZE == 8 +# define MODE DI +# endif +#elif defined(UINT_SIZE) +# define ELEM_SIZE UINT_SIZE +unsigned int +# if UINT_SIZE == 1 +# define MODE QI +# elif UINT_SIZE == 2 +# define MODE HI +# elif UINT_SIZE == 4 +# define MODE SI +# elif UINT_SIZE == 8 +# define MODE DI +# endif +#elif defined(FLOAT_SIZE) +float +# define ELEM_SIZE FLOAT_SIZE +# if FLOAT_SIZE == 4 +# define MODE SF +# elif FLOAT_SIZE == 8 +# define MODE DF +# endif +#endif +#ifndef VEC_SIZE +# define VEC_SIZE ELEM_SIZE +#endif +__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t; + +#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE) + +typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t; + +/* Various builtins want plain char / int / long long vector types ... */ +typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t; +typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t; +typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t; +#if VEC_SIZE >= 8 +typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t; +#endif + +/* + * Suppress value propagation by the compiler, preventing unwanted + * optimization. 
This at once makes the compiler use memory operands + * more often, which for our purposes is the more interesting case. + */ +#define touch(var) asm volatile ( "" : "+m" (var) ) --- /dev/null +++ b/tools/tests/x86_emulator/simd-fma.c @@ -0,0 +1,121 @@ +#include "simd.h" + +ENTRY(fma_test); + +#if VEC_SIZE < 16 +# define to_bool(cmp) (!~(cmp)[0]) +#elif VEC_SIZE == 16 +# if FLOAT_SIZE == 4 +# define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0) +# elif FLOAT_SIZE == 8 +# define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0) +# endif +#elif VEC_SIZE == 32 +# if FLOAT_SIZE == 4 +# define to_bool(cmp) __builtin_ia32_vtestcps256(cmp, (vec_t){} == 0) +# elif FLOAT_SIZE == 8 +# define to_bool(cmp) __builtin_ia32_vtestcpd256(cmp, (vec_t){} == 0) +# endif +#endif + +#if VEC_SIZE == 16 +# if FLOAT_SIZE == 4 +# define addsub(x, y) __builtin_ia32_addsubps(x, y) +# if defined(__FMA4__) +# define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps(x, y, z) +# endif +# elif FLOAT_SIZE == 8 +# define addsub(x, y) __builtin_ia32_addsubpd(x, y) +# if defined(__FMA4__) +# define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd(x, y, z) +# endif +# endif +#elif VEC_SIZE == 32 +# if FLOAT_SIZE == 4 +# define addsub(x, y) __builtin_ia32_addsubps256(x, y) +# if defined(__FMA4__) +# define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps256(x, y, z) +# endif +# elif FLOAT_SIZE == 8 +# define addsub(x, y) __builtin_ia32_addsubpd256(x, y) +# if defined(__FMA4__) +# define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd256(x, y, z) +# endif +# endif +#endif + +int fma_test(void) +{ + unsigned int i; + vec_t x, y, z, src, inv, one; + + for ( i = 0; i < ELEM_COUNT; ++i ) + { + src[i] = i + 1; + inv[i] = ELEM_COUNT - i; + one[i] = 1; + } + + x = (src + one) * inv; + y = (src - one) * inv; + touch(src); + z = inv * src + inv; + if ( !to_bool(x == z) ) return __LINE__; + + touch(src); + z = -inv * src - inv; + if ( !to_bool(-x == z) ) return __LINE__; + + touch(src); + z = inv * src - inv; 
+ if ( !to_bool(y == z) ) return __LINE__; + + touch(src); + z = -inv * src + inv; + if ( !to_bool(-y == z) ) return __LINE__; + touch(src); + + x = src + inv; + y = src - inv; + touch(inv); + z = src * one + inv; + if ( !to_bool(x == z) ) return __LINE__; + + touch(inv); + z = -src * one - inv; + if ( !to_bool(-x == z) ) return __LINE__; + + touch(inv); + z = src * one - inv; + if ( !to_bool(y == z) ) return __LINE__; + + touch(inv); + z = -src * one + inv; + if ( !to_bool(-y == z) ) return __LINE__; + touch(inv); + +#if defined(addsub) && defined(fmaddsub) + x = addsub(src * inv, one); + y = addsub(src * inv, -one); + touch(one); + z = fmaddsub(src, inv, one); + if ( !to_bool(x == z) ) return __LINE__; + + touch(one); + z = fmaddsub(src, inv, -one); + if ( !to_bool(y == z) ) return __LINE__; + touch(one); + + x = addsub(src * inv, one); + touch(inv); + z = fmaddsub(src, inv, one); + if ( !to_bool(x == z) ) return __LINE__; + + touch(inv); + z = fmaddsub(src, inv, -one); + if ( !to_bool(y == z) ) return __LINE__; + touch(inv); +#endif + + return 0; +} --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -11,6 +11,7 @@ #include "sse2-avx.h" #include "sse4-avx.h" #include "avx.h" +#include "fma4.h" #define verbose false /* Switch to true for far more logging. 
*/ @@ -47,6 +48,11 @@ static bool simd_check_avx(void) #define simd_check_sse2_avx simd_check_avx #define simd_check_sse4_avx simd_check_avx +static bool simd_check_fma4(void) +{ + return cpu_has_fma4; +} + static void simd_set_regs(struct cpu_user_regs *regs) { if ( cpu_has_mmx ) @@ -143,6 +149,12 @@ static const struct { SIMD(AVX scalar double, avx, f8), SIMD(AVX 128bit double, avx, 16f8), SIMD(AVX 256bit double, avx, 32f8), + SIMD(FMA4 scalar single, fma4, f4), + SIMD(FMA4 128bit single, fma4, 16f4), + SIMD(FMA4 256bit single, fma4, 32f4), + SIMD(FMA4 scalar double, fma4, f8), + SIMD(FMA4 128bit double, fma4, 16f8), + SIMD(FMA4 256bit double, fma4, 32f8), #undef SIMD_ #undef SIMD }; --- a/tools/tests/x86_emulator/x86_emulate.h +++ b/tools/tests/x86_emulator/x86_emulate.h @@ -164,6 +164,16 @@ static inline uint64_t xgetbv(uint32_t x (res.c & (1U << 6)) != 0; \ }) +#define cpu_has_fma4 ({ \ + struct cpuid_leaf res; \ + emul_test_cpuid(1, 0, &res, NULL); \ + if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \ + res.c = 0; \ + else \ + emul_test_cpuid(0x80000001, 0, &res, NULL); \ + (res.c & (1U << 16)) != 0; \ +}) + #define cpu_has_tbm ({ \ struct cpuid_leaf res; \ emul_test_cpuid(0x80000001, 0, &res, NULL); \ --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -421,7 +421,16 @@ static const struct { [0x44] = { .simd_size = simd_packed_int }, [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 }, [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 }, + [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 }, [0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 }, + [0x68 ... 0x69] = { .simd_size = simd_packed_fp, .four_op = 1 }, + [0x6a ... 0x6b] = { .simd_size = simd_scalar_fp, .four_op = 1 }, + [0x6c ... 0x6d] = { .simd_size = simd_packed_fp, .four_op = 1 }, + [0x6e ... 0x6f] = { .simd_size = simd_scalar_fp, .four_op = 1 }, + [0x78 ... 
0x79] = { .simd_size = simd_packed_fp, .four_op = 1 }, + [0x7a ... 0x7b] = { .simd_size = simd_scalar_fp, .four_op = 1 }, + [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 }, + [0x7e ... 0x7f] = { .simd_size = simd_scalar_fp, .four_op = 1 }, [0xcc] = { .simd_size = simd_other }, [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 }, [0xf0] = {}, @@ -1612,6 +1621,7 @@ static bool vcpu_has( #define vcpu_has_lzcnt() vcpu_has(0x80000001, ECX, 5, ctxt, ops) #define vcpu_has_sse4a() vcpu_has(0x80000001, ECX, 6, ctxt, ops) #define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX, 7, ctxt, ops) +#define vcpu_has_fma4() vcpu_has(0x80000001, ECX, 16, ctxt, ops) #define vcpu_has_tbm() vcpu_has(0x80000001, ECX, 21, ctxt, ops) #define vcpu_has_bmi1() vcpu_has( 7, EBX, 3, ctxt, ops) #define vcpu_has_hle() vcpu_has( 7, EBX, 4, ctxt, ops) @@ -6155,6 +6165,7 @@ x86_emulate( simd_0f_imm8_avx: host_and_vcpu_must_have(avx); } + simd_0f_imm8_ymm: get_fpu(X86EMUL_FPU_ymm, &fic); } else if ( vex.pfx ) @@ -7710,6 +7721,49 @@ x86_emulate( generate_exception_if(vex.w, EXC_UD); goto simd_0f_int_imm8; + case X86EMUL_OPC_VEX_66(0x0f3a, 0x5c): /* vfmaddsubps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfmaddsubps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x5d): /* vfmaddsubpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfmaddsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x5e): /* vfmsubaddps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfmsubaddps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x5f): /* vfmsubaddpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfmsubaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x68): /* vfmaddps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfmaddps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x69): /* vfmaddpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfmaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ 
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x6a): /* vfmaddss xmm,xmm/m32,xmm,xmm */ + /* vfmaddss xmm/m32,xmm,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x6b): /* vfmaddsd xmm,xmm/m64,xmm,xmm */ + /* vfmaddsd xmm/m64,xmm,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x6c): /* vfmsubps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfmsubps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x6d): /* vfmsubpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfmsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x6e): /* vfmsubss xmm,xmm/m32,xmm,xmm */ + /* vfmsubss xmm/m32,xmm,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x6f): /* vfmsubsd xmm,xmm/m64,xmm,xmm */ + /* vfmsubsd xmm/m64,xmm,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x78): /* vfnmaddps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfnmaddps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x79): /* vfnmaddpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfnmaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x7a): /* vfnmaddss xmm,xmm/m32,xmm,xmm */ + /* vfnmaddss xmm/m32,xmm,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x7b): /* vfnmaddsd xmm,xmm/m64,xmm,xmm */ + /* vfnmaddsd xmm/m64,xmm,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x7c): /* vfnmsubps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfnmsubps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x7d): /* vfnmsubpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + /* vfnmsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x7e): /* vfnmsubss xmm,xmm/m32,xmm,xmm */ + /* vfnmsubss xmm/m32,xmm,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x7f): /* vfnmsubsd xmm,xmm/m64,xmm,xmm */ + /* vfnmsubsd xmm/m64,xmm,xmm,xmm */ + host_and_vcpu_must_have(fma4); + goto simd_0f_imm8_ymm; + case X86EMUL_OPC_66(0x0f3a, 0x60): /* pcmpestrm $imm8,xmm/m128,xmm */ case X86EMUL_OPC_VEX_66(0x0f3a, 0x60): /* vpcmpestrm $imm8,xmm/m128,xmm */ case 
X86EMUL_OPC_66(0x0f3a, 0x61): /* pcmpestri $imm8,xmm/m128,xmm */ --- a/xen/include/asm-x86/cpufeature.h +++ b/xen/include/asm-x86/cpufeature.h @@ -76,6 +76,7 @@ #define cpu_has_svm boot_cpu_has(X86_FEATURE_SVM) #define cpu_has_sse4a boot_cpu_has(X86_FEATURE_SSE4A) #define cpu_has_lwp boot_cpu_has(X86_FEATURE_LWP) +#define cpu_has_fma4 boot_cpu_has(X86_FEATURE_FMA4) #define cpu_has_tbm boot_cpu_has(X86_FEATURE_TBM) /* CPUID level 0x0000000D:1.eax */