x86emul: support most remaining AVX2 insns I.e. those not being equivalents of SSEn ones, but with the exception of the various gather operations. Signed-off-by: Jan Beulich --- a/tools/tests/x86_emulator/Makefile +++ b/tools/tests/x86_emulator/Makefile @@ -11,9 +11,9 @@ all: $(TARGET) run: $(TARGET) ./$(TARGET) -SIMD := sse sse2 sse4 avx +SIMD := sse sse2 sse4 avx avx2 FMA := fma4 fma -TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA) +TESTCASES := blowfish $(SIMD) $(FMA) blowfish-cflags := "" blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic=" @@ -36,13 +36,9 @@ fma4-flts := $(avx-flts) fma-vecs := $(avx-vecs) fma-ints := fma-flts := $(avx-flts) - -# When converting SSE to AVX, have the compiler avoid XMM0 to widen -# coverage of the VEX.vvvv checks in the emulator. We must not do this, -# however, for SSE4.1 and later, as there are instructions with XMM0 as -# an implicit operand. -sse2avx-sse2 := -ffixed-xmm0 -Wa,-msse2avx -sse2avx-sse4 := -Wa,-msse2avx +avx2-vecs := $(avx-vecs) +avx2-ints := 1 2 4 8 +avx2-flts := 4 8 # For AVX and later, have the compiler avoid XMM0 to widen coverage of # the VEX.vvvv checks in the emulator. @@ -58,11 +54,6 @@ $(1)-cflags := \ "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \ $(foreach flt,$($(1)-flts), \ "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)") -$(1)-avx-cflags := \ - $(foreach vec,$($(1)-vecs), \ - $(foreach int,$($(1)-ints), \ - "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \ - "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)")) endef $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor)))) @@ -81,13 +72,13 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t ) mv $@.new $@ -$(addsuffix .c,$(SIMD)) $(addsuffix -avx.c,$(filter sse%,$(SIMD))): +$(addsuffix .c,$(SIMD)): ln -sf simd.c $@ $(addsuffix .c,$(FMA)): ln -sf simd-fma.c $@ -$(addsuffix .o,$(SIMD) $(FMA)) $(addsuffix -avx.o,$(filter sse%,$(SIMD))): simd.h +$(addsuffix .o,$(SIMD) $(FMA)): simd.h $(TARGET): x86_emulate.o test_x86_emulator.o $(HOSTCC) -o $@ $^ --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -23,7 +23,9 @@ ENTRY(simd_test); # endif # endif #elif VEC_SIZE == 32 -# if defined(__AVX__) && ELEM_SIZE == 4 +# if defined(__AVX2__) +# define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vdi_t){} == 0) +# elif defined(__AVX__) && ELEM_SIZE == 4 # define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff) # elif defined(__AVX__) && ELEM_SIZE == 8 # define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf) @@ -80,10 +82,14 @@ static inline bool _to_bool(byte_vec_t b vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \ __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \ }) -# define swap2(x) ({ \ - vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \ - __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \ +# ifdef __AVX2__ +# define swap2(x) __builtin_ia32_permvarsf256(x, __builtin_ia32_cvtps2dq256(inv) - 1) +# else +# define swap2(x) ({ \ + vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \ + __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \ }) +# endif # elif VEC_SIZE == 16 # ifdef __AVX__ # define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); }) @@ -128,6 +134,9 @@ static inline bool _to_bool(byte_vec_t b vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \ 
__builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \ }) +# ifdef __AVX2__ +# define swap2(x) __builtin_ia32_permdf256(x, 0b00011011) +# endif # elif VEC_SIZE == 16 # define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y) # define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y) @@ -184,6 +193,104 @@ static inline bool _to_bool(byte_vec_t b __builtin_ia32_maskmovdqu((vqi_t)(x), m_, d_); \ __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \ }) +#elif VEC_SIZE == 32 && defined(__AVX2__) +# define swap_lanes(x, y, func, type) ({ \ + long long __attribute__((vector_size(16))) t_ = __builtin_ia32_extract128i256((vdi_t)(y), 0); \ + type t1_ = (type)__builtin_ia32_insert128i256((vdi_t)(x), t_, 1), t2_; \ + t_ = __builtin_ia32_extract128i256((vdi_t)(x), 1); \ + t2_ = (type)__builtin_ia32_insert128i256((vdi_t)(y), t_, 0); \ + func(t1_, t2_); \ +}) +# if INT_SIZE == 1 || UINT_SIZE == 1 +# define broadcast(x) ({ char s_ = (x); vec_t d_; asm ( "vpbroadcastb %1,%0" : "=x" (d_) : "m" (s_)); d_; }) +# define copysignz(x, y) ((vec_t)__builtin_ia32_psignb256((vqi_t)(x), (vqi_t)(y))) +# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \ + (vdi_t)(x), (n) * 8)) +# elif INT_SIZE == 2 || UINT_SIZE == 2 +# define broadcast(x) ({ short s_ = (x); vec_t d_; asm ( "vpbroadcastw %1,%0" : "=x" (d_) : "m" (s_)); d_; }) +# define copysignz(x, y) ((vec_t)__builtin_ia32_psignw256((vhi_t)(x), (vhi_t)(y))) +# define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddw256, vhi_t)) +# define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubw256, vhi_t)) +# define mix(x, y) ((vec_t)__builtin_ia32_pblendw256((vhi_t)(x), (vhi_t)(y), 0b10101010)) +# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \ + (vdi_t)(x), (n) * 16)) +# elif INT_SIZE == 4 || UINT_SIZE == 4 +# define broadcast(x) ({ int s_ = (x); vec_t d_; asm ( "vpbroadcastd %1,%0" : "=x" (d_) : "m" (s_)); d_; }) +# define copysignz(x, y) ((vec_t)__builtin_ia32_psignd256((vsi_t)(x), (vsi_t)(y))) +# define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddd256, vsi_t)) +# define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubd256, vsi_t)) +# define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b10101010)) +# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \ + (vdi_t)(x), (n) * 32)) +# define select(d, x, y, m) ({ \ + vsi_t m_ = (vsi_t)(m); \ + *(d) = (vec_t)__builtin_ia32_maskloadd256((vsi_t *)&(x), m_); \ + __builtin_ia32_maskstored256((vsi_t *)(d), ~m_, (vsi_t)(y)); \ +}) +# define swap(x) ((vec_t)__builtin_ia32_permvarsi256((vsi_t)(x), (vsi_t)inv - 1)) +# elif INT_SIZE == 8 || UINT_SIZE == 8 +# define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b11001100)) +# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \ + (vdi_t)(x), (n) * 64)) +# define select(d, x, y, m) ({ \ + vdi_t m_ = (vdi_t)(m); \ + *(d) = (vec_t)__builtin_ia32_maskloadq256((vdi_t *)&(x), m_); \ + __builtin_ia32_maskstoreq256((vdi_t *)(d), ~m_, (vdi_t)(y)); \ +}) +# define swap(x) ((vec_t)__builtin_ia32_permdi256((vdi_t)(x), 0b00011011)) +# define swap2(x) ({ \ + vdi_t t_ = __builtin_ia32_permdi256((vdi_t)(x), 0b10110001); \ + (vec_t)__builtin_ia32_permti256(t_, t_, 0b00000001); \ +}) +# endif +# if INT_SIZE == 1 +# define abs(x) ((vec_t)__builtin_ia32_pabsb256((vqi_t)(x))) 
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxsb256((vqi_t)(x), (vqi_t)(y))) +# define min(x, y) ((vec_t)__builtin_ia32_pminsb256((vqi_t)(x), (vqi_t)(y))) +# define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw256((vqi_t)(x))) +# define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd256((vqi_t)(x))) +# define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq256((vqi_t)(x))) +# elif UINT_SIZE == 1 +# define max(x, y) ((vec_t)__builtin_ia32_pmaxub256((vqi_t)(x), (vqi_t)(y))) +# define min(x, y) ((vec_t)__builtin_ia32_pminub256((vqi_t)(x), (vqi_t)(y))) +# define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw256((vqi_t)(x))) +# define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd256((vqi_t)(x))) +# define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq256((vqi_t)(x))) +# elif INT_SIZE == 2 +# define abs(x) __builtin_ia32_pabsw256(x) +# define max(x, y) __builtin_ia32_pmaxsw256(x, y) +# define min(x, y) __builtin_ia32_pminsw256(x, y) +# define mul_hi(x, y) __builtin_ia32_pmulhw256(x, y) +# define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd256(x)) +# define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq256(x)) +# elif UINT_SIZE == 2 +# define max(x, y) ((vec_t)__builtin_ia32_pmaxuw256((vhi_t)(x), (vhi_t)(y))) +# define min(x, y) ((vec_t)__builtin_ia32_pminuw256((vhi_t)(x), (vhi_t)(y))) +# define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw256((vhi_t)(x), (vhi_t)(y))) +# define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd256((vhi_t)(x))) +# define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq256((vhi_t)(x))) +# elif INT_SIZE == 4 +# define abs(x) __builtin_ia32_pabsd256(x) +# define max(x, y) __builtin_ia32_pmaxsd256(x, y) +# define min(x, y) __builtin_ia32_pminsd256(x, y) +# define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq256(x)) +# elif UINT_SIZE == 4 +# define max(x, y) ((vec_t)__builtin_ia32_pmaxud256((vsi_t)(x), (vsi_t)(y))) +# define min(x, y) ((vec_t)__builtin_ia32_pminud256((vsi_t)(x), (vsi_t)(y))) +# define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq256((vsi_t)(x), (vsi_t)(y))) +# define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq256((vsi_t)(x))) +# elif INT_SIZE == 8 +# define broadcast(x) ({ \ + long long s_ = (x); \ + long long __attribute__((vector_size(16))) t_; \ + vec_t d_; \ + asm ( "vpbroadcastq %1,%0" : "=x" (t_) : "m" (s_)); \ + asm ( "vbroadcasti128 %1,%0" : "=x" (d_) : "m" (t_)); \ + d_; \ +}) +# elif UINT_SIZE == 8 +# define broadcast(x) ({ long long s_ = (x); vec_t d_; asm ( "vpbroadcastq %1,%0" : "=x" (d_) : "m" (s_)); d_; }) +# endif #endif #if VEC_SIZE == 16 && defined(__SSE3__) # if FLOAT_SIZE == 4 @@ -207,25 +314,37 @@ static inline bool _to_bool(byte_vec_t b # define addsub(x, y) __builtin_ia32_addsubps256(x, y) # define dup_hi(x) __builtin_ia32_movshdup256(x) # define dup_lo(x) __builtin_ia32_movsldup256(x) -# define hadd(x, y) ({ \ +# ifdef __AVX2__ +# define hadd(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_haddps256(x, y), \ + (vsi_t){0, 1, 4, 5, 2, 3, 6, 7}) +# define hsub(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_hsubps256(x, y), \ + (vsi_t){0, 1, 4, 5, 2, 3, 6, 7}) +# else +# define hadd(x, y) ({ \ vec_t t_ = __builtin_ia32_haddps256(x, y); \ (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \ }) -# define hsub(x, y) ({ \ +# define hsub(x, y) ({ \ vec_t t_ = __builtin_ia32_hsubps256(x, y); \ (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \ }) +# endif # elif FLOAT_SIZE == 8 # define addsub(x, y) __builtin_ia32_addsubpd256(x, y) # define dup_lo(x) __builtin_ia32_movddup256(x) -# define hadd(x, y) ({ \ +# ifdef __AVX2__ +# define hadd(x, y) 
__builtin_ia32_permdf256(__builtin_ia32_haddpd256(x, y), 0b11011000) +# define hsub(x, y) __builtin_ia32_permdf256(__builtin_ia32_hsubpd256(x, y), 0b11011000) +# else +# define hadd(x, y) ({ \ vec_t t_ = __builtin_ia32_haddpd256(x, y); \ (vec_t){t_[0], t_[2], t_[1], t_[3]}; \ }) -# define hsub(x, y) ({ \ +# define hsub(x, y) ({ \ vec_t t_ = __builtin_ia32_hsubpd256(x, y); \ (vec_t){t_[0], t_[2], t_[1], t_[3]}; \ }) +# endif # endif #endif #if VEC_SIZE == 16 && defined(__SSSE3__) @@ -546,7 +665,7 @@ int simd_test(void) z *= alt; # endif /* - * Zap elements for which the shift count is negative (and the hence the + * Zap elements for which the shift count is zero (and the hence the * decrement below would yield a negative count. */ z &= (sh > 0); @@ -556,9 +675,14 @@ int simd_test(void) --sh; touch(sh); y = z << sh; - touch(sh); if ( !to_bool(x == y + y) ) return __LINE__; +# if defined(__AVX2__) && ELEM_SIZE >= 4 + touch(sh); + x = y >> sh; + if ( !to_bool(x == z) ) return __LINE__; +# endif + # endif #endif --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -8,11 +8,10 @@ #include "sse.h" #include "sse2.h" #include "sse4.h" -#include "sse2-avx.h" -#include "sse4-avx.h" #include "avx.h" #include "fma4.h" #include "fma.h" +#include "avx2.h" #define verbose false /* Switch to true for far more logging. */ @@ -46,8 +45,6 @@ static bool simd_check_avx(void) { return cpu_has_avx; } -#define simd_check_sse2_avx simd_check_avx -#define simd_check_sse4_avx simd_check_avx static bool simd_check_fma4(void) { @@ -59,6 +56,11 @@ static bool simd_check_fma(void) return cpu_has_fma; } +static bool simd_check_avx2(void) +{ + return cpu_has_avx2; +} + static void simd_set_regs(struct cpu_user_regs *regs) { if ( cpu_has_mmx ) @@ -133,22 +135,6 @@ static const struct { SIMD(SSE4 packed u32, sse4, 16u4), SIMD(SSE4 packed s64, sse4, 16i8), SIMD(SSE4 packed u64, sse4, 16u8), - SIMD(SSE2/AVX packed s8, sse2_avx, 16i1), - SIMD(SSE2/AVX packed u8, sse2_avx, 16u1), - SIMD(SSE2/AVX packed s16, sse2_avx, 16i2), - SIMD(SSE2/AVX packed u16, sse2_avx, 16u2), - SIMD(SSE2/AVX packed s32, sse2_avx, 16i4), - SIMD(SSE2/AVX packed u32, sse2_avx, 16u4), - SIMD(SSE2/AVX packed s64, sse2_avx, 16i8), - SIMD(SSE2/AVX packed u64, sse2_avx, 16u8), - SIMD(SSE4/AVX packed s8, sse4_avx, 16i1), - SIMD(SSE4/AVX packed u8, sse4_avx, 16u1), - SIMD(SSE4/AVX packed s16, sse4_avx, 16i2), - SIMD(SSE4/AVX packed u16, sse4_avx, 16u2), - SIMD(SSE4/AVX packed s32, sse4_avx, 16i4), - SIMD(SSE4/AVX packed u32, sse4_avx, 16u4), - SIMD(SSE4/AVX packed s64, sse4_avx, 16i8), - SIMD(SSE4/AVX packed u64, sse4_avx, 16u8), SIMD(AVX scalar single, avx, f4), SIMD(AVX 128bit single, avx, 16f4), SIMD(AVX 256bit single, avx, 32f4), @@ -167,6 +153,26 @@ static const struct { SIMD(FMA scalar double, fma, f8), SIMD(FMA 128bit double, fma, 16f8), SIMD(FMA 256bit double, fma, 32f8), + SIMD(AVX2 128bit single, avx2, 16f4), + SIMD(AVX2 256bit single, avx2, 32f4), + SIMD(AVX2 128bit double, avx2, 16f8), + SIMD(AVX2 256bit double, avx2, 32f8), + SIMD(AVX2 s8x16, avx2, 16i1), + SIMD(AVX2 u8x16, avx2, 16u1), + SIMD(AVX2 s16x8, avx2, 16i2), + SIMD(AVX2 u16x8, avx2, 16u2), + SIMD(AVX2 s32x4, avx2, 16i4), + SIMD(AVX2 u32x4, avx2, 16u4), + SIMD(AVX2 s64x2, avx2, 16i8), + SIMD(AVX2 u64x2, avx2, 16u8), + SIMD(AVX2 s8x32, avx2, 32i1), + SIMD(AVX2 u8x32, avx2, 32u1), + SIMD(AVX2 s16x16, avx2, 32i2), + SIMD(AVX2 u16x16, avx2, 32u2), + SIMD(AVX2 s32x8, avx2, 32i4), + SIMD(AVX2 u32x8, avx2, 32u4), + SIMD(AVX2 s64x4, avx2, 32i8), + 
SIMD(AVX2 u64x4, avx2, 32u8), #undef SIMD_ #undef SIMD }; @@ -2925,6 +2931,91 @@ int main(int argc, char **argv) res[0] || res[1] || memcmp(res + 2, res + 4, 8) ) goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + + printf("%-40s", "Testing vpmaskmovd %xmm1,%xmm2,(%edx)..."); + if ( stack_exec && cpu_has_avx2 ) + { + decl_insn(vpmaskmovd); + + asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t" + "vpinsrd $0b00, %1, %%xmm1, %%xmm2\n\t" +#if 0 /* Don't use AVX2 instructions for now */ + put_insn(vpmaskmovd, "vpmaskmovd %%xmm1, %%xmm2, (%0)") +#else + put_insn(vpmaskmovd, + ".byte 0xc4, 0xe2, 0x69, 0x8e, 0x0a") +#endif + :: "d" (NULL), "r" (~0) ); + + memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32); + set_insn(vpmaskmovd); + regs.edx = (unsigned long)res + MMAP_SZ - 4; + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovd) || + res[MMAP_SZ / sizeof(*res) - 1] || + memcmp(res + MMAP_SZ / sizeof(*res) - 8, + res + MMAP_SZ / sizeof(*res) - 4, 12) ) + goto fail; + + asm volatile ( "vpinsrd $0b11, %0, %%xmm1, %%xmm2" :: "r" (~0) ); + memset(res, 0xdb, 32); + set_insn(vpmaskmovd); + regs.edx = (unsigned long)(res - 3); + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovd) || + res[0] || memcmp(res + 1, res + 4, 12) ) + goto fail; + + printf("okay\n"); + } + else + printf("skipped\n"); + + printf("%-40s", "Testing vpmaskmovq %xmm1,%xmm2,(%edx)..."); + if ( stack_exec && cpu_has_avx2 ) + { + decl_insn(vpmaskmovq); + + asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t" + "vpcmpeqd %%xmm0, %%xmm0, %%xmm0\n\t" +#if 0 /* Don't use AVX2 instructions for now */ + "vpblendd $0b0011, %%xmm0, %%xmm1, %%xmm2\n\t" + put_insn(vpmaskmovq, "vpmaskmovq %%xmm1, %%xmm2, (%0)") +#else + ".byte 0xc4, 0xe3, 0x71, 0x02, 0xd0, 0b0011\n\t" + put_insn(vpmaskmovq, + ".byte 0xc4, 0xe2, 0xe9, 0x8e, 0x0a") +#endif + :: "d" (NULL) ); + + memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32); + set_insn(vpmaskmovq); + regs.edx = (unsigned long)res + MMAP_SZ - 8; + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovq) || + res[MMAP_SZ / sizeof(*res) - 1] || + res[MMAP_SZ / sizeof(*res) - 2] || + memcmp(res + MMAP_SZ / sizeof(*res) - 8, + res + MMAP_SZ / sizeof(*res) - 4, 8) ) + goto fail; + +#if 0 /* Don't use AVX2 instructions for now */ + asm volatile ( "vpermq $0b00000001, %ymm2, %ymm2" ); +#else + asm volatile ( ".byte 0xc4, 0xe3, 0xfd, 0x00, 0xd2, 0b00000001" ); +#endif + memset(res, 0xdb, 32); + set_insn(vpmaskmovq); + regs.edx = (unsigned long)(res - 2); + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovq) || + res[0] || res[1] || memcmp(res + 2, res + 4, 8) ) + goto fail; + printf("okay\n"); } else --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -370,7 +370,7 @@ static const struct { [0x0c ... 0x0f] = { .simd_size = simd_packed_fp }, [0x10] = { .simd_size = simd_packed_int }, [0x13] = { .simd_size = simd_other, .two_op = 1 }, - [0x14 ... 0x15] = { .simd_size = simd_packed_fp }, + [0x14 ... 0x16] = { .simd_size = simd_packed_fp }, [0x17] = { .simd_size = simd_packed_int, .two_op = 1 }, [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 }, [0x1a] = { .simd_size = simd_128, .two_op = 1 }, @@ -382,9 +382,15 @@ static const struct { [0x2c ... 0x2d] = { .simd_size = simd_other }, [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 }, [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 }, - [0x37 ... 
0x3f] = { .simd_size = simd_packed_int }, + [0x36 ... 0x3f] = { .simd_size = simd_packed_int }, [0x40] = { .simd_size = simd_packed_int }, [0x41] = { .simd_size = simd_packed_int, .two_op = 1 }, + [0x45 ... 0x47] = { .simd_size = simd_packed_int }, + [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 }, + [0x5a] = { .simd_size = simd_128, .two_op = 1 }, + [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 }, + [0x8c] = { .simd_size = simd_other }, + [0x8e] = { .simd_size = simd_other, .to_mem = 1 }, [0x96 ... 0x9f] = { .simd_size = simd_packed_fp }, [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp }, [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp }, @@ -406,6 +412,9 @@ static const struct { uint8_t two_op:1; uint8_t four_op:1; } ext0f3a_table[256] = { + [0x00] = { .simd_size = simd_packed_int, .two_op = 1 }, + [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 }, + [0x02] = { .simd_size = simd_packed_int }, [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 }, [0x06] = { .simd_size = simd_packed_fp }, [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 }, @@ -419,9 +428,12 @@ static const struct { [0x20] = { .simd_size = simd_none }, [0x21] = { .simd_size = simd_other }, [0x22] = { .simd_size = simd_none }, + [0x38] = { .simd_size = simd_128 }, + [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 }, [0x40 ... 0x41] = { .simd_size = simd_packed_fp }, [0x42] = { .simd_size = simd_packed_int }, [0x44] = { .simd_size = simd_packed_int }, + [0x46] = { .simd_size = simd_packed_int }, [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 }, [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 }, [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 }, @@ -2964,7 +2976,7 @@ x86_decode( } break; - case simd_scalar_fp: + case simd_scalar_fp: /* case simd_scalar_dq: */ op_bytes = 4 << (ctxt->opcode & 1); break; @@ -6057,6 +6069,10 @@ x86_emulate( case X86EMUL_OPC_VEX_66(0x0f38, 0x40): /* vpmulld {x,y}mm/mem,{x,y}mm,{x,y}mm */ if ( !vex.l ) goto simd_0f_avx; + /* fall through */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x45): /* vpsrlv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x47): /* vpsllv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + simd_0f_avx2: host_and_vcpu_must_have(avx2); goto simd_0f_ymm; } @@ -6156,7 +6172,10 @@ x86_emulate( case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f3a, 0x42): /* vmpsadbw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */ if ( vex.l ) + { + simd_0f_imm8_avx2: host_and_vcpu_must_have(avx2); + } else { case X86EMUL_OPC_VEX_66(0x0f3a, 0x08): /* vroundps $imm8,{x,y}mm/mem,{x,y}mm */ @@ -7240,6 +7259,11 @@ x86_emulate( op_bytes = 8 << vex.l; goto simd_0f_ymm; + case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */ + generate_exception_if(!vex.l || vex.w, EXC_UD); + goto simd_0f_avx2; + case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */ @@ -7356,6 +7380,80 @@ x86_emulate( generate_exception_if(vex.l, EXC_UD); goto simd_0f_avx; + case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x79): /* vpbroadcastw 
xmm/m16,{x,y}mm */ + op_bytes = 1 << ((!(b & 0x20) * 2) + (b & 1)); + /* fall through */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x46): /* vpsravd {x,y}mm/mem,{x,y}mm,{x,y}mm */ + generate_exception_if(vex.w, EXC_UD); + goto simd_0f_avx2; + + case X86EMUL_OPC_VEX_66(0x0f38, 0x5a): /* vbroadcasti128 m128,ymm */ + generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD); + goto simd_0f_avx2; + + case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */ + { + typeof(vex) *pvex; + unsigned int mask = vex.w ? 0x80808080U : 0x88888888U; + + generate_exception_if(ea.type != OP_MEM, EXC_UD); + host_and_vcpu_must_have(avx2); + get_fpu(X86EMUL_FPU_ymm, &fic); + + /* + * While we can't reasonably provide fully correct behavior here + * (in particular, for writes, avoiding the memory read in anticipation + * of all elements in the range eventually being written), we can (and + * should) still limit the memory access to the smallest possible range + * (suppressing it altogether if all mask bits are clear), to provide + * correct faulting behavior. Read the mask bits via vmovmskp{s,d} + * for that purpose. + */ + opc = init_prefixes(stub); + pvex = copy_VEX(opc, vex); + pvex->opcx = vex_0f; + opc[0] = 0xd7; /* vpmovmskb */ + /* Use %rax as GPR destination and VEX.vvvv as source. */ + pvex->r = 1; + pvex->b = !mode_64bit() || (vex.reg >> 3); + opc[1] = 0xc0 | (~vex.reg & 7); + pvex->reg = 0xf; + opc[2] = 0xc3; + + invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0)); + put_stub(stub); + + /* Convert byte granular result to dword/qword granularity. */ + ea.val &= mask; + if ( !ea.val ) + goto complete_insn; + + first_byte = __builtin_ctz(ea.val) & ~((4 << vex.w) - 1); + ea.val >>= first_byte; + op_bytes = 32 - __builtin_clz(ea.val); + + /* + * Even for the memory write variant a memory read is needed, unless + * all set mask bits are contiguous. + */ + if ( ea.val & (ea.val + ~mask + 1) ) + d = (d & ~SrcMask) | SrcMem; + + opc = init_prefixes(stub); + opc[0] = b; + /* Convert memory operand to (%rAX). */ + rex_prefix &= ~REX_B; + vex.b = 1; + opc[1] = modrm & 0x38; + fic.insn_bytes = PFX_BYTES + 2; + + break; + } + case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ @@ -7564,6 +7662,20 @@ x86_emulate( : "0" ((uint32_t)src.val), "rm" (_regs.edx) ); break; + case X86EMUL_OPC_VEX_66(0x0f3a, 0x00): /* vpermq $imm8,ymm/m256,ymm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x01): /* vpermpd $imm8,ymm/m256,ymm */ + generate_exception_if(!vex.l || !vex.w, EXC_UD); + goto simd_0f_imm8_avx2; + + case X86EMUL_OPC_VEX_66(0x0f3a, 0x38): /* vinserti128 $imm8,xmm/m128,ymm,ymm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x39): /* vextracti128 $imm8,ymm,xmm/m128 */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x46): /* vperm2i128 $imm8,ymm/m256,ymm,ymm */ + generate_exception_if(!vex.l, EXC_UD); + /* fall through */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x02): /* vpblendd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + generate_exception_if(vex.w, EXC_UD); + goto simd_0f_imm8_avx2; + case X86EMUL_OPC_VEX_66(0x0f3a, 0x06): /* vperm2f128 $imm8,ymm/m256,ymm,ymm */ case X86EMUL_OPC_VEX_66(0x0f3a, 0x18): /* vinsertf128 $imm8,xmm/m128,ymm,ymm */ case X86EMUL_OPC_VEX_66(0x0f3a, 0x19): /* vextractf128 $imm8,ymm,xmm/m128 */
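For illustration only, not part of the patch: the vpmaskmov{d,q} handling above narrows the emulated access to the smallest byte range covered by set mask bits, reading the mask through a stubbed VPMOVMSKB. A rough standalone sketch of that first_byte/op_bytes computation, using a hypothetical mask_to_range() helper fed with a VPMOVMSKB-style bitmap (one bit per byte of the YMM mask register):

#include <stdint.h>
#include <stdio.h>

static void mask_to_range(uint32_t msk, int wide /* non-zero: qword elements */)
{
    /* Keep only each element's sign bit, as the emulation does via
     * 0x88888888 (dword) resp. 0x80808080 (qword). */
    msk &= wide ? 0x80808080u : 0x88888888u;
    if ( !msk )
    {
        printf("no memory access\n");   /* all mask bits clear */
        return;
    }

    /* Round the lowest set bit down to an element boundary ... */
    unsigned int first = __builtin_ctz(msk) & ~((4u << !!wide) - 1);

    /* ... and measure up to the end of the highest active element. */
    unsigned int len = 32 - __builtin_clz(msk >> first);

    printf("access bytes [%u, %u)\n", first, first + len);
}

int main(void)
{
    /* Dword elements 1 and 3 active: the resulting range [4, 16) also
     * spans the inactive element 2, which is why the emulation may
     * still need a memory read even for the store form. */
    mask_to_range(0x0000f0f0u, 0);
    return 0;
}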